# Examine Machine Status Annotations

In [None]:
# Import packages

# data manipulation
import pandas as pd
import numpy as np

# create and remove directories
import os

# database connection
import psycopg2

# Steps
1. Fetch the data from an AWS PostgreSQL database
2. Convert the `machine_status` values from categorical to numeric

## 1. Fetch pump sensor data from a PostgreSQL database

The data has been downloaded from Kaggle and stored in an AWS PostgreSQL database. Kaggle data source: https://www.kaggle.com/datasets/nphantawee/pump-sensor-data

We need to connect to the database and read the pump sensor data into a dataframe.

In [None]:
def get_all_as_df():
    """Fetch every row of the ``waterpump`` table as a DataFrame indexed by timestamp.

    Connection settings are read from the standard libpq environment
    variables ``PGDATABASE``, ``PGUSER``, ``PGPASSWORD`` and ``PGHOST``.
    When a variable is unset, the value that used to be hard-coded here is
    used as a fallback, so the notebook keeps working without extra setup.

    Connection and query errors are propagated to the caller instead of
    being printed and swallowed; previously a failure made this function
    return ``None`` and the error only surfaced later as an unrelated
    ``AttributeError``.

    :return: all rows ordered by ``timestamp``, with ``timestamp`` as the index
    :rtype: pandas.DataFrame
    """
    conn = psycopg2.connect(
        database=os.environ.get("PGDATABASE", "predict-db"),
        user=os.environ.get("PGUSER", "predict-db"),
        # NOTE(review): demo-only fallback; export PGPASSWORD for any real database
        password=os.environ.get("PGPASSWORD", "failureislame"),
        host=os.environ.get("PGHOST", "localhost"),
    )

    GET_ALL_ROWS = "Select * from waterpump order by timestamp"

    try:
        # `with conn` scopes the query to a transaction; it does not close
        # the connection, which is why the `finally` below is still needed.
        with conn:
            # Pull our dataset into a pandas dataframe
            df = pd.read_sql_query(GET_ALL_ROWS, conn)
        return df.set_index("timestamp")
    finally:
        conn.close()

In [None]:
# query the database and keep the result for the rest of the notebook
df = get_all_as_df()

# first 5 rows
df.head()

## 2. Convert the `machine_status` values from categorical to numeric  

As you can see, the values in the `machine_status` column are categorical: 'BROKEN', 'NORMAL', 'RECOVERING'. We need to convert them to numeric values so that the model can interpret them. 

In [None]:
def machine_status_to_numeric(df):
    """Replace the labels in the 'machine_status' column with numeric codes, in place.

    Numeric values are 0: 'NORMAL', 1: 'BROKEN', 0.5: 'RECOVERING'.
    Any other value is mapped to 0.

    The function is safe to call more than once: if the column is already
    numeric it is left untouched. (Without this guard, a second call finds
    no matching labels and silently resets every row to 0.)

    :param df: DataFrame with a 'machine_status' column; modified in place
    :type df: pandas.DataFrame
    :return: None
    """
    status = df["machine_status"]

    # Already converted (e.g. the notebook cell was re-run): nothing to do.
    if pd.api.types.is_numeric_dtype(status):
        return

    status_codes = {"NORMAL": 0, "BROKEN": 1, "RECOVERING": 0.5}
    conditions = [status == label for label in status_codes]
    codes = list(status_codes.values())

    # default=0 keeps the existing behaviour of treating unknown labels as NORMAL
    df["machine_status"] = np.select(conditions, codes, default=0)

In [None]:
# convert the machine_status column of df in place (the function returns nothing)
machine_status_to_numeric(df)

# preview the first rows after the conversion
df.head()

Let's save the dataframe to a csv file so that we won't have to fetch it from the database or convert the `machine_status` column values from categorical to numeric again in the following notebooks.

In [None]:
# directory that receives the csv file
outdir = "./scratch"

# name of the csv file
outname = "newkaggle.csv"

# create the output directory if it does not exist yet; exist_ok makes
# re-running this cell safe and avoids a separate check-then-create step
os.makedirs(outdir, exist_ok=True)

fullname = os.path.join(outdir, outname)

# save the dataframe (including its index) to the csv file
df.to_csv(fullname)

Now that we have converted the categorical values in the `machine_status` column to numeric values, let's move on to our next notebook, `04-graph_annotations.ipynb`, where we will visualize some of the data.