# Examine Machine Status Annotations

In [1]:
# Import packages

# standard library
import os

# data manipulation
import numpy as np
import pandas as pd

# database connection
import psycopg2

# Steps
1. Fetch the data from an AWS PostgreSQL database
2. Convert the `machine_status` values from categorical to numeric

## 1. Fetch pump sensor data from an AWS PostgreSQL database

The data has been downloaded from Kaggle and stored in an AWS PostgreSQL database. Kaggle data source: https://www.kaggle.com/datasets/nphantawee/pump-sensor-data

We need to connect to the database and read the pump sensor data into a dataframe.

In [2]:
def get_all_as_df():
    """Read every row of the ``newkaggle`` table into a DataFrame.

    Connection settings come from the standard libpq environment variables so
    that no secret is stored in the notebook:

    * ``PGPASSWORD`` -- required
    * ``PGDATABASE`` -- optional, defaults to ``postgres``
    * ``PGUSER``     -- optional, defaults to ``ad_postgres``
    * ``PGHOST``     -- optional, defaults to the workshop RDS endpoint

    Errors raised while connecting or querying are not swallowed; they
    propagate to the caller so the failing cell shows the real cause.

    :return: all rows ordered by ``timestamp``, with ``timestamp`` as the index
    :rtype: pandas.DataFrame
    :raises RuntimeError: if ``PGPASSWORD`` is not set
    """
    # NOTE(review): a password used to be hardcoded here; it should be rotated
    # because it remains visible in the notebook's history.
    password = os.environ.get('PGPASSWORD')
    if not password:
        raise RuntimeError(
            'Database password not found: set the PGPASSWORD environment '
            'variable before running this notebook.'
        )

    conn = psycopg2.connect(
        database=os.environ.get('PGDATABASE', 'postgres'),
        user=os.environ.get('PGUSER', 'ad_postgres'),
        password=password,
        host=os.environ.get(
            'PGHOST',
            'db-anomalydetect-postgres.chanowujpkf4.us-east-1.rds.amazonaws.com',
        ),
    )
    GET_ALL_ROWS = 'Select * from newkaggle order by timestamp'
    try:
        # `with conn` only scopes the transaction (commit / rollback); it does
        # not close the connection, hence the explicit close() in `finally`.
        with conn:
            df = pd.read_sql_query(GET_ALL_ROWS, conn)
        return df.set_index('timestamp')
    finally:
        conn.close()

In [3]:
# Load the full pump sensor table, indexed by timestamp
df = get_all_as_df()

# Preview the first five rows
df.head(5)



Unnamed: 0_level_0,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,...,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_50,sensor_51,machine_status
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-01 00:00:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353,...,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,243.0556,201.3889,NORMAL
2018-04-01 00:01:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353,...,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,243.0556,201.3889,NORMAL
2018-04-01 00:02:00,2.444734,47.35243,53.2118,46.39757,638.8889,73.54598,13.32465,16.03733,15.61777,15.01013,...,41.66666,39.351852,65.39352,51.21528,38.194443,155.9606,67.12963,241.3194,203.7037,NORMAL
2018-04-01 00:03:00,2.460474,47.09201,53.1684,46.397568,628.125,76.98898,13.31742,16.24711,15.69734,15.08247,...,40.88541,39.0625,64.81481,51.21528,38.19444,155.9606,66.84028,240.4514,203.125,NORMAL
2018-04-01 00:04:00,2.445718,47.13541,53.2118,46.397568,636.4583,76.58897,13.35359,16.21094,15.69734,15.08247,...,41.40625,38.77315,65.10416,51.79398,38.77315,158.2755,66.55093,242.1875,201.3889,NORMAL


## 2. Convert the `machine_status` values from categorical to numeric

As you can see, the values in the `machine_status` column are categorical: 'BROKEN', 'NORMAL', 'RECOVERING'. We need to convert them to numeric values so that the model can interpret them. 

In [4]:
def machine_status_to_numeric(df):
    """Encode the 'machine_status' column as numbers, in place.

    Mapping: 'NORMAL' -> 0, 'RECOVERING' -> 0.5, 'BROKEN' -> 1.
    Any other value (including a missing value) is encoded as 0.

    The call is idempotent: a column that is already numeric is left
    untouched. Without that guard, re-running the notebook cell would compare
    numbers against the status strings, match nothing, and reset every row
    to 0, silently erasing the BROKEN and RECOVERING labels.

    :param df: frame containing a 'machine_status' column; modified in place
    :type df: pandas.DataFrame
    :return: None
    """
    status = df['machine_status']
    if pd.api.types.is_numeric_dtype(status):
        # Already converted by an earlier call; nothing to do.
        return

    conditions = [status == 'NORMAL', status == 'BROKEN', status == 'RECOVERING']
    codes = [0, 1, 0.5]  # same order as `conditions`

    df['machine_status'] = np.select(conditions, codes, default=0)

In [5]:
# Encode machine_status in place (NORMAL -> 0, RECOVERING -> 0.5, BROKEN -> 1)
machine_status_to_numeric(df)

# Preview the first five rows after the conversion
df.head(5)

Unnamed: 0_level_0,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,...,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_50,sensor_51,machine_status
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-01 00:00:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353,...,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,243.0556,201.3889,0.0
2018-04-01 00:01:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353,...,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,243.0556,201.3889,0.0
2018-04-01 00:02:00,2.444734,47.35243,53.2118,46.39757,638.8889,73.54598,13.32465,16.03733,15.61777,15.01013,...,41.66666,39.351852,65.39352,51.21528,38.194443,155.9606,67.12963,241.3194,203.7037,0.0
2018-04-01 00:03:00,2.460474,47.09201,53.1684,46.397568,628.125,76.98898,13.31742,16.24711,15.69734,15.08247,...,40.88541,39.0625,64.81481,51.21528,38.19444,155.9606,66.84028,240.4514,203.125,0.0
2018-04-01 00:04:00,2.445718,47.13541,53.2118,46.397568,636.4583,76.58897,13.35359,16.21094,15.69734,15.08247,...,41.40625,38.77315,65.10416,51.79398,38.77315,158.2755,66.55093,242.1875,201.3889,0.0


Now that we have converted the categorical values in the `machine_status` column to numeric values, let's move on to our next notebook, `02-graph_annotations.ipynb`, where we will visualize some of the data.