# Exploratory Data Analysis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection
import sklearn.ensemble
import sklearn.preprocessing
import sklearn.metrics
import numpy as np
import datetime

In [2]:
# Load the dataset from local storage
# Replace 'path_to_file' with the actual path to your local CSV file
df = pd.read_csv("machine_downtime.csv")


In [3]:
df['Date'] = pd.to_datetime(df['Date'])
print(df.head())

        Date            Machine_ID Assembly_Line_No  Hydraulic_Pressure(bar)  \
0 2021-12-31  Makino-L1-Unit1-2013     Shopfloor-L1                    71.04   
1 2021-12-31  Makino-L1-Unit1-2013     Shopfloor-L1                   125.33   
2 2021-12-31  Makino-L3-Unit1-2015     Shopfloor-L3                    71.12   
3 2022-05-31  Makino-L2-Unit1-2015     Shopfloor-L2                   139.34   
4 2022-03-31  Makino-L1-Unit1-2013     Shopfloor-L1                    60.51   

   Coolant_Pressure(bar)  Air_System_Pressure(bar)  Coolant_Temperature  \
0               6.933725                  6.284965                 25.6   
1               4.936892                  6.196733                 35.3   
2               6.839413                  6.655448                 13.1   
3               4.574382                  6.560394                 24.4   
4               6.893182                  6.141238                  4.1   

   Hydraulic_Oil_Temperature(?C)  Spindle_Bearing_Temperature(?C)  \

  df['Date'] = pd.to_datetime(df['Date'])


In [4]:
# Analyze value counts
df['Machine_ID'].value_counts()
df['Assembly_Line_No'].value_counts()

Assembly_Line_No
Shopfloor-L1    874
Shopfloor-L3    818
Shopfloor-L2    808
Name: count, dtype: int64

In [5]:
# Cross-tabulation
print(pd.crosstab(df['Machine_ID'], df['Assembly_Line_No']))

Assembly_Line_No      Shopfloor-L1  Shopfloor-L2  Shopfloor-L3
Machine_ID                                                    
Makino-L1-Unit1-2013           874             0             0
Makino-L2-Unit1-2015             0           808             0
Makino-L3-Unit1-2015             0             0           818


# Model Training and Data Preprocessing

In [6]:
# Discard the 'Assembly_Line_No' column
df = df.drop(['Assembly_Line_No'], axis=1)

In [7]:
# One-hot encode the 'Machine_ID' column
one_hot_encoder = sklearn.preprocessing.OneHotEncoder(sparse_output=False)
one_hot_encoder.set_output(transform='pandas')

In [8]:
df = pd.concat(
  [
    df.drop(['Machine_ID'], axis=1),
    one_hot_encoder.fit_transform(df[['Machine_ID']])
  ],
  axis=1
)
print(df.tail())

           Date  Hydraulic_Pressure(bar)  Coolant_Pressure(bar)  \
2495 2022-02-01               112.715506               5.220885   
2496 2022-02-01               103.086653               5.211886   
2497 2022-02-01               118.643165               5.212991   
2498 2022-02-01               145.855859               5.207777   
2499 2022-02-01                96.690000               5.936610   

      Air_System_Pressure(bar)  Coolant_Temperature  \
2495                  6.196610                 22.3   
2496                  7.074653                 11.9   
2497                  6.530049                  4.5   
2498                  6.402655                 12.2   
2499                  7.109355                 29.8   

      Hydraulic_Oil_Temperature(?C)  Spindle_Bearing_Temperature(?C)  \
2495                           48.8                             37.2   
2496                           48.3                             31.5   
2497                           49.9               

In [9]:
# Encode the date as days since manufacture date
df['Manufacture_Date'] = np.select([
    df['Machine_ID_Makino-L1-Unit1-2013'] == 1
], [
    pd.to_datetime("1/1/2013")
], default=pd.to_datetime("1/1/2015"))

df['Date'] = (df['Date'] - df['Manufacture_Date']).dt.days

In [10]:
# Drop the 'Manufacture_Date' column
df = df.drop('Manufacture_Date', axis=1)

In [11]:
# Prepare for training
X, y = df.drop(['Downtime'], axis=1), df['Downtime']
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=42, stratify=y)

In [12]:
# Define and train the classifier
clf = sklearn.pipeline.make_pipeline(
    sklearn.preprocessing.StandardScaler(),
    sklearn.ensemble.RandomForestClassifier(max_depth=6, random_state=0)
)

clf.fit(X_train, y_train)

In [13]:
# Evaluate the model
print("Test Score:", clf.score(X_test, y_test))
print("Confusion Matrix:")
print(sklearn.metrics.confusion_matrix(y_test, clf.predict(X_test)))

Test Score: 0.9792
Confusion Matrix:
[[309   7]
 [  6 303]]
