In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
# also import for Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold

In [64]:
df = pd.read_csv('results_new2.csv')


In [65]:
# print all columns in dataframe
print(df.columns)

Index(['DataLabel', 'TP', 'TE1', 'TE2', 'TE3', 'TE4', 'Program Number',
       'Batch Status', 'Min_ster_Temp', 'Max_ster_Temp', 'Overall Avg TE1',
       'Overall Avg TE2', 'Overall Avg TE3', 'Overall Avg TE4',
       'Overall Avg TP', 'Overall Var TE1', 'Overall Var TE2',
       'Overall Var TE3', 'Overall Var TE4', 'Overall Var TP', 'Time'],
      dtype='object')


In [66]:
# make a new dataframe with only the columns we want which are  'Overall Var TE1', 'Overall Var TE2','Overall Var TE3', 'Overall Var TE4', 'Overall Var TP', 'Batch Status', 'Min_ster_Temp', 'Max_ster_Temp'


# drop the Time column from dataframe
df = df.drop('Time', axis=1)

In [67]:
df.head()

Unnamed: 0,DataLabel,TP,TE1,TE2,TE3,TE4,Program Number,Batch Status,Min_ster_Temp,Max_ster_Temp,Overall Avg TE1,Overall Avg TE2,Overall Avg TE3,Overall Avg TE4,Overall Avg TP,Overall Var TE1,Overall Var TE2,Overall Var TE3,Overall Var TE4,Overall Var TP
0,1,2.11,0.0,122.1,122.2,122.1,8,FAILED,122.1,122.2,47.12069,114.396238,115.152978,114.968966,1.547931,2803.869759,131.966338,112.167656,115.738436,0.657171
1,2,2.10918,121.744262,121.963115,122.161475,122.013934,8,OK,121.2,122.2,92.999427,112.804585,113.555301,113.648711,1.100487,874.492528,127.465898,114.342824,109.503023,0.728475
2,3,2.108934,121.779508,122.040164,122.269672,122.140164,3,OK,121.2,122.4,90.738629,113.893458,114.797819,114.575701,1.074486,925.593441,86.884676,77.099151,81.663595,0.849215
3,4,2.109024,121.760163,121.968293,122.203252,122.082927,11,OK,121.2,122.3,88.138739,113.45045,114.356156,114.064565,1.073874,967.878284,89.466001,81.164036,86.76368,0.824727
4,5,2.109754,121.595902,121.831148,122.027869,121.883607,10,OK,121.2,122.1,92.804469,102.759497,102.860056,103.077933,1.959721,879.638523,473.956702,492.689689,450.135114,0.143143


In [68]:

# do label encoding, apply 1 for OK and apply 0 for all others in Batch status, and later delete the batch status column
df['Batch Status_encoded'] = df['Batch Status'].apply(lambda x: 1 if x == 'OK' else 0)

In [69]:
# replace batch status with batch status encoded
df = df.drop(columns=['Batch Status'])
df = df.rename(columns={'Batch Status_encoded': 'Batch Status'})

In [70]:
def df_to_vectors(df):
    vectors = []
    for i in range(len(df) - 20):
        vector = df.iloc[i:i+20].values.flatten()
        vectors.append(vector)
    return vectors

vectors = df_to_vectors(df)


In [71]:

vectors = np.array(vectors)
vectors.shape

(1840, 400)

In [72]:
# Output of each batch status
batch_statuses = df['Batch Status'].values[21:]
# append the last row as a 1 
batch_statuses = np.append(batch_statuses, 1)

batch_statuses.shape

(1840,)

In [73]:
# check batch statuses value counts
df['Batch Status'].value_counts()


1    1507
0     353
Name: Batch Status, dtype: int64

In [74]:
# Convert dataframe to feature vectors and target variable
X = vectors
y = batch_statuses

In [75]:
# Separate the data into failed and passed batches
failed_indices = np.where(y == 0)[0]
passed_indices = np.where(y == 1)[0]
# Split the failed batches into train and test sets
failed_indices_train, failed_indices_test = train_test_split(failed_indices, test_size=0.2)
# do similar split for passed indices
passed_indices_train, passed_indices_test = train_test_split(passed_indices, test_size=0.2)
# combine train and test indices separately, including passed and failed
train_indices = np.concatenate((failed_indices_train, passed_indices_train))
test_indices = np.concatenate((failed_indices_test, passed_indices_test))


# # Combine the indices of train and test sets with passed indices
# train_indices = np.concatenate((failed_indices_train, passed_indices))
# test_indices = np.concatenate((failed_indices_test, passed_indices[:4*len(failed_indices_test)]))

# Shuffle the indices to ensure randomness
np.random.shuffle(train_indices)
np.random.shuffle(test_indices)

# Use the indices to get the corresponding data
X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]


In [76]:
# print y_train value counts
print('y_train value counts')
print(pd.Series(y_train).value_counts())


y_train value counts
1    1195
0     276
dtype: int64


In [77]:

# Identify minority class label
minority_class_label = 0  # Assuming failed batches are labeled as 0

# Find indices of minority class samples in the training set
minority_indices_train = np.where(y_train == minority_class_label)[0]

# Extract minority class samples in the training set
X_minority_train = X_train[minority_indices_train]
y_minority_train = y_train[minority_indices_train]


In [78]:
# Apply SMOTE to generate synthetic samples for minority class
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [79]:
# Display the shape of the original and resampled train data
print('Original train data shape:', X_train.shape)
print('Resampled train data shape:', X_train_resampled.shape)

# Display the number of samples in each class before and after resampling
print('Original class distribution:')
print( pd.Series(y_train).value_counts())
print('Resampled class distribution:')
print(pd.Series(y_train_resampled).value_counts())


Original train data shape: (1471, 400)
Resampled train data shape: (2390, 400)
Original class distribution:
1    1195
0     276
dtype: int64
Resampled class distribution:
1    1195
0    1195
dtype: int64


In [80]:

# Print the shape of the resampled training data
print("Shape of X_train_resampled:", X_train_resampled.shape)
print("Shape of y_train_resampled:", y_train_resampled.shape)

# Print the shape of all datasets
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train_resampled: (2390, 400)
Shape of y_train_resampled: (2390,)
Shape of X_train: (1471, 400)
Shape of y_train: (1471,)
Shape of X_test: (369, 400)
Shape of y_test: (369,)


In [81]:

# Initialize and train Decision Tree classifier using resampled training data
clf = DecisionTreeClassifier()
clf.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_dt = clf.predict(X_test)
# prediction on train set
y_pred_train = clf.predict(X_train_resampled)

# Calculate accuracy on test set
accuracy_test = accuracy_score(y_test, y_pred_dt)
print("Accuracy on test set:", accuracy_test)

# Calculate accuracy on resampled training set
accuracy_train = accuracy_score(y_train_resampled, y_pred_train)
print("Accuracy on resampled training set:", accuracy_train)

# Calculate mean squared error on test set
mse = mean_squared_error(y_test, y_pred_dt)
print("Mean Squared Error on test set:", mse)

# Generate classification report on test set
class_report = classification_report(y_test, y_pred_dt)
print("Classification Report on test set:\n", class_report)

# Generate confusion matrix on test set
conf_matrix = confusion_matrix(y_test, y_pred_dt)
print("Confusion Matrix on test set:\n", conf_matrix)

Accuracy on test set: 0.7046070460704607
Accuracy on resampled training set: 1.0
Mean Squared Error on test set: 0.2953929539295393
Classification Report on test set:
               precision    recall  f1-score   support

           0       0.27      0.33      0.30        70
           1       0.83      0.79      0.81       299

    accuracy                           0.70       369
   macro avg       0.55      0.56      0.55       369
weighted avg       0.73      0.70      0.72       369

Confusion Matrix on test set:
 [[ 23  47]
 [ 62 237]]


In [82]:

# Initialize and train Random Forest classifier using resampled training data
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_rf = rf_clf.predict(X_test)
y_pred_train_rf = rf_clf.predict(X_train_resampled)

# Calculate accuracy on test set
accuracy_test_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy on test set (Random Forest):", accuracy_test_rf)

# Calculate accuracy on resampled training set
accuracy_train_rf = accuracy_score(y_train_resampled, y_pred_train_rf)
print("Accuracy on resampled training set (Random Forest):", accuracy_train_rf)

# Calculate mean squared error on test set
mse_rf = mean_squared_error(y_test, y_pred_rf)
print("Mean Squared Error on test set (Random Forest):", mse_rf)

# Generate classification report on test set
class_report_rf = classification_report(y_test, y_pred_rf)
print("Classification Report on test set (Random Forest):\n", class_report_rf)

# Generate confusion matrix on test set
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix on test set (Random Forest):\n", conf_matrix_rf)

Accuracy on test set (Random Forest): 0.7994579945799458
Accuracy on resampled training set (Random Forest): 1.0
Mean Squared Error on test set (Random Forest): 0.2005420054200542
Classification Report on test set (Random Forest):
               precision    recall  f1-score   support

           0       0.44      0.21      0.29        70
           1       0.84      0.94      0.88       299

    accuracy                           0.80       369
   macro avg       0.64      0.58      0.59       369
weighted avg       0.76      0.80      0.77       369

Confusion Matrix on test set (Random Forest):
 [[ 15  55]
 [ 19 280]]


In [83]:

# Initialize and train Logistic Regression classifier using resampled training data
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_log_reg = log_reg.predict(X_test)
y_pred_train_log_reg = log_reg.predict(X_train_resampled)

# Calculate accuracy on test set
accuracy_test_log_reg = accuracy_score(y_test, y_pred_log_reg)
print("Accuracy on test set (Logistic Regression):", accuracy_test_log_reg)

# Calculate accuracy on resampled training set
accuracy_train_log_reg = accuracy_score(y_train_resampled, y_pred_train_log_reg)
print("Accuracy on resampled training set (Logistic Regression):", accuracy_train_log_reg)

# Calculate mean squared error on test set
mse_log_reg = mean_squared_error(y_test, y_pred_log_reg)
print("Mean Squared Error on test set (Logistic Regression):", mse_log_reg)

# Generate classification report on test set
class_report_log_reg = classification_report(y_test, y_pred_log_reg)
print("Classification Report on test set (Logistic Regression):\n", class_report_log_reg)

# Generate confusion matrix on test set
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
print("Confusion Matrix on test set (Logistic Regression):\n", conf_matrix_log_reg)

Accuracy on test set (Logistic Regression): 0.6368563685636857
Accuracy on resampled training set (Logistic Regression): 0.7753138075313808
Mean Squared Error on test set (Logistic Regression): 0.36314363143631434
Classification Report on test set (Logistic Regression):
               precision    recall  f1-score   support

           0       0.24      0.43      0.31        70
           1       0.84      0.69      0.75       299

    accuracy                           0.64       369
   macro avg       0.54      0.56      0.53       369
weighted avg       0.72      0.64      0.67       369

Confusion Matrix on test set (Logistic Regression):
 [[ 30  40]
 [ 94 205]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
