In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
# also import for Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold

In [2]:
df = pd.read_csv('results_new2.csv')


In [3]:
df.head()

# 

Unnamed: 0,DataLabel,TP,TE1,TE2,TE3,TE4,Program Number,Batch Status,Min_ster_Temp,Max_ster_Temp,...,Overall Avg TE2,Overall Avg TE3,Overall Avg TE4,Overall Avg TP,Overall Var TE1,Overall Var TE2,Overall Var TE3,Overall Var TE4,Overall Var TP,Time
0,1,2.11,0.0,122.1,122.2,122.1,8,FAILED,122.1,122.2,...,114.396238,115.152978,114.968966,1.547931,2803.869759,131.966338,112.167656,115.738436,0.657171,2019-01-09 09:42:00
1,2,2.10918,121.744262,121.963115,122.161475,122.013934,8,OK,121.2,122.2,...,112.804585,113.555301,113.648711,1.100487,874.492528,127.465898,114.342824,109.503023,0.728475,2019-01-09 11:30:00
2,3,2.108934,121.779508,122.040164,122.269672,122.140164,3,OK,121.2,122.4,...,113.893458,114.797819,114.575701,1.074486,925.593441,86.884676,77.099151,81.663595,0.849215,2019-01-09 12:32:00
3,4,2.109024,121.760163,121.968293,122.203252,122.082927,11,OK,121.2,122.3,...,113.45045,114.356156,114.064565,1.073874,967.878284,89.466001,81.164036,86.76368,0.824727,2019-01-09 14:00:00
4,5,2.109754,121.595902,121.831148,122.027869,121.883607,10,OK,121.2,122.1,...,102.759497,102.860056,103.077933,1.959721,879.638523,473.956702,492.689689,450.135114,0.143143,2019-01-09 16:10:00


In [4]:
# convert the date time to pandas datetime

df['Time'] = pd.to_datetime(df['Time'])
# drop time column
df.drop('Time', axis=1, inplace=True)


In [5]:
df.head()

Unnamed: 0,DataLabel,TP,TE1,TE2,TE3,TE4,Program Number,Batch Status,Min_ster_Temp,Max_ster_Temp,Overall Avg TE1,Overall Avg TE2,Overall Avg TE3,Overall Avg TE4,Overall Avg TP,Overall Var TE1,Overall Var TE2,Overall Var TE3,Overall Var TE4,Overall Var TP
0,1,2.11,0.0,122.1,122.2,122.1,8,FAILED,122.1,122.2,47.12069,114.396238,115.152978,114.968966,1.547931,2803.869759,131.966338,112.167656,115.738436,0.657171
1,2,2.10918,121.744262,121.963115,122.161475,122.013934,8,OK,121.2,122.2,92.999427,112.804585,113.555301,113.648711,1.100487,874.492528,127.465898,114.342824,109.503023,0.728475
2,3,2.108934,121.779508,122.040164,122.269672,122.140164,3,OK,121.2,122.4,90.738629,113.893458,114.797819,114.575701,1.074486,925.593441,86.884676,77.099151,81.663595,0.849215
3,4,2.109024,121.760163,121.968293,122.203252,122.082927,11,OK,121.2,122.3,88.138739,113.45045,114.356156,114.064565,1.073874,967.878284,89.466001,81.164036,86.76368,0.824727
4,5,2.109754,121.595902,121.831148,122.027869,121.883607,10,OK,121.2,122.1,92.804469,102.759497,102.860056,103.077933,1.959721,879.638523,473.956702,492.689689,450.135114,0.143143


In [6]:

# do label encoding, apply 1 for OK and apply 0 for all others in Batch status, and later delete the batch status column
df['Batch Status_encoded'] = df['Batch Status'].apply(lambda x: 1 if x == 'OK' else 0)

In [7]:
# replace batch status with batch status encoded
df = df.drop(columns=['Batch Status'])
df = df.rename(columns={'Batch Status_encoded': 'Batch Status'})

In [8]:

df.drop('DataLabel',axis=1,inplace=True)

def df_to_vectors(df):
    vectors = []
    for i in range(len(df) - 15):
        vector = df.iloc[i:i+15].values.flatten()
        vectors.append(vector)
    return vectors

vectors = df_to_vectors(df)



In [9]:

vectors = np.array(vectors)
vectors.shape

(1845, 285)

In [10]:
# Output of each batch status
batch_statuses = df['Batch Status'].values[16:]
# append the last row as a 1 
batch_statuses = np.append(batch_statuses, 1)

batch_statuses.shape

(1845,)

In [11]:
# check batch statuses value counts
df['Batch Status'].value_counts()


1    1507
0     353
Name: Batch Status, dtype: int64

In [12]:
# Convert dataframe to feature vectors and target variable
X = vectors
y = batch_statuses

In [13]:
# Separate the data into failed and passed batches
failed_indices = np.where(y == 0)[0]
passed_indices = np.where(y == 1)[0]
# Split the failed batches into train and test sets
failed_indices_train, failed_indices_test = train_test_split(failed_indices, test_size=0.3)
# do similar split for passed indices
passed_indices_train, passed_indices_test = train_test_split(passed_indices, test_size=0.3)
# combine train and test indices separately, including passed and failed
train_indices = np.concatenate((failed_indices_train, passed_indices_train))
test_indices = np.concatenate((failed_indices_test, passed_indices_test))


# # Combine the indices of train and test sets with passed indices
# train_indices = np.concatenate((failed_indices_train, passed_indices))
# test_indices = np.concatenate((failed_indices_test, passed_indices[:4*len(failed_indices_test)]))

# Shuffle the indices to ensure randomness
np.random.shuffle(train_indices)
np.random.shuffle(test_indices)

# Use the indices to get the corresponding data
X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]


In [14]:
# print y_train value counts
print('y_train value counts')
print(pd.Series(y_train).value_counts())


y_train value counts
1    1047
0     243
dtype: int64


In [15]:

# Identify minority class label
minority_class_label = 0  # Assuming failed batches are labeled as 0

# Find indices of minority class samples in the training set
minority_indices_train = np.where(y_train == minority_class_label)[0]

# Extract minority class samples in the training set
X_minority_train = X_train[minority_indices_train]
y_minority_train = y_train[minority_indices_train]


In [16]:
# Apply SMOTE to generate synthetic samples for minority class
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [17]:
# Display the shape of the original and resampled train data
print('Original train data shape:', X_train.shape)
print('Resampled train data shape:', X_train_resampled.shape)

# Display the number of samples in each class before and after resampling
print('Original class distribution:')
print( pd.Series(y_train).value_counts())
print('Resampled class distribution:')
print(pd.Series(y_train_resampled).value_counts())


Original train data shape: (1290, 285)
Resampled train data shape: (2094, 285)
Original class distribution:
1    1047
0     243
dtype: int64
Resampled class distribution:
1    1047
0    1047
dtype: int64


In [18]:

# Print the shape of the resampled training data
print("Shape of X_train_resampled:", X_train_resampled.shape)
print("Shape of y_train_resampled:", y_train_resampled.shape)

# Print the shape of all datasets
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train_resampled: (2094, 285)
Shape of y_train_resampled: (2094,)
Shape of X_train: (1290, 285)
Shape of y_train: (1290,)
Shape of X_test: (555, 285)
Shape of y_test: (555,)


In [19]:

# Initialize and train Decision Tree classifier using resampled training data
clf = DecisionTreeClassifier()
clf.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_dt = clf.predict(X_test)
# prediction on train set
y_pred_train = clf.predict(X_train_resampled)

# Calculate accuracy on test set
accuracy_test = accuracy_score(y_test, y_pred_dt)
print("Accuracy on test set:", accuracy_test)

# Calculate accuracy on resampled training set
accuracy_train = accuracy_score(y_train_resampled, y_pred_train)
print("Accuracy on resampled training set:", accuracy_train)

# Calculate mean squared error on test set
mse = mean_squared_error(y_test, y_pred_dt)
print("Mean Squared Error on test set:", mse)

# Generate classification report on test set
class_report = classification_report(y_test, y_pred_dt)
print("Classification Report on test set:\n", class_report)

# Generate confusion matrix on test set
conf_matrix = confusion_matrix(y_test, y_pred_dt)
print("Confusion Matrix on test set:\n", conf_matrix)

Accuracy on test set: 0.6846846846846847
Accuracy on resampled training set: 1.0
Mean Squared Error on test set: 0.3153153153153153
Classification Report on test set:
               precision    recall  f1-score   support

           0       0.25      0.34      0.29       105
           1       0.83      0.76      0.80       450

    accuracy                           0.68       555
   macro avg       0.54      0.55      0.54       555
weighted avg       0.72      0.68      0.70       555

Confusion Matrix on test set:
 [[ 36  69]
 [106 344]]


In [20]:

# Initialize and train Random Forest classifier using resampled training data
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_rf = rf_clf.predict(X_test)
y_pred_train_rf = rf_clf.predict(X_train_resampled)

# Calculate accuracy on test set
accuracy_test_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy on test set (Random Forest):", accuracy_test_rf)

# Calculate accuracy on resampled training set
accuracy_train_rf = accuracy_score(y_train_resampled, y_pred_train_rf)
print("Accuracy on resampled training set (Random Forest):", accuracy_train_rf)

# Calculate mean squared error on test set
mse_rf = mean_squared_error(y_test, y_pred_rf)
print("Mean Squared Error on test set (Random Forest):", mse_rf)

# Generate classification report on test set
class_report_rf = classification_report(y_test, y_pred_rf)
print("Classification Report on test set (Random Forest):\n", class_report_rf)

# Generate confusion matrix on test set
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix on test set (Random Forest):\n", conf_matrix_rf)

Accuracy on test set (Random Forest): 0.7927927927927928
Accuracy on resampled training set (Random Forest): 1.0
Mean Squared Error on test set (Random Forest): 0.2072072072072072
Classification Report on test set (Random Forest):
               precision    recall  f1-score   support

           0       0.41      0.23      0.29       105
           1       0.84      0.92      0.88       450

    accuracy                           0.79       555
   macro avg       0.63      0.58      0.59       555
weighted avg       0.76      0.79      0.77       555

Confusion Matrix on test set (Random Forest):
 [[ 24  81]
 [ 34 416]]


In [21]:

# Initialize and train Logistic Regression classifier using resampled training data
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_log_reg = log_reg.predict(X_test)
y_pred_train_log_reg = log_reg.predict(X_train_resampled)

# Calculate accuracy on test set
accuracy_test_log_reg = accuracy_score(y_test, y_pred_log_reg)
print("Accuracy on test set (Logistic Regression):", accuracy_test_log_reg)

# Calculate accuracy on resampled training set
accuracy_train_log_reg = accuracy_score(y_train_resampled, y_pred_train_log_reg)
print("Accuracy on resampled training set (Logistic Regression):", accuracy_train_log_reg)

# Calculate mean squared error on test set
mse_log_reg = mean_squared_error(y_test, y_pred_log_reg)
print("Mean Squared Error on test set (Logistic Regression):", mse_log_reg)

# Generate classification report on test set
class_report_log_reg = classification_report(y_test, y_pred_log_reg)
print("Classification Report on test set (Logistic Regression):\n", class_report_log_reg)

# Generate confusion matrix on test set
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
print("Confusion Matrix on test set (Logistic Regression):\n", conf_matrix_log_reg)

Accuracy on test set (Logistic Regression): 0.6558558558558558
Accuracy on resampled training set (Logistic Regression): 0.7956064947468959
Mean Squared Error on test set (Logistic Regression): 0.3441441441441441
Classification Report on test set (Logistic Regression):
               precision    recall  f1-score   support

           0       0.27      0.49      0.35       105
           1       0.85      0.70      0.77       450

    accuracy                           0.66       555
   macro avg       0.56      0.59      0.56       555
weighted avg       0.74      0.66      0.69       555

Confusion Matrix on test set (Logistic Regression):
 [[ 51  54]
 [137 313]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
def df_to_vectors(df, vector_size):
    vectors = []
    for i in range(len(df) - vector_size):
        vector = df.iloc[i:i+vector_size].values.flatten()
        vectors.append(vector)
    return vectors

best_accuracy = 0
best_vector_size = 0

# Try different vector sizes from 5 to 40
for vector_size in range(5, 41):
    vectors = df_to_vectors(df, vector_size)
    
    batch_statuses = df['Batch Status'].values[vector_size + 1:]
    # append the last row as a 1 
    batch_statuses = np.append(batch_statuses, 1)
    # Split the data into features (X) and target (y)
    X = vectors
    y = batch_statuses  # Target is the last column

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Random Forest classifier
    rf_clf = RandomForestClassifier()
    rf_clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred_rf = rf_clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred_rf)

    # Update best accuracy and best vector size if current accuracy is higher
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_vector_size = vector_size

print("Best Vector Size:", best_vector_size)
print("Best Accuracy:", best_accuracy)


Best Vector Size: 7
Best Accuracy: 0.8517520215633423


In [23]:
# print the confusion matrix for the best vector size
vectors = df_to_vectors(df, best_vector_size)

batch_statuses = df['Batch Status'].values[best_vector_size + 1:]
# append the last row as a 1
batch_statuses = np.append(batch_statuses, 1)
# Split the data into features (X) and target (y)
X = vectors
y = batch_statuses  # Target is the last column

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest classifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_clf.predict(X_test)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:\n", conf_matrix)

# print accuracy
accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", accuracy)




Confusion Matrix:
 [[ 11  49]
 [  7 304]]
Accuracy: 0.8490566037735849


In [24]:
from sklearn.tree import DecisionTreeClassifier

def df_to_vectors(df, vector_size):
    vectors = []
    for i in range(len(df) - vector_size):
        vector = df.iloc[i:i+vector_size].values.flatten()
        vectors.append(vector)
    return vectors

best_accuracy = 0
best_vector_size = 0

# Try different vector sizes from 5 to 40
for vector_size in range(5, 41):
    vectors = df_to_vectors(df, vector_size)
    
    batch_statuses = df['Batch Status'].values[vector_size + 1:]
    # append the last row as a 1 
    batch_statuses = np.append(batch_statuses, 1)
    # Split the data into features (X) and target (y)
    X = vectors
    y = batch_statuses  # Target is the last column

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Decision Tree classifier
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred_dt = dt_clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred_dt)

    # Update best accuracy and best vector size if current accuracy is higher
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_vector_size = vector_size

print("Best Vector Size:", best_vector_size)
print("Best Accuracy:", best_accuracy)


Best Vector Size: 37
Best Accuracy: 0.7561643835616438


In [25]:
# print the confusion matrix for the best vector size
vectors = df_to_vectors(df, best_vector_size)

batch_statuses = df['Batch Status'].values[best_vector_size + 1:]
# append the last row as a 1
batch_statuses = np.append(batch_statuses, 1)
# Split the data into features (X) and target (y)
X = vectors
y = batch_statuses  # Target is the last column

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train decision tree classifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = dt_clf.predict(X_test)


# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_dt)
print("Confusion Matrix:\n", conf_matrix)

# print accuracy
accuracy = accuracy_score(y_test, y_pred_dt)
print("Accuracy:", accuracy)


Confusion Matrix:
 [[ 20  51]
 [ 48 246]]
Accuracy: 0.7287671232876712


In [26]:
from sklearn.metrics import precision_score, recall_score

best_accuracy = 0
best_precision = 0
best_recall = 0
best_vector_size = 0

# Try different vector sizes from 5 to 40
for vector_size in range(5, 41):
    vectors = df_to_vectors(df, vector_size)
    
    batch_statuses = df['Batch Status'].values[vector_size + 1:]
    # append the last row as a 1 
    batch_statuses = np.append(batch_statuses, 1)
    # Split the data into features (X) and target (y)
    X = vectors
    y = batch_statuses  # Target is the last column

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Logistic Regression classifier
    log_reg = LogisticRegression(max_iter=500)
    log_reg.fit(X_train, y_train)

    # Predict on the test set
    y_pred_log_reg = log_reg.predict(X_test)
    # use recall score for recall
    recall = recall_score(y_test, y_pred_log_reg)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred_log_reg)
    
    # Calculate precision
    conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
    # find precision and recall
    precision = conf_matrix_log_reg[0,0]/(conf_matrix_log_reg[0,1]+conf_matrix_log_reg[0,0])
    

    # Update best accuracy and best vector size if current accuracy is higher
    if  precision > best_precision:
        best_accuracy = accuracy
        best_precision = precision
        best_vector_size = vector_size

print("Best Vector Size:", best_vector_size)
print("Best Accuracy:", best_accuracy)
print("Best Precision:", best_precision)
print("Best Recall:", best_recall)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Vector Size: 32
Best Accuracy: 0.76775956284153
Best Precision: 0.3188405797101449
Best Recall: 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# Best Vector Size: 7
# Best Accuracy: 0.8436657681940701
# for logistic regression
# give the confusion matrix for vector size 7
vectors = df_to_vectors(df, 32)
batch_statuses = df['Batch Status'].values[33:]
# append the last row as a 1
batch_statuses = np.append(batch_statuses, 1)
X = vectors
y = batch_statuses
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
print("Confusion Matrix on test set (Logistic Regression):\n", conf_matrix_log_reg)



print("Precision:", precision)

Confusion Matrix on test set (Logistic Regression):
 [[ 22  47]
 [ 38 259]]
Precision: 0.2835820895522388


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
# Function to calculate precision from confusion matrix
def calculate_precision(conf_matrix):
    return conf_matrix[0, 0] / (conf_matrix[0, 1] + conf_matrix[0, 0])

# Initialize best metrics
best_accuracy = 0
best_precision = 0
best_recall = 0
best_vector_size = 0

# Try different vector sizes from 5 to 40
for vector_size in range(5, 41):
    vectors = df_to_vectors(df, vector_size)
    
    batch_statuses = df['Batch Status'].values[vector_size + 1:]
    # Append the last row as a 1 
    batch_statuses = np.append(batch_statuses, 1)
    # Split the data into features (X) and target (y)
    X = vectors
    y = batch_statuses  # Target is the last column

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Decision Tree classifier
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred_dt = dt_clf.predict(X_test)

    # Calculate confusion matrix
    conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)

    # Calculate accuracy, precision, and recall
    accuracy = accuracy_score(y_test, y_pred_dt)
    precision = calculate_precision(conf_matrix_dt)
    recall = recall_score(y_test, y_pred_dt)

    # Update best metrics and vector size if current precision is higher
    if precision > best_precision:
        best_accuracy = accuracy
        best_precision = precision
        best_recall = recall
        best_vector_size = vector_size

print("Decision Tree:")
print("Best Vector Size:", best_vector_size)
print("Best Accuracy:", best_accuracy)
print("Best Precision:", best_precision)
print("Best Recall:", best_recall)


Decision Tree:
Best Vector Size: 5
Best Accuracy: 0.7412398921832885
Best Precision: 0.42028985507246375
Best Recall: 0.8145695364238411


In [31]:
# print the confusion matrix for the best vector size
vectors = df_to_vectors(df, best_vector_size)

batch_statuses = df['Batch Status'].values[best_vector_size + 1:]
# append the last row as a 1
batch_statuses = np.append(batch_statuses, 1)
# Split the data into features (X) and target (y)
X = vectors
y = batch_statuses  # Target is the last column

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train decision tree classifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = dt_clf.predict(X_test)


# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_dt)
print("Confusion Matrix:\n", conf_matrix)

# print accuracy
accuracy = accuracy_score(y_test, y_pred_dt)
print("Accuracy:", accuracy)

Confusion Matrix:
 [[ 29  40]
 [ 60 242]]
Accuracy: 0.7304582210242587


In [47]:
# Function to calculate precision from confusion matrix
def calculate_precision(conf_matrix):
    return conf_matrix[0, 0] / (conf_matrix[0, 1] + conf_matrix[0, 0])

# Initialize best metrics and best confusion matrix
best_accuracy = 0
best_precision = 0
best_recall = 0
best_vector_size = 0
best_conf_matrix = None

# Try different vector sizes from 5 to 40
for vector_size in range(5, 41):
    vectors = df_to_vectors(df, vector_size)
    
    batch_statuses = df['Batch Status'].values[vector_size + 1:]
    # Append the last row as a 1 
    batch_statuses = np.append(batch_statuses, 1)
    # Split the data into features (X) and target (y)
    X = vectors
    y = batch_statuses  # Target is the last column

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Random Forest classifier
    rf_clf = RandomForestClassifier()
    rf_clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred_rf = rf_clf.predict(X_test)

    # Calculate confusion matrix
    conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
    
    # Calculate accuracy, precision, and recall precis
    accuracy = accuracy_score(y_test, y_pred_rf)
    precis = calculate_precision(conf_matrix_rf)
    recall = recall_score(y_test, y_pred_rf)

    # Update best metrics and vector size if current precision is higher
    if precis > best_precision:
        best_accuracy = accuracy
        best_precision = precis
        best_recall = recall
        best_vector_size = vector_size
        best_conf_matrix = conf_matrix_rf

print("Random Forest:")
print("Best Vector Size:", best_vector_size)
print("Best Accuracy:", best_accuracy)
print("Best Precision:", best_precision)
print("Best Recall:", best_recall)
print("Best Confusion Matrix:")
print(best_conf_matrix)



Random Forest:
Best Vector Size: 7
Best Accuracy: 0.862533692722372
Best Precision: 0.21666666666666667
Best Recall: 0.9871382636655949
Best Confusion Matrix:
[[ 13  47]
 [  4 307]]


In [138]:
# Generate vectors with vector size 9
vectors = df_to_vectors(df, 9)
batch_statuses = df['Batch Status'].values[10:]  # Adjust index for vector size 9
# Append the last row as a 1
batch_statuses = np.append(batch_statuses, 1)
X = vectors
y = batch_statuses

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree classifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = dt_clf.predict(X_test)

# Calculate confusion matrix
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)

# Print confusion matrix
print("Confusion Matrix on test set (Decision Tree):\n", conf_matrix_dt)
tree_depth = dt_clf.tree_.max_depth
print("Tree Depth:", tree_depth)

Confusion Matrix on test set (Decision Tree):
 [[ 26  54]
 [ 52 239]]
Tree Depth: 18


In [50]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
import numpy as np

# Function to calculate precision from confusion matrix
def calculate_precision(conf_matrix):
    return conf_matrix[0, 0] / (conf_matrix[0, 1] + conf_matrix[0, 0])

# Initialize best metrics and best confusion matrix
best_accuracy = 0
best_precision = 0
best_recall = 0
best_vector_size = 0
best_conf_matrix = None

# Try different vector sizes from 5 to 40
for vector_size in range(5, 41):
    vectors = df_to_vectors(df, vector_size)
    
    batch_statuses = df['Batch Status'].values[vector_size + 1:]
    # Append the last row as a 1 
    batch_statuses = np.append(batch_statuses, 1)
    # Split the data into features (X) and target (y)
    X = vectors
    y = batch_statuses  # Target is the last column

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Logistic Regression classifier
    lr_clf = LogisticRegression()
    lr_clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred_lr = lr_clf.predict(X_test)

    # Calculate confusion matrix
    conf_matrix_lr = confusion_matrix(y_test, y_pred_lr)
    
    # Calculate accuracy, precision, and recall precis
    accuracy = accuracy_score(y_test, y_pred_lr)
    precis = calculate_precision(conf_matrix_lr)
    recall = recall_score(y_test, y_pred_lr)

    # Update best metrics and vector size if current precision is higher
    if precis > best_precision:
        best_accuracy = accuracy
        best_precision = precis
        best_recall = recall
        best_vector_size = vector_size
        best_conf_matrix = conf_matrix_lr

print("Logistic Regression:")
print("Best Vector Size:", best_vector_size)
print("Best Accuracy:", best_accuracy)
print("Best Precision:", best_precision)
print("Best Recall:", best_recall)
print("Best Confusion Matrix:")
print(best_conf_matrix)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression:
Best Vector Size: 32
Best Accuracy: 0.7923497267759563
Best Precision: 0.3188405797101449
Best Recall: 0.9023569023569024
Best Confusion Matrix:
[[ 22  47]
 [ 29 268]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
# adjusting depth of decision tree to maximize accuracy
# Generate vectors with vector size 9
vectors = df_to_vectors(df, 9)
batch_statuses = df['Batch Status'].values[10:]  # Adjust index for vector size 9
# Append the last row as a 1
batch_statuses = np.append(batch_statuses, 1)
X = vectors
y = batch_statuses

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree classifier with maximum depth
max_depth = None  # Set to None initially
best_accuracy = 0
for depth in range(1, 20):  # Try depths from 1 to 10
    dt_clf = DecisionTreeClassifier(max_depth=depth)
    dt_clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred_dt = dt_clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred_dt)

    # Update best accuracy and depth if current accuracy is higher
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        max_depth = depth

# Retrain Decision Tree classifier with best depth
dt_clf = DecisionTreeClassifier(max_depth=max_depth)
dt_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = dt_clf.predict(X_test)

# Calculate confusion matrix
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)

# Print confusion matrix
print("Confusion Matrix on test set (Decision Tree):\n", conf_matrix_dt)
print("Best Depth for Decision Tree:", max_depth)
print("Best Accuracy:", best_accuracy)


Confusion Matrix on test set (Decision Tree):
 [[  0  80]
 [  0 291]]
Best Depth for Decision Tree: 1
Best Accuracy: 0.784366576819407
