# Silicon Wafer Fault Detection ML Code

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
warnings.warn("this will not show")

In [None]:
df = pd.read_csv("uci-secom.csv",index_col=0)

# Cleaning Dataset

In [None]:
df.isnull().any().any()

In [None]:
df = df.replace('?', np.nan)
df = df.apply(lambda x: x.fillna(0),axis=0)
df.isnull().any().any()

# Removing constant (single-valued) features

In [None]:
for i in range(len(df.columns) - 2, -1, -1):
    unique_vals = df.iloc[:,i].unique()
    print(len(unique_vals))
    if len(unique_vals)==1:
        df.drop(df.columns[i],axis=1, inplace=True)

In [None]:
# Rename the remaining features uniformly (excluding the last column)
df.columns = [str(i) for i in range(1, df.shape[1])] + [df.columns[-1]]
print(df.shape)

# Handling Outliers

In [None]:
def cap_outliers_iqr(dataframe, features, cap_value=1.5):
    """Clip each listed column to its IQR fences, in place.

    For every column name in ``features`` the lower fence is
    ``Q1 - cap_value * IQR`` and the upper fence is ``Q3 + cap_value * IQR``;
    values outside the fences are replaced by the nearest fence.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the columns to cap. It is modified in place, so pass a
        copy when the original values must be kept.
    features : iterable of str
        Column names to process.
    cap_value : float, default 1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    for name in features:
        column = dataframe[name]
        first_quartile, third_quartile = column.quantile([0.25, 0.75])
        spread = third_quartile - first_quartile
        floor = first_quartile - cap_value * spread
        ceiling = third_quartile + cap_value * spread
        dataframe[name] = np.clip(column, floor, ceiling)
    return dataframe

# Cap every feature column; the last column is treated as the target label
# (as in the constant-column and renaming steps) and is left untouched.
# Deriving the list from df instead of a hard-coded range(1, 479) keeps this
# cell working whatever number of feature columns survived the earlier steps.
all_features_to_process = list(df.columns[:-1])

# Work on a copy so the un-capped df is still available for the "before" plot.
df_capped = cap_outliers_iqr(df.copy(), all_features_to_process)

plt.figure(figsize=(12, 15))

# Box plot before capping outliers
plt.subplot(2, 1, 1)
df[all_features_to_process].boxplot()
plt.title('Box Plot Before Capping Outliers (IQR)')

# Box plot after capping outliers
plt.subplot(2, 1, 2)
df_capped[all_features_to_process].boxplot()
plt.title('Box Plot After Capping Outliers (IQR)')

# Adjust layout for better visualization
plt.tight_layout()

# Show the plots
plt.show()

# updating df
df = df_capped


# Scaling

In [None]:
X = df.drop('Pass/Fail', axis=1)
X = X.to_numpy()
y = df[['Pass/Fail']]
y = y.squeeze()

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Removing highly Correlated features

In [None]:

def remove_collinear_features(x, threshold):
    """Drop columns that are highly correlated with an earlier column.

    The columns of ``x`` are labelled "1".."n". For every pair
    (earlier column, later column) whose absolute Pearson correlation is
    greater than or equal to ``threshold``, the pair and its correlation are
    printed and the *later* column is marked for removal.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array of shape (n_samples, n_features).
    threshold : float
        Absolute correlation at or above which a column is dropped.

    Returns
    -------
    numpy.ndarray
        ``x`` without the dropped columns. If no pair reaches the threshold
        (or there is only one column) the data is returned unchanged.
    """
    # Convert NumPy array to Pandas DataFrame
    x_df = pd.DataFrame(x, columns=[str(i) for i in range(1, x.shape[1] + 1)])

    abs_corr = x_df.corr().abs()
    names = abs_corr.columns

    # Initialised up front so the drop below is valid even when nothing
    # exceeds the threshold.
    drops = set()

    # Walk the strict upper triangle: each column against all earlier columns
    for col_pos in range(1, len(names)):
        for row_pos in range(col_pos):
            val = abs_corr.iat[row_pos, col_pos]

            # NaN correlations (e.g. constant columns) compare False here
            if val >= threshold:
                # Print the correlated features and the correlation value
                print(names[col_pos], "|", names[row_pos], "|", round(val, 2))
                drops.add(names[col_pos])

    x_df = x_df.drop(columns=list(drops))

    # Convert DataFrame back to NumPy array
    return x_df.to_numpy()

# Assuming X is your NumPy array
X_filtered = remove_collinear_features(X, 0.9)

In [None]:
print("Original shape:", X.shape)
print("Shape after removing highly correlated features:", X_filtered.shape)
X=X_filtered

# Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
X = pca.fit_transform(X)

In [None]:
X.shape

In [None]:
explained_variance = pca.explained_variance_ratio_
#print(explained_variance)
cumulative_explained_variance = explained_variance.cumsum()
#print(cumulative_explained_variance)
plt.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o', linestyle='--')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance')
plt.title('Scree Plot')
plt.show()

# Hold Out Samples for Inferencing

In [None]:
# Number of rows to hold out from each class for the final inference check
num_rows_to_remove_per_class = 10

# Check the shapes and counts before removal.
# SECOM label encoding: -1 = pass (not defected), 1 = fail (defected).
print("Original X size", X.shape)
print("Original y size", y.shape)
defected_count = y[y == 1].shape[0]
not_defected_count = y[y == -1].shape[0]
print("Defected count in original data:", defected_count)
print("Not defected count in original data:", not_defected_count)

# Identify the indices of rows to be removed for each class
true_indices_remove = y[y == 1].sample(num_rows_to_remove_per_class, random_state=42).index
false_indices_remove = y[y == -1].sample(num_rows_to_remove_per_class, random_state=42).index

# Index labels of every held-out row (computed once and reused below)
holdout_indices = true_indices_remove.union(false_indices_remove)

# Create a DataFrame to store the removed rows together with their labels
X_df = pd.DataFrame(X, index=y.index)
removed_data = X_df.loc[holdout_indices].copy()
removed_data = removed_data.join(y[holdout_indices])

# Drop the selected rows from X and y (X is positional, so use a boolean mask)
keep_mask = ~df.index.isin(holdout_indices)
X = X[keep_mask]
y = y[keep_mask]

# Check the new shapes and counts
print("new X size", X.shape)
print("new y size", y.shape)
print("Removed data shape:", removed_data.shape)

defected_count = y[y == 1].shape[0]
not_defected_count = y[y == -1].shape[0]
print("Defected count in new data:", defected_count)
print("Not defected count in new data:", not_defected_count)


In [None]:
y[y == -1] = 0

In [None]:
print(removed_data['Pass/Fail'])

In [None]:
# Assign the result back rather than calling replace(..., inplace=True) on the
# selected column: the chained in-place form acts on a temporary object and
# does not update removed_data when pandas Copy-on-Write is active.
removed_data['Pass/Fail'] = removed_data['Pass/Fail'].replace(-1, 0)
print(removed_data['Pass/Fail'])

# Balancing

In [None]:
from imblearn.combine import SMOTETomek
smt = SMOTETomek()

In [None]:
# Labels are 0/1 at this point: 0 is the former -1 (pass, not defected) and
# 1 is fail (defected), matching the class_names order used for the tree plot.
defected_count = y.to_list().count(1)
not_defected_count = y.to_list().count(0)
print("Data set::")
print("Defected count:", defected_count)
print("Not defected count:", not_defected_count)

In [None]:
# Apply SMOTE to the dataset (smt is a SMOTETomek instance, i.e. SMOTE
# over-sampling combined with Tomek-link cleaning).
# NOTE(review): the resampling is fitted on the full X, y and the train/test
# split is only performed in a later cell, so synthetic samples derived from
# the whole dataset can end up in the test partition and inflate the reported
# test metrics. Consider splitting first and resampling only the training
# part.
X, y = smt.fit_resample(X, y)
# Print the balanced class distribution
print("Class distribution after applying SMOTE to dataset:")
print(y.value_counts())

In [None]:
# Labels are 0/1 here: 0 is the former -1 (pass, not defected) and 1 is fail
# (defected), matching the class_names order used for the tree plot.
defected_count = y.to_list().count(1)
not_defected_count = y.to_list().count(0)
print("Data set::")
print("Defected count:", defected_count)
print("Not defected count:", not_defected_count)

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# K-means clustering

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Range of k values to test
k_values = range(1, 50)  # You can adjust this range

# Sum of squared distances for each k
ssd = []

# Fit K-Means for each k and compute sum of squared distances
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(X_train)
    ssd.append(kmeans.inertia_)

# Plot the Elbow Method graph
plt.figure(figsize=(8, 6))
plt.plot(k_values, ssd, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Sum of Squared Distances')
plt.grid(True)
plt.show()

In [None]:
# Import the K-Means clustering algorithm
from sklearn.cluster import KMeans

# Create a K-Means model with a specified number of clusters
n_clusters = 5 # You can adjust the number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=0)

# Fit the K-Means model to your data
kmeans.fit(X_train)  # Use the training data for clustering

# Get the cluster assignments for each data point
train_cluster_labels = kmeans.predict(X_train)

# Plot the clusters in 2D
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_train[:, 0], y=X_train[:, 1], hue=train_cluster_labels, palette='viridis')
plt.title("K-Means Clustering")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title='Cluster')
plt.show()


# KNN

# Find Optimal value of K

In [None]:
from sklearn.neighbors import KNeighborsClassifier
error_rate = []
for i in range(1,50):
    
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
    
plt.plot(range(1,50),error_rate,color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [None]:
min_value = min(error_rate)  # Find the smallest value in the list
# error_rate[0] was recorded for K=1, so the list position is one less than
# the K it belongs to. Without the +1 the selected K is off by one and can be
# 0, which KNeighborsClassifier rejects.
min_k = error_rate.index(min_value) + 1
print(min_value)
print(min_k)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score
knn = KNeighborsClassifier(n_neighbors=min_k)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)

train_accuracy_k = knn.score(X_train, y_train)
test_accuracy_k = knn.score(X_test, y_test)
print("Train Accuracy: ",train_accuracy_k)
print("Test Accuracy: ",test_accuracy_k)

accuracy_k = accuracy_score(y_test, pred)
print("Accuracy: ",accuracy_k)

In [None]:
print("Confusion matrix :")
print(confusion_matrix(y_test,pred))

print("Classification Report:")
report_k=classification_report(y_test,pred)
print(report_k)
lines = report_k.split('\n')

In [None]:
from sklearn.metrics import roc_auc_score
f1_score_line = lines[2]
f1_score_k= float(f1_score_line.split()[3])
print("F1 Score:", f1_score_k)

f1_score_line = lines[2]
precision_k= float(f1_score_line.split()[1])
print("Precision:", precision_k)

f1_score_line = lines[2]
recall_k= float(f1_score_line.split()[2])
print("Recall:", recall_k)

auc_roc = roc_auc_score(y_test, pred)
print("AUC-ROC:", auc_roc)

In [None]:
from sklearn.metrics import roc_curve
fpr,tpr,thresholds=roc_curve(y_test,pred)
plt.plot(fpr,tpr,label='ROC_curve(AUC={:.2f})'.format(auc_roc))
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
  
classifier = LogisticRegression(max_iter=500)
classifier.fit(X_train, y_train)

In [None]:
pred = classifier.predict(X_test)

train_accuracy_l1 = classifier.score(X_train, y_train)
test_accuracy_l1 = classifier.score(X_test, y_test)
print("Train Accuracy: ",train_accuracy_l1)
print("Test Accuracy: ",test_accuracy_l1)

accuracy_l1 = accuracy_score(y_test, pred)
print("Accuracy: ",accuracy_l1)

In [None]:
print("Confusion matrix :")
print(confusion_matrix(y_test,pred))

print("Classification Report:")
report_l1=classification_report(y_test,pred)
print(report_l1)
lines = report_l1.split('\n')


In [None]:
f1_score_line = lines[2]
f1_score_l1= float(f1_score_line.split()[3])
print("F1 Score:", f1_score_l1)

f1_score_line = lines[2]
precision_l1= float(f1_score_line.split()[1])
print("Precision:", precision_l1)

f1_score_line = lines[2]
recall_l1= float(f1_score_line.split()[2])
print("Recall:", recall_l1)

auc_roc = roc_auc_score(y_test, pred)
print("AUC-ROC:", auc_roc)

# Boosting For Logistic Regression

In [None]:
from sklearn.ensemble import AdaBoostClassifier
classifier = LogisticRegression(max_iter=500)
boosted_classifier = AdaBoostClassifier(base_estimator=classifier, n_estimators=200)
boosted_classifier.fit(X_train, y_train)

# Make predictions on the test set
pred = boosted_classifier.predict(X_test)


In [None]:
train_accuracy_l = boosted_classifier.score(X_train, y_train)
test_accuracy_l = boosted_classifier.score(X_test, y_test)
print("Train Accuracy: ",train_accuracy_l)
print("Test Accuracy: ",test_accuracy_l)

accuracy_l = accuracy_score(y_test, pred)
print("Accuracy: ",accuracy_l)

In [None]:
print("Confusion matrix :")
print(confusion_matrix(y_test,pred))

print("Classification Report:")
report_l=classification_report(y_test,pred)
print(report_l)
lines = report_l.split('\n')

In [None]:
f1_score_line = lines[2]
f1_score_l= float(f1_score_line.split()[3])
print("F1 Score:", f1_score_l)

f1_score_line = lines[2]
precision_l= float(f1_score_line.split()[1])
print("Precision:", precision_l)

f1_score_line = lines[2]
recall_l= float(f1_score_line.split()[2])
print("Recall:", recall_l)

auc_roc = roc_auc_score(y_test, pred)
print("AUC-ROC:", auc_roc)

In [None]:
fpr,tpr,thresholds=roc_curve(y_test,pred)
plt.plot(fpr,tpr,label='ROC_curve(AUC={:.2f})'.format(auc_roc))
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

# Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

naive_bayes_classifier = BernoulliNB()
naive_bayes_classifier.fit(X_train, y_train)

pred = naive_bayes_classifier.predict(X_test)


In [None]:
train_accuracy_n1 = naive_bayes_classifier.score(X_train, y_train)
test_accuracy_n1 = naive_bayes_classifier.score(X_test, y_test)
print("Train Accuracy: ",train_accuracy_n1)
print("Test Accuracy: ",test_accuracy_n1)

accuracy_n1 = accuracy_score(y_test, pred)
print("Accuracy: ",accuracy_n1)

In [None]:
print("Confusion matrix :")
print(confusion_matrix(y_test,pred))

print("Classification Report:")
report_n1=classification_report(y_test,pred)
print(report_n1)
lines = report_n1.split('\n')

In [None]:
f1_score_line = lines[2]
f1_score_n1= float(f1_score_line.split()[3])
print("F1 Score:", f1_score_n1)

f1_score_line = lines[2]
precision_n1= float(f1_score_line.split()[1])
print("Precision:", precision_n1)

f1_score_line = lines[2]  
recall_n1= float(f1_score_line.split()[2])
print("Recall:", recall_n1)

auc_roc = roc_auc_score(y_test, pred)
print("AUC-ROC:", auc_roc)

In [None]:
# Plot the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, pred)
plt.plot(fpr, tpr, label='ROC curve (AUC = {:.2f})'.format(auc_roc))
plt.plot([0, 1], [0, 1], 'k--')  # Plot the random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

# Boosting for Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier

naive_bayes = BernoulliNB()
boosted_naive_bayes = AdaBoostClassifier(base_estimator=naive_bayes, n_estimators=200)

boosted_naive_bayes.fit(X_train, y_train)

pred = boosted_naive_bayes.predict(X_test)


In [None]:
train_accuracy_n = boosted_naive_bayes.score(X_train, y_train)
test_accuracy_n = boosted_naive_bayes.score(X_test, y_test)
print("Train Accuracy: ",train_accuracy_n)
print("Test Accuracy: ",test_accuracy_n)

accuracy_n = accuracy_score(y_test, pred)
print("Accuracy: ",accuracy_n)

In [None]:
print("Confusion matrix :")
print(confusion_matrix(y_test,pred))

print("Classification Report:")
report_n=classification_report(y_test,pred)
print(report_n)
lines = report_n.split('\n')

In [None]:
f1_score_line = lines[2]
f1_score_n= float(f1_score_line.split()[3])
print("F1 Score:", f1_score_n)

f1_score_line = lines[2]
precision_n= float(f1_score_line.split()[1])
print("Precision:", precision_n)

f1_score_line = lines[2]
recall_n= float(f1_score_line.split()[2])
print("Recall:", recall_n)

auc_roc = roc_auc_score(y_test, pred)
print("AUC-ROC:", auc_roc)

In [None]:
# Plot the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, pred)
plt.plot(fpr, tpr, label='ROC curve (AUC = {:.2f})'.format(auc_roc))
plt.plot([0, 1], [0, 1], 'k--')  # Plot the random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

# Decision Tree

In [None]:
# Import the DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(max_depth=10) 
dtree.fit(X_train, y_train)

In [None]:
# Store the tree's predictions in `pred`: the accuracy below and the
# confusion-matrix / report / AUC cells that follow all read `pred`, which
# otherwise still holds the boosted Naive Bayes predictions.
pred = dtree.predict(X_test)
y_pred = pred  # original variable name kept for compatibility

train_accuracy_d1 = dtree.score(X_train, y_train)
test_accuracy_d1 = dtree.score(X_test, y_test)
print("Train Accuracy: ",train_accuracy_d1)
print("Test Accuracy: ",test_accuracy_d1)

accuracy_d1 = accuracy_score(y_test, pred)
print("Accuracy: ",accuracy_d1)


In [None]:
# Import necessary libraries
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Plot the decision tree
plt.figure(figsize=(100,80))  # Adjust the figure size as needed
# The tree was fitted on X_train, whose columns are PCA components, so label
# the split features as components instead of reusing the original df column
# names (which no longer correspond to the model inputs).
pc_feature_names = [f"PC{i + 1}" for i in range(X_train.shape[1])]
plot_tree(dtree, feature_names=pc_feature_names, class_names=["Not Defected", "Defected"], filled=True, rounded=True)
plt.show()

In [None]:
print("Confusion matrix :")
print(confusion_matrix(y_test,pred))

print("Classification Report:")
report_d1=classification_report(y_test,pred)
print(report_d1)
lines = report_d1.split('\n')

In [None]:
f1_score_line = lines[2]
f1_score_d1= float(f1_score_line.split()[3])
print("F1 Score:", f1_score_d1)

f1_score_line = lines[2]
precision_d1= float(f1_score_line.split()[1])
print("Precision:", precision_d1)

f1_score_line = lines[2]
recall_d1= float(f1_score_line.split()[2])
print("Recall:", recall_d1)

auc_roc = roc_auc_score(y_test, pred)
print("AUC-ROC:", auc_roc)

# Gradient Boosting Ensembling for Decision Tree

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Create the Stochastic Gradient Boosting classifier
sgb = GradientBoostingClassifier(n_estimators=200)

# Train the classifier
sgb.fit(X_train, y_train)

# Make predictions
pred = sgb.predict(X_test)

In [None]:
train_accuracy_d = sgb.score(X_train, y_train)
test_accuracy_d = sgb.score(X_test, y_test)
print("Train Accuracy: ",train_accuracy_d)
print("Test Accuracy: ",test_accuracy_d)

accuracy_d = accuracy_score(y_test, pred)
print("Accuracy: ",accuracy_d)

In [None]:
print("Confusion matrix :")
print(confusion_matrix(y_test,pred))

print("Classification Report:")
report_d=classification_report(y_test,pred)
print(report_d)
lines = report_d.split('\n')

In [None]:
f1_score_line = lines[2]
f1_score_d= float(f1_score_line.split()[3])
print("F1 Score:", f1_score_d)

f1_score_line = lines[2]
precision_d= float(f1_score_line.split()[1])
print("Precision:", precision_d)

f1_score_line = lines[2] 
recall_d= float(f1_score_line.split()[2])
print("Recall:", recall_d)

auc_roc = roc_auc_score(y_test, pred)
print("AUC-ROC:", auc_roc)

In [None]:
# Plot the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, pred)
plt.plot(fpr, tpr, label='ROC curve (AUC = {:.2f})'.format(auc_roc))
plt.plot([0, 1], [0, 1], 'k--')  # Plot the random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

# Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
pred = rf_classifier.predict(X_test)

In [None]:
train_accuracy_r = rf_classifier.score(X_train, y_train)
test_accuracy_r = rf_classifier.score(X_test, y_test)
print("Train Accuracy: ",train_accuracy_r)
print("Test Accuracy: ",test_accuracy_r)

accuracy_r = accuracy_score(y_test, pred)
print("Accuracy: ",accuracy_r)

In [None]:
print("Confusion matrix :")
print(confusion_matrix(y_test,pred))

print("Classification Report:")
report_r=classification_report(y_test,pred)
print(report_r)
lines = report_r.split('\n')

In [None]:
f1_score_line = lines[2] 
f1_score_r= float(f1_score_line.split()[3])
print("F1 Score:", f1_score_r)

f1_score_line = lines[2]  
precision_r= float(f1_score_line.split()[1])
print("Precision:", precision_r)

f1_score_line = lines[2]  
recall_r= float(f1_score_line.split()[2])
print("Recall:", recall_r)

auc_roc = roc_auc_score(y_test, pred)
print("AUC-ROC:", auc_roc)

In [None]:
# Plot the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, pred)
plt.plot(fpr, tpr, label='ROC curve (AUC = {:.2f})'.format(auc_roc))
plt.plot([0, 1], [0, 1], 'k--')  # Plot the random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()




















# Support Vector Machine

In [None]:
from sklearn.svm import SVC
# Create an SVM classifier
svm_classifier = SVC(kernel='rbf')

# Train the classifier
svm_classifier.fit(X_train, y_train)

# Make predictions on the test set
pred = svm_classifier.predict(X_test)

In [None]:
train_accuracy_s = svm_classifier.score(X_train, y_train)
test_accuracy_s = svm_classifier.score(X_test, y_test)
print("Train Accuracy: ",train_accuracy_s)
print("Test Accuracy: ",test_accuracy_s)

accuracy_s = accuracy_score(y_test, pred)
print("Accuracy: ",accuracy_s)

In [None]:
print("Confusion matrix :")
print(confusion_matrix(y_test,pred))

print("Classification Report:")
report_s=classification_report(y_test,pred)
print(report_s)
lines = report_s.split('\n')

In [None]:
f1_score_line = lines[2] 
f1_score_s= float(f1_score_line.split()[3])
print("F1 Score:", f1_score_s)

f1_score_line = lines[2]  
precision_s= float(f1_score_line.split()[1])
print("Precision:", precision_s)

f1_score_line = lines[2]  
recall_s= float(f1_score_line.split()[2])
print("Recall:", recall_s)

auc_roc = roc_auc_score(y_test, pred)
print("AUC-ROC:", auc_roc)

In [None]:
# Plot the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, pred)
plt.plot(fpr, tpr, label='ROC curve (AUC = {:.2f})'.format(auc_roc))
plt.plot([0, 1], [0, 1], 'k--')  # Plot the random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()






# Fully Connected Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.initializers import HeUniform
import tensorflow.keras.backend as K
import tensorflow as tf

def custom_mlp_model(input_dim=100):
    """Build and compile the fully connected binary classifier.

    Architecture: Dense(512) -> ReLU -> BatchNorm -> Dropout(0.5), followed by
    three Dense(128) -> ReLU -> BatchNorm stages and a single sigmoid output
    unit. Compiled with the Adam optimizer, binary cross-entropy loss and
    accuracy as the tracked metric.

    Parameters
    ----------
    input_dim : int, default 100
        Number of input features per sample. The default matches the number
        of PCA components used earlier in the notebook.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The compiled, untrained model.
    """
    model = Sequential()

    # A fresh HeUniform instance is created for every layer: recent Keras
    # versions warn that one unseeded initializer instance called several
    # times returns identical values each time, which would give the equally
    # shaped hidden layers the same starting weights.
    model.add(Dense(512, input_shape=(input_dim,), kernel_initializer=HeUniform()))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    # Three identical hidden stages
    for _ in range(3):
        model.add(Dense(128, kernel_initializer=HeUniform()))
        model.add(Activation('relu'))
        model.add(BatchNormalization())

    model.add(Dense(1, activation='sigmoid'))  # Sigmoid activation for binary classification

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

model = custom_mlp_model()
model.summary()
history = model.fit(X_train, y_train, epochs=20, verbose=1, validation_data=(X_test, y_test))
test_loss_nn, test_accuracy_nn = model.evaluate(X_test, y_test)
train_accuracy_nn = max(history.history['accuracy'])

print("Train Accuracy of neural network: ", train_accuracy_nn)
print("Test Accuracy of neural network: ", test_accuracy_nn)


predictions = model.predict(X_test)
rounded_predictions = tf.where(predictions >= 0.5, 1.0, 0.0)
y_pred=rounded_predictions


In [None]:
from sklearn.metrics import accuracy_score

rounded_predictions = np.round(predictions, 2)
print(np.unique(rounded_predictions))

print(np.unique(y_pred))
print(np.unique(y_test))
train_accuracy = history.history['accuracy']
test_accuracy = history.history['val_accuracy']

train_loss = history.history['loss']
test_loss = history.history['val_loss']

# Print the final accuracies
final_train_accuracy = train_accuracy[-1]
final_test_accuracy = test_accuracy[-1]

print(f"Final Train Accuracy: {final_train_accuracy*100:.2f}%")
print(f"Final Test Accuracy: {final_test_accuracy*100:.2f}%")
accuracy_nn = accuracy_score(y_test, y_pred)

print("Accuracy", accuracy_nn)


In [None]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

print("Classification Report:")
report_nn = classification_report(y_test,y_pred)
print(report_nn)


In [None]:
lines = report_nn.split('\n')
f1_score_line = lines[2] 
f1_score_nn= float(f1_score_line.split()[3])
print("F1 Score:", f1_score_nn)

f1_score_line = lines[2]  
precision_nn= float(f1_score_line.split()[1])
print("Precision:", precision_nn)

f1_score_line = lines[2]  
recall_nn= float(f1_score_line.split()[2])
print("Recall:", recall_nn)

auc_roc = roc_auc_score(y_test, y_pred)
print("AUC-ROC:", auc_roc)

In [None]:
# Plot the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, label='ROC curve (AUC = {:.2f})'.format(auc_roc))
plt.plot([0, 1], [0, 1], 'k--')  # Plot the random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()






# Results

In [None]:
classes = ['KNN', 'Logistic Regression', 'Naive Bayes', 'Decision Tree', 'Random Forest','SVM','Neural Network']

values = [train_accuracy_k , train_accuracy_l, train_accuracy_n, train_accuracy_d, train_accuracy_r,train_accuracy_s,train_accuracy_nn]

# Data for the columns
labels = ['KNN', 'Logistic Regression', 'Naive Bayes','Decision Tree', 'Random Forest', 'SVM','Neural Network']
values1 = [train_accuracy_k , train_accuracy_l, train_accuracy_n, train_accuracy_d, train_accuracy_r, train_accuracy_s,train_accuracy_nn]
values2 = [test_accuracy_k , test_accuracy_l, test_accuracy_n, test_accuracy_d, test_accuracy_r, test_accuracy_s,test_accuracy_nn]
values3 = [accuracy_k , accuracy_l, accuracy_n, accuracy_d, accuracy_r, accuracy_s,accuracy_nn]

# Positions of the bars on the x-axis
x = np.arange(len(labels))

# Width of each bar
width = 0.15

# Plotting the bar graph
#print(values1)
plt.figure(figsize=(12, 8))
plt.bar(x - width, values1, width, label='Train Accuracy')
plt.bar(x, values2, width, label='Test Accuracy')
plt.bar(x + width, values3, width, label='Accuracy')

# Adding labels and title
plt.xlabel('Models')
plt.ylabel('accuracy')
plt.title('Bar Graph with accuracies')

# Setting the x-axis tick labels
plt.xticks(x, labels)

# Adding a legend
plt.legend(loc='lower right')

# Displaying the graph
plt.show()
classes = ['KNN', 'Logistic Regression', 'Naive Bayes', 'Decision Tree', 'Random Forest', 'SVM','Neural Network']
values = [f1_score_k,f1_score_l ,f1_score_n ,f1_score_d, f1_score_r, f1_score_s,f1_score_nn]
plt.figure(figsize=(12, 8))
# Plotting the bar graph
plt.bar(classes, values)

# Setting the y-axis limits to range from 0 to 1
plt.ylim(0, 1)

# Adding labels and title

plt.xlabel('Models')
plt.ylabel('f1_score')
plt.title('Bar Graph of f1-Score')
plt.show()


In [None]:
from prettytable import PrettyTable

table = PrettyTable()
# Define the columns
table.field_names = ["Models", "Precision" , "Recall" , "F1-Score" , "Train Accuracy", "Test Accuracy", "Accuracy"]
# Add rows to the table
table.add_row(["KNN", precision_k , recall_k , f1_score_k , train_accuracy_k , test_accuracy_k , accuracy_k ])
table.add_row(["Logistic Regression", precision_l1 , recall_l1 , f1_score_l1 , train_accuracy_l1 , test_accuracy_l1 , accuracy_l1 ])
table.add_row(["LR with boosting", precision_l , recall_l , f1_score_l , train_accuracy_l , test_accuracy_l , accuracy_l ])
table.add_row(["Naive Bayes", precision_n1 , recall_n1 , f1_score_n1 , train_accuracy_n1 , test_accuracy_n1 , accuracy_n1 ])
table.add_row(["NB with boosting", precision_n , recall_n , f1_score_n , train_accuracy_n , test_accuracy_n , accuracy_n ])
table.add_row(["Decision Tree", precision_d1 , recall_d1 , f1_score_d1 , train_accuracy_d1 , test_accuracy_d1 , accuracy_d1 ])
table.add_row(["DT with boosting", precision_d , recall_d , f1_score_d , train_accuracy_d , test_accuracy_d , accuracy_d ])
table.add_row(["Random Forest", precision_r , recall_r , f1_score_r , train_accuracy_r , test_accuracy_r , accuracy_r ])
table.add_row(["SVM", precision_s , recall_s , f1_score_s , train_accuracy_s , test_accuracy_s , accuracy_s ])
table.add_row(["Neural Network", precision_nn , recall_nn , f1_score_nn , train_accuracy_nn , test_accuracy_nn , accuracy_nn ])
print(table)


In [None]:
print("#################################################Classification Report###########################################")
# Create a PrettyTable instance
table1 = PrettyTable()

# Define columns and set column widths
table1.field_names = ["kNN", "Logistic Regression"]
table1._max_width = {"Column 1": 90, "Column 2": 90}  # Adjust width as needed
table1.add_row([report_k, report_l1])
print(table1)
table2 = PrettyTable()
table2.field_names = ['Logistic Regression with boosting','Naive Bayes']
table2._max_width = {"Column 1": 90, "Column 2": 90}  # Adjust width as needed
table2.add_row([report_l, report_n1])
print(table2)
table3 = PrettyTable()
table3.field_names = ['Naive Bayes with boosting','Decision Tree']
table3._max_width = {"Column 1": 90, "Column 2": 90}  # Adjust width as needed
table3.add_row([report_n, report_d1])
print(table3)
table4 = PrettyTable()
table4.field_names = ['Decision Tree with boosting', 'Random Forest']
table4._max_width = {"Column 1": 90, "Column 2": 90}  # Adjust width as needed
table4.add_row([report_d, report_r])
print(table4)
table5 = PrettyTable()
table5.field_names = ['SVM', 'Neural Network']
table5._max_width = {"Column 1": 90, "Column 2": 90}  # Adjust width as needed
table5.add_row([report_s,report_nn])
print(table5)



# Inferencing

In [None]:
#data for inferencing
X_i=removed_data.drop('Pass/Fail', axis=1)
y_i=removed_data['Pass/Fail']
y_i = y_i.values

# Predictions from trained models
inference_k = knn.predict(X_i)
inference_l = boosted_classifier.predict(X_i)
inference_n = boosted_naive_bayes.predict(X_i)
inference_d = sgb.predict(X_i)
inference_r = rf_classifier.predict(X_i)
inference_s = svm_classifier.predict(X_i)
inference_nn = model.predict(X_i)


threshold = 0.5
inference_nn_binary = [1 if prediction > threshold else 0 for prediction in inference_nn]

# Convert the list to a NumPy array and print without commas and spaces
inference_nn_binary = np.array(inference_nn_binary)

#comparing True value and predicted value
true_predictions_nn = (inference_nn_binary == y_i)
true_predictions_k = (inference_k == y_i)
true_predictions_l = (inference_l == y_i)
true_predictions_n = (inference_n == y_i)
true_predictions_d = (inference_d == y_i)
true_predictions_r = (inference_r == y_i)
true_predictions_s = (inference_s == y_i)

# Calculate and display the number of true and false predictions.
# The denominator is the actual number of held-out rows rather than a
# hard-coded constant, so the percentages stay correct whenever the hold-out
# size changes.
num_total_predictions = len(y_i)

num_true_predictions_k = sum(true_predictions_k)
num_false_predictions_k = len(inference_k) - num_true_predictions_k
per_k = (num_true_predictions_k / num_total_predictions) * 100

num_true_predictions_l = sum(true_predictions_l)
num_false_predictions_l = len(inference_l) - num_true_predictions_l
per_l = (num_true_predictions_l / num_total_predictions) * 100

num_true_predictions_n = sum(true_predictions_n)
num_false_predictions_n = len(inference_n) - num_true_predictions_n
per_n = (num_true_predictions_n / num_total_predictions) * 100

num_true_predictions_d = sum(true_predictions_d)
num_false_predictions_d = len(inference_d) - num_true_predictions_d
per_d = (num_true_predictions_d / num_total_predictions) * 100

num_true_predictions_r = sum(true_predictions_r)
num_false_predictions_r = len(inference_r) - num_true_predictions_r
per_r = (num_true_predictions_r / num_total_predictions) * 100

num_true_predictions_s = sum(true_predictions_s)
num_false_predictions_s = len(inference_s) - num_true_predictions_s
per_s = (num_true_predictions_s / num_total_predictions) * 100

num_true_predictions_nn = sum(true_predictions_nn)
num_false_predictions_nn = len(inference_nn) - num_true_predictions_nn
per_nn = (num_true_predictions_nn / num_total_predictions) * 100

# Display output (one row per model, in the order of row_indices below)
print(f"                    Inferencing for {num_total_predictions} Data points")
print("-------------------------------------------------------------------------")
data = {'Num of True pred': [num_true_predictions_k, num_true_predictions_l, num_true_predictions_n, num_true_predictions_d, num_true_predictions_r, num_true_predictions_s, num_true_predictions_nn],
        'Num of False pred':[num_false_predictions_k, num_false_predictions_l, num_false_predictions_n, num_false_predictions_d, num_false_predictions_r,  num_false_predictions_s, num_false_predictions_nn],
        '% Correct': [per_k, per_l, per_n, per_d, per_r, per_s,per_nn]}
row_indices = ['KNN', 'Logistic Regression', 'Naive Bayes','Decision Tree', 'Random Forest' , 'SVM' , 'Neural Network']
df_final= pd.DataFrame(data, index=row_indices)

print(df_final)


In [None]:
print("                                             KNN")
print("Predicted: ",inference_k)
print("Actual   : ",y_i)
print("                                     Logistic Regression")
print("Predicted: ",inference_l)
print("Actual   : ",y_i)
print("                                         Naive Bayes")
print("Predicted: ",inference_n)
print("Actual   : ",y_i)
print("                                        Decision Tree")
print("Predicted: ",inference_d)
print("Actual   : ",y_i)
print("                                        Random Forest")
print("Predicted: ",inference_r)
print("Actual   : ",y_i)
print("                                             SVM")
print("Predicted: ",inference_s)
print("Actual   : ",y_i)
print("                                     Neural Network")
print("Predicted: ",inference_nn_binary)
print("Actual   : ",y_i)

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Function to plot ROC curve for a model
def plot_roc_curve(model_name, y_true, y_pred):
    """Draw one model's ROC curve on the current matplotlib figure.

    Parameters
    ----------
    model_name : str
        Name shown in the legend entry, followed by the AUC value.
    y_true : array-like
        True binary labels.
    y_pred : array-like
        Scores or predictions passed to ``roc_curve`` for the same samples.

    Returns
    -------
    None
        The curve is added to the active plot; nothing is shown or returned.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    legend_entry = f'{model_name} (AUC = {area_under_curve:.2f})'
    plt.plot(false_pos_rate, true_pos_rate, label=legend_entry)

# Plot ROC curves for all models, using continuous scores (class-1
# probability, or the decision function for the SVM) rather than hard labels
plot_roc_curve("KNN", y_test, knn.predict_proba(X_test)[:, 1])
plot_roc_curve("Logistic Regression", y_test, classifier.predict_proba(X_test)[:, 1])
plot_roc_curve("Boosted Logistic Regression", y_test, boosted_classifier.predict_proba(X_test)[:, 1])
plot_roc_curve("Naive Bayes", y_test, naive_bayes_classifier.predict_proba(X_test)[:, 1])
plot_roc_curve("Boosted Naive Bayes", y_test, boosted_naive_bayes.predict_proba(X_test)[:, 1])
plot_roc_curve("Decision Tree", y_test, dtree.predict_proba(X_test)[:, 1])
# sgb is the gradient-boosted tree model; boosted_classifier is the boosted
# logistic regression and would only duplicate that model's curve here.
plot_roc_curve("Boosted Decision Tree", y_test, sgb.predict_proba(X_test)[:, 1])
plot_roc_curve("Random Forest", y_test, rf_classifier.predict_proba(X_test)[:, 1])
plot_roc_curve("SVM", y_test, svm_classifier.decision_function(X_test))
plot_roc_curve("Neural Network", y_test, model.predict(X_test).ravel())

# Customize the plot
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()
