## Model training

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings


In [5]:
pip install catboost

Defaulting to user installation because normal site-packages is not writeable
Collecting catboost
  Downloading catboost-1.2.8-cp39-cp39-win_amd64.whl (102.5 MB)
     -------------------------------------- 102.5/102.5 MB 2.3 MB/s eta 0:00:00
Collecting graphviz
  Downloading graphviz-0.20.3-py3-none-any.whl (47 kB)
     ---------------------------------------- 47.1/47.1 kB 2.3 MB/s eta 0:00:00
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.2.8 graphviz-0.20.3
Note: you may need to restart the kernel to use updated packages.


In [41]:
## Import the CSV Data as Pandas Dataframe

wafers  = pd.read_csv("wafer_data.csv")

In [42]:
wafers.head()

Unnamed: 0.1,Unnamed: 0,Sensor-1,Sensor-2,Sensor-3,Sensor-4,Sensor-5,Sensor-6,Sensor-7,Sensor-8,Sensor-9,...,Sensor-582,Sensor-583,Sensor-584,Sensor-585,Sensor-586,Sensor-587,Sensor-588,Sensor-589,Sensor-590,Good/Bad
0,Wafer-801,2968.33,2476.58,2216.7333,1748.0885,1.1127,100.0,97.5822,0.1242,1.53,...,,0.5004,0.012,0.0033,2.4069,0.0545,0.0184,0.0055,33.7876,-1
1,Wafer-802,2961.04,2506.43,2170.0666,1364.5157,1.5447,100.0,96.77,0.123,1.3953,...,,0.4994,0.0115,0.0031,2.302,0.0545,0.0184,0.0055,33.7876,1
2,Wafer-803,3072.03,2500.68,2205.7445,1363.1048,1.0518,100.0,101.8644,0.122,1.3896,...,,0.4987,0.0118,0.0036,2.3719,0.0545,0.0184,0.0055,33.7876,-1
3,Wafer-804,3021.83,2419.83,2205.7445,1363.1048,1.0518,100.0,101.8644,0.122,1.4108,...,,0.4934,0.0123,0.004,2.4923,0.0545,0.0184,0.0055,33.7876,-1
4,Wafer-805,3006.95,2435.34,2189.8111,1084.6502,1.1993,100.0,104.8856,0.1234,1.5094,...,,0.4987,0.0145,0.0041,2.8991,0.0545,0.0184,0.0055,33.7876,-1


In [43]:
X = wafers.drop(columns=['Unnamed: 0', 'Good/Bad'])

# The target variable (y) is the 'Good/Bad' column
y = wafers['Good/Bad']

# Display the first few rows of X and y to verify
print("First 5 rows of X (features):")
print(X.head())
print("\nFirst 5 rows of y (target):")
print(y.head())

First 5 rows of X (features):
   Sensor-1  Sensor-2   Sensor-3   Sensor-4  Sensor-5  Sensor-6  Sensor-7  \
0   2968.33   2476.58  2216.7333  1748.0885    1.1127     100.0   97.5822   
1   2961.04   2506.43  2170.0666  1364.5157    1.5447     100.0   96.7700   
2   3072.03   2500.68  2205.7445  1363.1048    1.0518     100.0  101.8644   
3   3021.83   2419.83  2205.7445  1363.1048    1.0518     100.0  101.8644   
4   3006.95   2435.34  2189.8111  1084.6502    1.1993     100.0  104.8856   

   Sensor-8  Sensor-9  Sensor-10  ...  Sensor-581  Sensor-582  Sensor-583  \
0    0.1242    1.5300    -0.0279  ...         NaN         NaN      0.5004   
1    0.1230    1.3953     0.0084  ...         NaN         NaN      0.4994   
2    0.1220    1.3896     0.0138  ...         NaN         NaN      0.4987   
3    0.1220    1.4108    -0.0046  ...         NaN         NaN      0.4934   
4    0.1234    1.5094    -0.0046  ...         NaN         NaN      0.4987   

   Sensor-584  Sensor-585  Sensor-586  Senso

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [45]:
# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [49]:
# --- Preprocessing Steps to Handle NaN, Infinity, or large values ---

# 1. Replace infinite values with NaN for consistent imputation
# This step is important because SimpleImputer only handles NaNs by default.
# It's good practice to check for and handle infinities explicitly.
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# 2. Impute missing values (NaNs) using the mean strategy
# You can choose 'mean', 'median', or 'most_frequent' based on your data distribution
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the training data and transform both training and test data
# It's crucial to fit the imputer ONLY on the training data to prevent data leakage.
X_imputed = imputer.fit_transform(X)

# Convert the imputed NumPy array back to a DataFrame, preserving column names
X = pd.DataFrame(X_imputed, columns=X.columns)

# Verify if there are any remaining NaNs or infinities (should be none after imputation)
print(f"Number of NaNs after imputation: {X.isnull().sum().sum()}")
print(f"Number of infinite values after imputation: {np.isinf(X).sum().sum()}")

# --- End of Preprocessing Steps ---


# Perform the train-test split on the preprocessed data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nShape of X_train after preprocessing: {X_train.shape}")
print(f"Shape of X_test after preprocessing: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Number of NaNs after imputation: 0
Number of infinite values after imputation: 0

Shape of X_train after preprocessing: (80, 590)
Shape of X_test after preprocessing: (20, 590)
Shape of y_train: (80,)
Shape of y_test: (20,)


In [51]:
# Initialize and train the Logistic Regression model
lr = LogisticRegression(max_iter=1000) # Increased max_iter for convergence
lr.fit(X_train, y_train)

# Make predictions on the test set
y_pred_lr = lr.predict(X_test)
# Evaluate the model
print("\nLogistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))


Logistic Regression Accuracy: 1.0


In [52]:
svm = SVC(kernel='rbf', probability=True)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))


SVM Accuracy: 1.0


In [53]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Accuracy: 1.0


In [54]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))


Gradient Boosting Accuracy: 1.0


In [56]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))


KNN Accuracy: 1.0


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [57]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.95


In [58]:
print(confusion_matrix(y_test, y_pred_rf))  # Replace with best model
print(classification_report(y_test, y_pred_rf))


[[20]]
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        20

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



In [59]:
scores = cross_val_score(rf, X, y, cv=5)
print("Random Forest Cross-validation Accuracy:", scores.mean())


Random Forest Cross-validation Accuracy: 0.9400000000000001


In [60]:
import joblib
joblib.dump(rf, 'best_wafer_model.pkl')


['best_wafer_model.pkl']