#### 1. Import Libraries

In [1]:

import pandas as pd
from scipy.io import arff
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import StandardScaler
import tensorflow as tf


#### 2. Load the Dataset and Pre-process it

In [None]:

data, meta = arff.loadarff('dataset/KC1.arff')
df = pd.DataFrame(data)

# Convert byte columns to strings for easier manipulation
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].apply(lambda x: x.decode('utf-8'))

In [3]:

# Display the first few rows of the dataset
print("Dataset Preview:")
print(df.head())


Dataset Preview:
   LOC_BLANK  BRANCH_COUNT  LOC_CODE_AND_COMMENT  LOC_COMMENTS  \
0        6.0          15.0                   1.0           3.0   
1        5.0          17.0                   0.0           7.0   
2       12.0          31.0                   0.0           0.0   
3        1.0           1.0                   0.0           0.0   
4        0.0           1.0                   0.0           0.0   

   CYCLOMATIC_COMPLEXITY  DESIGN_COMPLEXITY  ESSENTIAL_COMPLEXITY  \
0                    8.0                8.0                   1.0   
1                    9.0                8.0                   1.0   
2                   16.0               13.0                  10.0   
3                    1.0                1.0                   1.0   
4                    1.0                1.0                   1.0   

   LOC_EXECUTABLE  HALSTEAD_CONTENT  HALSTEAD_DIFFICULTY  ...  \
0            45.0             23.87                27.06  ...   
1            82.0             69.72      

In [4]:

# Check for class distribution (defects vs no defects)
print("\nClass Distribution:")
print(df['Defective'].value_counts())



Class Distribution:
Defective
N    868
Y    294
Name: count, dtype: int64


In [5]:

# Basic statistics on numeric columns
print("\nBasic Statistics:")
print(df.describe())



Basic Statistics:
         LOC_BLANK  BRANCH_COUNT  LOC_CODE_AND_COMMENT  LOC_COMMENTS  \
count  1162.000000   1162.000000           1162.000000   1162.000000   
mean      2.987091      7.245267              0.207401      1.648021   
std       4.751971      9.575895              0.910753      4.000758   
min       0.000000      1.000000              0.000000      0.000000   
25%       0.000000      1.000000              0.000000      0.000000   
50%       1.000000      3.000000              0.000000      0.000000   
75%       4.000000      9.000000              0.000000      2.000000   
max      58.000000     89.000000             12.000000     44.000000   

       CYCLOMATIC_COMPLEXITY  DESIGN_COMPLEXITY  ESSENTIAL_COMPLEXITY  \
count            1162.000000        1162.000000           1162.000000   
mean                4.130809           3.628227              2.167814   
std                 4.792339           4.152912              2.818708   
min                 1.000000           1

In [6]:
# Check for missing values
print("\nMissing Values in Each Column:")
print(df.isnull().sum())


Missing Values in Each Column:
LOC_BLANK                0
BRANCH_COUNT             0
LOC_CODE_AND_COMMENT     0
LOC_COMMENTS             0
CYCLOMATIC_COMPLEXITY    0
DESIGN_COMPLEXITY        0
ESSENTIAL_COMPLEXITY     0
LOC_EXECUTABLE           0
HALSTEAD_CONTENT         0
HALSTEAD_DIFFICULTY      0
HALSTEAD_EFFORT          0
HALSTEAD_ERROR_EST       0
HALSTEAD_LENGTH          0
HALSTEAD_LEVEL           0
HALSTEAD_PROG_TIME       0
HALSTEAD_VOLUME          0
NUM_OPERANDS             0
NUM_OPERATORS            0
NUM_UNIQUE_OPERANDS      0
NUM_UNIQUE_OPERATORS     0
LOC_TOTAL                0
Defective                0
dtype: int64


In [7]:
# Encode 'Defective' column to binary (e.g., 'defects' = 1, 'no defects' = 0)
df['Defective'] = df['Defective'].apply(lambda x: 1 if x == 'Y' else 0)

In [8]:
# Split data into features (X) and target (y)
X = df.drop(columns=['Defective', 'BRANCH_COUNT', 'LOC_CODE_AND_COMMENT', 'DESIGN_COMPLEXITY', 'ESSENTIAL_COMPLEXITY', 'HALSTEAD_CONTENT', 'HALSTEAD_LEVEL' ])
y = df['Defective']

In [9]:

# Display independent variables
X

Unnamed: 0,LOC_BLANK,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,LOC_EXECUTABLE,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,LOC_TOTAL
0,6.0,3.0,8.0,45.0,27.06,17479.25,0.22,124.0,971.07,645.97,46.0,78.0,17.0,20.0,57.0
1,5.0,7.0,9.0,82.0,22.29,34646.70,0.52,274.0,1924.82,1554.24,107.0,167.0,36.0,15.0,96.0
2,12.0,0.0,16.0,95.0,33.33,66290.05,0.66,334.0,3682.78,1988.70,140.0,194.0,42.0,20.0,112.0
3,1.0,0.0,1.0,12.0,2.50,558.70,0.07,47.0,31.04,223.48,22.0,25.0,22.0,5.0,15.0
4,0.0,0.0,1.0,3.0,1.50,37.90,0.01,9.0,2.11,25.27,4.0,5.0,4.0,3.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1157,3.0,0.0,2.0,21.0,9.26,3363.54,0.12,74.0,186.86,363.11,32.0,42.0,19.0,11.0,28.0
1158,4.0,3.0,7.0,33.0,10.23,4262.28,0.14,80.0,236.79,416.76,30.0,50.0,22.0,15.0,44.0
1159,2.0,2.0,9.0,50.0,20.70,21261.63,0.34,193.0,1181.20,1027.13,69.0,124.0,25.0,15.0,58.0
1160,19.0,3.0,14.0,94.0,26.55,55057.45,0.69,328.0,3058.75,2073.59,140.0,188.0,58.0,22.0,118.0


In [10]:
# Display dependent variable
y

0       0
1       1
2       1
3       0
4       0
       ..
1157    0
1158    0
1159    1
1160    0
1161    0
Name: Defective, Length: 1162, dtype: int64

In [11]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [41]:
X_train.shape

(929, 15)

In [42]:
X_test.shape

(233, 15)

In [40]:
X.columns

Index(['LOC_BLANK', 'LOC_COMMENTS', 'CYCLOMATIC_COMPLEXITY', 'LOC_EXECUTABLE',
       'HALSTEAD_DIFFICULTY', 'HALSTEAD_EFFORT', 'HALSTEAD_ERROR_EST',
       'HALSTEAD_LENGTH', 'HALSTEAD_PROG_TIME', 'HALSTEAD_VOLUME',
       'NUM_OPERANDS', 'NUM_OPERATORS', 'NUM_UNIQUE_OPERANDS',
       'NUM_UNIQUE_OPERATORS', 'LOC_TOTAL'],
      dtype='object')

#### 3. Model Training

##### 3.1 Random Forest Classifier

In [12]:
# Train a Random Forest Classifier
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)

In [13]:
# Make predictions
y_pred = model_rf.predict(X_test)

In [14]:
# Evaluate the model
print("\nModel Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Model Performance:
Accuracy: 0.7765
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       265
           1       0.55      0.37      0.44        84

    accuracy                           0.78       349
   macro avg       0.69      0.64      0.65       349
weighted avg       0.76      0.78      0.76       349

Confusion Matrix:
[[240  25]
 [ 53  31]]


##### 3.2 Gaussian Classifier

In [15]:
# Instantiate and Train the Gaussian Naive Bayes model
model_nb = GaussianNB()
model_nb.fit(X_train, y_train)

In [16]:
# Make predictions
y_pred_nb = model_nb.predict(X_test)

In [17]:
# Evaluate the model
print("\nNaive Bayes Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_nb):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_nb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))


Naive Bayes Model Performance:
Accuracy: 0.7421
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.89      0.84       265
           1       0.44      0.29      0.35        84

    accuracy                           0.74       349
   macro avg       0.62      0.59      0.59       349
weighted avg       0.71      0.74      0.72       349

Confusion Matrix:
[[235  30]
 [ 60  24]]


##### 3.3 Logistic Regressor

In [18]:
# Instantiate and Train the Logistic Regression model
model_lr = LogisticRegression(solver = 'lbfgs',max_iter=10000)
model_lr.fit(X_train, y_train)

In [19]:
# Make predictions
y_pred_log_reg = model_lr.predict(X_test)

In [20]:
# Evaluate the model
print("\nLogistic Regression Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log_reg):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))


Logistic Regression Model Performance:
Accuracy: 0.7794
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.95      0.87       265
           1       0.61      0.24      0.34        84

    accuracy                           0.78       349
   macro avg       0.70      0.59      0.60       349
weighted avg       0.75      0.78      0.74       349

Confusion Matrix:
[[252  13]
 [ 64  20]]


##### 3.4 XGBClassifier

In [21]:
# Instantiate and Train the XGBoost model
model_xgb = XGBClassifier(eval_metric='logloss')
model_xgb.fit(X_train, y_train)

In [22]:
# Make predictions
y_pred_xgb = model_xgb.predict(X_test)

In [23]:
# Evaluate the model
print("\nXGBoost Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_xgb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))


XGBoost Model Performance:
Accuracy: 0.7851
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       265
           1       0.57      0.44      0.50        84

    accuracy                           0.79       349
   macro avg       0.70      0.67      0.68       349
weighted avg       0.77      0.79      0.78       349

Confusion Matrix:
[[237  28]
 [ 47  37]]


##### Try balancing the dataset using SMOTE and Train it on RF

In [24]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Train RandomForest on balanced data
model_rf_smote = RandomForestClassifier(random_state=42)
model_rf_smote.fit(X_res, y_res)

# Predictions
y_pred_rf_smote = model_rf_smote.predict(X_test)

# Evaluation
print("Random Forest with SMOTE Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf_smote):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_rf_smote))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf_smote))


Random Forest with SMOTE Performance:
Accuracy: 0.7880
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.87      0.86       265
           1       0.56      0.54      0.55        84

    accuracy                           0.79       349
   macro avg       0.71      0.70      0.71       349
weighted avg       0.78      0.79      0.79       349

Confusion Matrix:
[[230  35]
 [ 39  45]]


##### 3.5 Combining RF, LR, XGB, SVC and NB 

In [25]:
# Define base learners
base_learners = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('lr', LogisticRegression(solver='lbfgs',max_iter=10000)),
    ('xgb', XGBClassifier(eval_metric='logloss')),
    ('svc', SVC(kernel='rbf', probability=True))
]

# Stacking Classifier
model_stacked = StackingClassifier(estimators=base_learners, final_estimator=GaussianNB())

# Train and predict
model_stacked.fit(X_train, y_train)
y_pred_stack = model_stacked.predict(X_test)

# Evaluate
print("Stacking Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_stack):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_stack))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_stack))


Stacking Model Performance:
Accuracy: 0.7851
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       265
           1       0.57      0.42      0.48        84

    accuracy                           0.79       349
   macro avg       0.70      0.66      0.67       349
weighted avg       0.77      0.79      0.77       349

Confusion Matrix:
[[239  26]
 [ 49  35]]


##### Try applying SMOTE for Stacked Model 

In [26]:
# Step 1: Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Step 2: Define the base learners for stacking
base_learners = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('lr', LogisticRegression(solver='lbfgs',max_iter=10000)),
    ('xgb', XGBClassifier(eval_metric='logloss')),
    ('svc', SVC(kernel='rbf', probability=True))
]

# Step 3: Define the stacking classifier with SMOTE-applied training data
model_stacked_smote = StackingClassifier(estimators=base_learners, final_estimator=GaussianNB())

# Train the stacking model on SMOTE-balanced data
model_stacked_smote.fit(X_res, y_res)

# Step 4: Predict and evaluate on the original test data
y_pred_stack_smote = model_stacked_smote.predict(X_test)

# Step 5: Evaluate the performance
print("Stacking Model with SMOTE Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_stack_smote):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_stack_smote))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_stack_smote))


Stacking Model with SMOTE Performance:
Accuracy: 0.7736
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       265
           1       0.53      0.52      0.53        84

    accuracy                           0.77       349
   macro avg       0.69      0.69      0.69       349
weighted avg       0.77      0.77      0.77       349

Confusion Matrix:
[[226  39]
 [ 40  44]]


##### Save the Best ML Model

In [None]:
with open('../models/ml_model.pkl', 'wb') as model_file:
    pickle.dump(model_rf_smote, model_file)
print("Model saved successfully!")

Model saved successfully!


#### 3.6 CNN

In [28]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the neural network model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, input_dim=X_train.shape[1], activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, 
                    epochs=200, 
                    batch_size=32, 
                    validation_split=0.2, 
                    verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.5101 - loss: 0.7137 - val_accuracy: 0.7634 - val_loss: 0.5300
Epoch 2/200
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7200 - loss: 0.5738 - val_accuracy: 0.7903 - val_loss: 0.5024
Epoch 3/200
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7562 - loss: 0.5480 - val_accuracy: 0.7957 - val_loss: 0.4983
Epoch 4/200
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7839 - loss: 0.5210 - val_accuracy: 0.7796 - val_loss: 0.4979
Epoch 5/200
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7575 - loss: 0.5625 - val_accuracy: 0.7849 - val_loss: 0.4954
Epoch 6/200
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7432 - loss: 0.5661 - val_accuracy: 0.7796 - val_loss: 0.4941
Epoch 7/200
[1m24/24[0m [32m━━

##### Save the CNN Model

In [None]:
with open('../models/dl_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
print("Model saved successfully!")

Model saved successfully!
