# Day 2: model training

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

In [10]:
#loading data
df = pd.read_csv('./data/data.csv')
Y = df['fail']
X = df.drop(columns=['fail'])
Y.value_counts() #checking for imbalanced data

fail
0    551
1    393
Name: count, dtype: int64

In [14]:
#Class for training model
class Model():
    def __init__(self, X, Y, scale=True):
        self.X = X
        self.Y = Y
        self.scale = scale
        self.scaler = StandardScaler()
        self.model = None
    def split_and_scale(self):
        if self.scale:
            scaler = self.scaler

            # Initial train+temp split
            X_train, X_temp, Y_train, y_temp = train_test_split(
                self.X, self.Y, test_size=0.3, stratify=self.Y, random_state=42
            )

            # Now splitting temp into validation and test (15% each)
            X_val, X_test, Y_val, Y_test = train_test_split(
                X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
            )
            # Scaling the data
            scaler.fit(X_train)
            X_train_scaled = scaler.transform(X_train)
            X_val_scaled = scaler.transform(X_val)
            X_test_scaled = scaler.transform(X_test)

            return {
                'X_train': X_train_scaled,
                'X_val': X_val_scaled,
                'X_test': X_test_scaled,
                'Y_test': Y_test,
                'Y_val': Y_val,
                'Y_train': Y_train
            }
        else:
            X_train, X_temp, Y_train, y_temp = train_test_split(
                self.X, self.Y, test_size=0.3, stratify=self.Y, random_state=42
            )

            X_val, X_test, Y_val, Y_test = train_test_split(
                X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
            )
            return {
                'X_train': X_train,
                'X_val': X_val,
                'X_test': X_test,
                'Y_test': Y_test,
                'Y_val': Y_val,
                'Y_train': Y_train
            }
    def train_and_evaluate(self, X_train, Y_train, X_val, Y_val, X_test, Y_test, model):
        model.fit(X_train, Y_train)
        Y_pred_val = model.predict(X_val)

        #confusion matrix
        cm = confusion_matrix(Y_val, Y_pred_val)

        #printing result
        print("Accuracy:", accuracy_score(Y_val, Y_pred_val))
        print("Precision:", precision_score(Y_val, Y_pred_val))
        print("Recall:", recall_score(Y_val, Y_pred_val))
        print("F1 Score:", f1_score(Y_val, Y_pred_val))


        return {
            "val_Accuracy": accuracy_score(Y_val, Y_pred_val),
            "val_Precision": precision_score(Y_val, Y_pred_val, zero_division=0),
            "val_Recall:": recall_score(Y_val, Y_pred_val),
            "val_F1:": f1_score(Y_val, Y_pred_val),
            'Model': model,
            'X_test': X_test,
            'Y_test': Y_test,
            'Confusion matrix': cm,
            'model_name': model.__class__.__name__,  # name as string
            'model_obj': model,                      # actual model object
            'params': model.get_params(),
            'scaler': self.scaler
        }
    def run(self, model):
        data = self.split_and_scale()
        return self.train_and_evaluate(
            data['X_train'], data['Y_train'],
            data['X_val'] , data['Y_val'],
            data['X_test'], data['Y_test'], 
            model
        )


In [15]:
xgb = XGBClassifier(
        n_jobs=-1,
        eval_metric='logloss',
        random_state=42
    )

model = Model(X, Y, False)
results = model.run(xgb)
X_test = results['X_test']
Y_test = results['Y_test']
model = results['Model']
Y_pred_test = model.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred_test)
test_precision = precision_score(Y_test, Y_pred_test)
test_recall = recall_score(Y_test, Y_pred_test)
test_f1score = f1_score(Y_test, Y_pred_test)

Accuracy: 0.9507042253521126
Precision: 0.9482758620689655
Recall: 0.9322033898305084
F1 Score: 0.9401709401709402


In [16]:
#saving model to a csv file
def log_results(results, hyperparameter, filepath="model_log.csv"):
    
    log = {
        "Model": results['model_name'],
        'Scaler': results['scaler'],
        'Accuracy': test_accuracy,
        'Precision': test_precision,
        'Recall': test_recall,
        'F1_Score': test_f1score,
        'Hyperparameters': hyperparameter
        }

    # Convert params dict to string
    log_copy = log.copy()
    print(log_copy)

    df = pd.DataFrame([log_copy])
    try:
        df.to_csv(filepath, mode='a', index=False, header=not pd.io.common.file_exists(filepath))
    except Exception as e:
        print("Logging failed:", e)
    

log_results(results, None)


{'Model': 'XGBClassifier', 'Scaler': StandardScaler(), 'Accuracy': 0.8873239436619719, 'Precision': 0.8771929824561403, 'Recall': 0.847457627118644, 'F1_Score': 0.8620689655172413, 'Hyperparameters': None}


# Day 3: Hyperparameter tuning

In [20]:
#hyperparameter tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

#defining parameters
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3]
}

#GridSearchCV with xgbclassifier
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring= 'f1',
    refit=True,
    verbose=2,
    n_jobs=-1
)

# Fitting the grid search
model = Model(X, Y, False)
splits = model.split_and_scale()
grid_search.fit(splits['X_train'], splits['Y_train'])

# Printting best parameters and scores for each metric
print("Best parameters:", grid_search.best_params_)
print("Best F1 score:", grid_search.best_score_)

# Getting the best model and evaluation
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best parameters: {'colsample_bytree': 1.0, 'gamma': 0.3, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}
Best F1 score: 0.8896323607068994


In [25]:
#Using the best parameters
final_model = XGBClassifier(
        colsample_bytree=1.0,
        gamma=0.3,
        learning_rate=0.2,
        max_depth=5,
        n_estimators=100,
        subsample=1.0,
        random_state=42
    )

model = Model(X, Y, False)
results = model.run(final_model)
X_test = results['X_test']
Y_test = results['Y_test']
model = results['Model']
Y_pred_test = model.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred_test)
test_precision = precision_score(Y_test, Y_pred_test)
test_recall = recall_score(Y_test, Y_pred_test)
test_f1score = f1_score(Y_test, Y_pred_test)
log_results(results, '''colsample_bytree=1.0,gamma=0.3,learning_rate=0.2,max_depth=5,n_estimators=100,subsample=1.0,random_state=42''')

Accuracy: 0.9507042253521126
Precision: 0.9482758620689655
Recall: 0.9322033898305084
F1 Score: 0.9401709401709402
{'Model': 'XGBClassifier', 'Scaler': StandardScaler(), 'Accuracy': 0.8873239436619719, 'Precision': 0.864406779661017, 'Recall': 0.864406779661017, 'F1_Score': 0.864406779661017, 'Hyperparameters': 'colsample_bytree=1.0,gamma=0.3,learning_rate=0.2,max_depth=5,n_estimators=100,subsample=1.0,random_state=42'}
