<a href="https://colab.research.google.com/github/AhmedSafwatMohamed/Predictive-Maintenance-for-Vehicle-Health/blob/main/notebooks/model_building_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Objective**

The purpose from notebook is to train and evaluate classification models.

# **Import libraries**

In [16]:
# Core Libraries
import pandas as pd
import numpy as np

# Train/Test Split
from sklearn.model_selection import train_test_split, GridSearchCV

# Handling Class Imbalance
from imblearn.over_sampling import SMOTE

# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Evaluation Metrics
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

# Pipeline Utilities (optional, for chaining steps)
from imblearn.pipeline import Pipeline

# Warnings (optional: suppress warnings for cleaner output)
import warnings
warnings.filterwarnings('ignore')


# **Load and preview the dataset**

In [10]:
# Load the data into a DataFrame
df = pd.read_csv(
    'https://raw.githubusercontent.com/AhmedSafwatMohamed/Predictive-Maintenance-for-Vehicle-Health/main/data/cleaned-data.csv'
)

df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

# Preview the DataFrame
df.head()

Unnamed: 0,Make_and_Model,Year_of_Manufacture,Vehicle_Type,Usage_Hours,Route_Info,Load_Capacity,Actual_Load,Maintenance_Type,Maintenance_Cost,Tire_Pressure,...,Predictive_Score,Maintenance_Required,Weather_Conditions,Road_Conditions,Delivery_Times,Downtime_Maintenance,Impact_on_Efficiency,Maintenance_Year,Maintenance_Month,Maintenance_Day
0,1.0,2022.0,0.0,530.0,1.0,7.534549,9.004247,1.0,110.165442,20.0,...,0.171873,1.0,0.0,0.0,30.0,0.093585,0.150063,2023.0,4.0,9.0
1,3.0,2015.0,1.0,10679.0,1.0,7.671728,6.111785,2.0,265.898087,20.0,...,0.24667,1.0,0.0,1.0,30.0,3.361201,0.343017,2023.0,7.0,20.0
2,0.0,2022.0,1.0,4181.0,1.0,2.901159,3.006055,1.0,412.48347,55.0,...,0.455236,1.0,0.0,0.0,48.627823,1.3653,0.1,2023.0,3.0,17.0
3,0.0,2011.0,0.0,2974.0,2.0,15.893347,18.82529,2.0,444.110857,20.0,...,0.060208,1.0,0.0,0.0,30.0,0.0,0.135749,2024.0,5.0,1.0
4,1.0,2014.0,1.0,2539.0,1.0,60.66832,65.605463,2.0,478.841922,55.0,...,0.264929,1.0,1.0,2.0,300.0,6.608704,0.395193,2023.0,11.0,15.0


# **Splitting data**

In [14]:
# Create x and y data
x = df.drop(['Maintenance_Required'], axis=1)
y = df['Maintenance_Required']

x.shape # Showing X dimensions


(92000, 26)

In [15]:
# Split data to train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# **Address the class imbalance**

In [19]:
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply only on the training set
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Check new class distribution
y_train_resampled.value_counts()


Unnamed: 0_level_0,count
Maintenance_Required,Unnamed: 1_level_1
1.0,56566
0.0,56566


# **Normalize data**

In [20]:
  # Initialize the scaler
scaler = StandardScaler()

# Fit on resampled training data, transform both train and test
x_train_scaled = scaler.fit_transform(x_train_resampled)
x_test_scaled = scaler.transform(x_test)


# **Build and test models**

In [25]:
# Define parameter grid
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

# Grid search
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)
rf_grid.fit(x_train_scaled, y_train_resampled)

print("🔍 Best RF Parameters:", rf_grid.best_params_)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
🔍 Best RF Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}


In [26]:
# Estimate scale_pos_weight (majority / minority)
scale_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

# Define parameter grid
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.1, 0.01],
}

# Grid search
xgb_grid = GridSearchCV(
    estimator=XGBClassifier(scale_pos_weight=scale_weight, use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=xgb_params,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)
xgb_grid.fit(x_train_scaled, y_train_resampled)

print("🔍 Best XGB Parameters:", xgb_grid.best_params_)


Fitting 3 folds for each of 8 candidates, totalling 24 fits
🔍 Best XGB Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
