# Vehicle Maintenance Prediction Project



In [22]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset
data = pd.read_csv('./data_file/vehicle_maintenance_data.csv')  
data.head()


Unnamed: 0,Vehicle_Model,Mileage,Maintenance_History,Reported_Issues,Vehicle_Age,Fuel_Type,Transmission_Type,Engine_Size,Odometer_Reading,Last_Service_Date,Warranty_Expiry_Date,Owner_Type,Insurance_Premium,Service_History,Accident_History,Fuel_Efficiency,Tire_Condition,Brake_Condition,Battery_Status,Need_Maintenance
0,Truck,58765,Good,0,4,Electric,Automatic,2000,28524,2023-11-23,2025-06-24,Second,20782,6,3,13.622204,New,New,Weak,1
1,Van,60353,Average,1,7,Electric,Automatic,2500,133630,2023-09-21,2025-06-04,Second,23489,7,0,13.625307,New,New,Weak,1
2,Bus,68072,Poor,0,2,Electric,Automatic,1500,34022,2023-06-27,2025-04-27,First,17979,7,0,14.306302,New,Good,Weak,1
3,Bus,60849,Average,4,5,Petrol,Automatic,2500,81636,2023-08-24,2025-11-05,Second,6220,7,3,18.709467,New,Worn Out,New,1
4,Bus,45742,Poor,5,1,Petrol,Manual,2000,97162,2023-05-25,2025-09-14,Third,16446,6,2,16.977482,Good,Good,Weak,1


In [3]:
# Check for missing values
print("Missing values per column:\n", data.isnull().sum())

# Drop rows or handle missing values as needed
data = data.dropna()  # Example: Dropping missing rows

# Display basic statistics
data.describe()

Missing values per column:
 Vehicle_Model           0
Mileage                 0
Maintenance_History     0
Reported_Issues         0
Vehicle_Age             0
Fuel_Type               0
Transmission_Type       0
Engine_Size             0
Odometer_Reading        0
Last_Service_Date       0
Warranty_Expiry_Date    0
Owner_Type              0
Insurance_Premium       0
Service_History         0
Accident_History        0
Fuel_Efficiency         0
Tire_Condition          0
Brake_Condition         0
Battery_Status          0
Need_Maintenance        0
dtype: int64


Unnamed: 0,Mileage,Reported_Issues,Vehicle_Age,Engine_Size,Odometer_Reading,Insurance_Premium,Service_History,Accident_History,Fuel_Efficiency,Need_Maintenance
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,54931.23268,2.49742,5.49226,1556.292,75551.18706,17465.3407,5.51556,1.50156,14.990323,0.80996
std,14401.912925,1.708781,2.875682,627.677218,43088.105658,7223.393401,2.874899,1.11951,2.885583,0.392336
min,30001.0,0.0,1.0,800.0,1001.0,5000.0,1.0,0.0,10.000098,0.0
25%,42471.5,1.0,3.0,1000.0,38009.0,11189.75,3.0,0.0,12.489037,1.0
50%,54810.0,2.0,5.0,1500.0,75598.5,17477.5,6.0,2.0,14.986352,1.0
75%,67391.5,4.0,8.0,2000.0,112999.5,23692.0,8.0,3.0,17.474676,1.0
max,80000.0,5.0,10.0,2500.0,149999.0,30000.0,10.0,3.0,19.999968,1.0


In [8]:
# Encode categorical features (if any)
data = pd.get_dummies(data, drop_first=True)

In [23]:
# Define features (X) and target (y)
X = data.drop('Need_Maintenance', axis=1)  # Replace with target column
y = data['Need_Maintenance']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
# Model 3: K-Nearest Neighbors Classifier
knn = KNeighborsClassifier(n_neighbors=5)  # Modify n_neighbors as needed
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Evaluation metrics
print("K-Nearest Neighbors Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))


K-Nearest Neighbors Accuracy: 0.7837
Classification Report:
               precision    recall  f1-score   support

           0       0.32      0.11      0.17      1915
           1       0.82      0.94      0.88      8085

    accuracy                           0.78     10000
   macro avg       0.57      0.53      0.52     10000
weighted avg       0.72      0.78      0.74     10000

Confusion Matrix:
 [[ 220 1695]
 [ 468 7617]]


### K-Nearest Neighbors Model Summary:

The K-Nearest Neighbors (KNN) classifier achieved an accuracy of 78.37%. However, the precision and recall for predicting the "Need_Maintenance" class (1) were high (0.82 and 0.94), indicating that the model is good at identifying maintenance needs. On the other hand, the precision and recall for class 0 (no maintenance needed) were low (0.32 and 0.11), meaning it struggles to correctly classify instances where no maintenance is required. This could suggest class imbalance, with more focus on correctly predicting class 1.

### Evaluation:
- **Accuracy**: 78.37%
- **Precision**: 0.82 (class 1), 0.32 (class 0)
- **Recall**: 0.94 (class 1), 0.11 (class 0)
- **F1-Score**: 0.88 (class 1), 0.17 (class 0)



In [16]:
#Model 2: Logistic Regression Model

feature_list = ['Vehicle_Model', 'Mileage', 'Maintenance_History', 'Reported_Issues', 'Vehicle_Age', 
                'Fuel_Type', 'Transmission_Type', 'Engine_Size', 'Odometer_Reading', 'Last_Service_Date', 
                'Warranty_Expiry_Date', 'Owner_Type', 'Insurance_Premium', 'Service_History', 
                'Accident_History', 'Fuel_Efficiency', 'Tire_Condition', 'Brake_Condition', 'Battery_Status']

X = data[feature_list]  # Features
y = data['Need_Maintenance']  # Target variable

# Data preprocessing
X = pd.get_dummies(X, drop_first=True)  # Convert categorical features into dummy variables
scaler = StandardScaler()
X = scaler.fit_transform(X)  # Standardize the features

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression Model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predictions
y_pred = log_reg.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



Accuracy: 0.9441
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85      1915
           1       0.96      0.97      0.97      8085

    accuracy                           0.94     10000
   macro avg       0.91      0.90      0.91     10000
weighted avg       0.94      0.94      0.94     10000

Confusion Matrix:
 [[1608  307]
 [ 252 7833]]


### Logistic Regression Model Summary:

The Logistic Regression model performed well with an accuracy of 94.41%. The model displayed high precision and recall for class 1 (maintenance needed), with values of 0.96 and 0.97, respectively. For class 0 (no maintenance), the precision and recall were 0.86 and 0.84. Overall, the model showed good balance between correctly identifying both classes, particularly excelling at predicting maintenance needs. 

### Evaluation:
- **Accuracy**: 94.41%
- **Precision**: 0.96 (class 1), 0.86 (class 0)
- **Recall**: 0.97 (class 1), 0.84 (class 0)
- **F1-Score**: 0.97 (class 1), 0.85 (class 0)

In [17]:
#Model 3: Random Forest
rf = RandomForestClassifier(random_state=42)

# Fit the model to the training data
rf.fit(X_train, y_train)

# Predict on the test data
y_pred_rf = rf.predict(X_test)

# Evaluate the model
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Random Forest Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1915
           1       1.00      1.00      1.00      8085

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

Confusion Matrix:
 [[1915    0]
 [   0 8085]]


### Random Forest Model Summary:

The Random Forest model achieved perfect accuracy of 100%, with both precision and recall for both classes (0: no maintenance, 1: maintenance needed) equal to 1.0. This indicates that the model was able to perfectly classify all instances, without any misclassification. Both the confusion matrix and classification report show ideal performance, where all predictions were correct.

### Evaluation:
- **Accuracy**: 100%
- **Precision**: 1.00 (both classes)
- **Recall**: 1.00 (both classes)
- **F1-Score**: 1.00 (both classes)

In [19]:
#Model 4: XGBoost

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluation
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1915
           1       1.00      1.00      1.00      8085

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

Confusion Matrix:
 [[1915    0]
 [   0 8085]]


### XGBoost Model Summary:

The XGBoost model achieved a perfect accuracy of 100%, with both precision and recall for each class (0: no maintenance, 1: maintenance needed) equal to 1.0. The classification report and confusion matrix indicate flawless performance, where all predictions were correct without any misclassifications.

### Evaluation:
- **Accuracy**: 100%
- **Precision**: 1.00 (both classes)
- **Recall**: 1.00 (both classes)
- **F1-Score**: 1.00 (both classes)