<a href="https://colab.research.google.com/github/AhmedSafwatMohamed/Predictive-Maintenance-for-Vehicle-Health/blob/main/notebooks/data-cleaning-preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Objective**

The purpose from notebook is to Clean and preprocess the data,and export the clean data to use in the model.

# **Import libraries**

In [9]:
import pandas as pd          # For loading and manipulating data
import numpy as np           # For numerical operations




import warnings
warnings.filterwarnings('ignore')  # To suppress warnings

from sklearn.preprocessing import LabelEncoder # To lable encode the catogrical variables




# **Load and preview the dataset**

In [2]:
# Load the data into a DataFrame
df = pd.read_csv(
    'https://raw.githubusercontent.com/AhmedSafwatMohamed/Predictive-Maintenance-for-Vehicle-Health/main/data/raw-data.csv'
)

# Preview the DataFrame
df.head()

Unnamed: 0,Vehicle_ID,Make_and_Model,Year_of_Manufacture,Vehicle_Type,Usage_Hours,Route_Info,Load_Capacity,Actual_Load,Last_Maintenance_Date,Maintenance_Type,...,Brake_Condition,Failure_History,Anomalies_Detected,Predictive_Score,Maintenance_Required,Weather_Conditions,Road_Conditions,Delivery_Times,Downtime_Maintenance,Impact_on_Efficiency
0,1,Ford F-150,2022,Truck,530,Rural,7.534549,9.004247,2023-04-09,Oil Change,...,Good,1,0,0.171873,1,Clear,Highway,30.0,0.093585,0.150063
1,2,Volvo FH,2015,Van,10679,Rural,7.671728,6.111785,2023-07-20,Tire Rotation,...,Fair,1,0,0.24667,1,Clear,Rural,30.0,3.361201,0.343017
2,3,Chevy Silverado,2022,Van,4181,Rural,2.901159,3.006055,2023-03-17,Oil Change,...,Good,1,1,0.455236,1,Clear,Highway,48.627823,1.3653,0.1
3,4,Chevy Silverado,2011,Truck,2974,Urban,15.893347,18.82529,2024-05-01,Tire Rotation,...,Good,0,1,0.060208,1,Clear,Highway,30.0,0.0,0.135749
4,5,Ford F-150,2014,Van,2539,Rural,60.66832,65.605463,2023-11-15,Tire Rotation,...,Good,1,1,0.264929,1,Rainy,Urban,300.0,6.608704,0.395193


# **Data cleaning**

In [4]:
# remove unneeded  features
df = df.drop(['Vehicle_ID', 'Engine_Temperature'], axis=1)
df.shape

(92000, 25)

In [5]:
# First, convert the column to datetime format
df['Last_Maintenance_Date'] = pd.to_datetime(df['Last_Maintenance_Date'], errors='coerce')

# Then, create new columns
df['Maintenance_Year'] = df['Last_Maintenance_Date'].dt.year
df['Maintenance_Month'] = df['Last_Maintenance_Date'].dt.month
df['Maintenance_Day'] = df['Last_Maintenance_Date'].dt.day

# Drop the original column
df.drop(columns=['Last_Maintenance_Date'], inplace=True)

# Preview the updated DataFrame
df[['Maintenance_Year', 'Maintenance_Month', 'Maintenance_Day']].head()


Unnamed: 0,Maintenance_Year,Maintenance_Month,Maintenance_Day
0,2023,4,9
1,2023,7,20
2,2023,3,17
3,2024,5,1
4,2023,11,15


In [6]:
# Label encoding categorical variables
# Define the list of categorical columns to encode
categorical_cols = [
    'Make_and_Model', 'Vehicle_Type', 'Route_Info',
    'Maintenance_Type', 'Brake_Condition',
    'Weather_Conditions', 'Road_Conditions'
]

# Initialize a dictionary to store encoders if needed later (e.g., for inverse transform)
label_encoders = {}

# Apply Label Encoding
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))  # Convert to string in case of missing/mixed types
    label_encoders[col] = le  # Store encoder for future use

# Preview the encoded DataFrame
df[categorical_cols].head()


Unnamed: 0,Make_and_Model,Vehicle_Type,Route_Info,Maintenance_Type,Brake_Condition,Weather_Conditions,Road_Conditions
0,1,0,1,1,1,0,0
1,3,1,1,2,0,0,1
2,0,1,1,1,1,0,0
3,0,0,2,2,1,0,0
4,1,1,1,2,1,1,2


In [7]:
# Change data type to float
df = df.astype(float)

# Show data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92000 entries, 0 to 91999
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Make_and_Model        92000 non-null  float64
 1   Year_of_Manufacture   92000 non-null  float64
 2   Vehicle_Type          92000 non-null  float64
 3   Usage_Hours           92000 non-null  float64
 4   Route_Info            92000 non-null  float64
 5   Load_Capacity         92000 non-null  float64
 6   Actual_Load           92000 non-null  float64
 7   Maintenance_Type      92000 non-null  float64
 8   Maintenance_Cost      92000 non-null  float64
 9   Tire_Pressure         92000 non-null  float64
 10  Fuel_Consumption      92000 non-null  float64
 11  Battery_Status        92000 non-null  float64
 12  Vibration_Levels      92000 non-null  float64
 13  Oil_Quality           92000 non-null  float64
 14  Brake_Condition       92000 non-null  float64
 15  Failure_History    

# **Export cleaned data **

In [8]:
df.to_csv('data/cleaned-data.csv')

OSError: Cannot save file into a non-existent directory: 'data'