In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.ensemble import RandomForestRegressor

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
plt.style.use('ggplot')

In [2]:
# Location of the raw car-price CSV.
# NOTE(review): this is an absolute, machine-specific Windows path; it must be
# adjusted before the notebook can run on another machine.
csv_path = r'C:\Users\pc\OneDrive\Desktop\Machine Learning Projects\Car Price Prediction\car_price_prediction.csv'

data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


## ***----------------------------------  Preprocessing  ------------------------------------***

In [None]:
# Print the column names, non-null counts and dtypes of the raw frame.
data.info()

In [None]:
# Number of missing (NaN) entries in every column
missing_mask = data.isna()
missing_mask.sum()

In [None]:
# Count fully duplicated rows *before* removing them: counting only after the
# drop always yields 0 and hides how many rows were discarded.
duplicate_count = data.duplicated().sum()
print(f'Duplicated rows removed: {duplicate_count}')

# Rebind instead of mutating in place so the cell stays easy to reason about.
data = data.drop_duplicates()

# Sanity check: no duplicated rows may remain (displays 0).
data.duplicated().sum()

In [None]:
# Summary statistics for the numeric columns only
numeric_view = data.select_dtypes(include = 'number')
numeric_view.describe()

In [None]:
# Summary statistics (count / unique / top / freq) for the object-typed columns only
object_view = data.select_dtypes(include = 'object')
object_view.describe()

In [None]:
# Top and Bottom Value Counts for Each Column
def Top_and_Bottom_Combined (df, col, n = 5):
    """Return the ``n`` most and ``n`` least frequent values of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to summarise.
    col : str
        Label of the column whose value counts are computed.
    n : int, default 5
        Number of entries taken from each end of the frequency ranking.

    Returns
    -------
    pandas.Series
        Value counts indexed by value, ordered from most to least frequent and
        named ``'Top_and_Bottom_Combined'``. When the column has at most
        ``2 * n`` distinct values the complete ranking is returned once, so no
        value is listed twice.
    """
    # Compute the ranking once and slice both ends from it.
    counts = df[col].value_counts()

    if len(counts) <= 2 * n:
        # head(n) and tail(n) would overlap here and repeat rows.
        combined = counts.copy()
    else:
        combined = pd.concat([counts.head(n), counts.tail(n)], axis = 0)

    # A Series has no `columns`; its label is carried by `name`.
    return combined.rename('Top_and_Bottom_Combined')

# Print the most / least frequent values of every object-typed column,
# separated by a dashed rule.
separator = '-' * 50
object_columns = data.select_dtypes(include = 'object').columns

for col in object_columns:
    print(f'Top & Bottom Value Counts for {col}')
    print(Top_and_Bottom_Combined(df = data, col = col))
    print(separator)

## ***------------------------  Handling Categorical Columns  --------------------------***

In [6]:
# List the distinct gearbox categories present in the data.
data['Gear box type'].unique()

array(['Automatic', 'Tiptronic', 'Variator', 'Manual'], dtype=object)

In [9]:
# Levy: the '-' placeholder is replaced by 0, then the column is made numeric.
data['Levy'] = pd.to_numeric(data['Levy'].replace('-', 0))

# Leather interior: Yes / No -> 1 / 0.
leather_codes = {'Yes' : 1, 'No' : 0}
data['Leather interior'] = data['Leather interior'].replace(leather_codes)

# Mileage: strip the 'km' unit suffix before converting to a number.
mileage_text = data['Mileage'].str.replace('km', '')
data['Mileage'] = pd.to_numeric(mileage_text)

# Engine volume: remove the 'Turbo' marker (the turbo information is discarded).
engine_text = data['Engine volume'].str.replace('Turbo', '')
data['Engine volume'] = pd.to_numeric(engine_text)

# Doors: map the raw labels to door counts.
# NOTE(review): '04-May' / '02-Mar' look like spreadsheet-mangled ranges — confirm.
door_codes = {'04-May' : 4, '02-Mar' : 2, '>5' : 6}
data['Doors'] = data['Doors'].replace(door_codes)

In [10]:
# One-hot encode these three columns; get_dummies replaces each of them with
# one indicator column per category.
one_hot_columns = ["Gear box type", "Drive wheels", "Wheel"]
data = pd.get_dummies(data, columns = one_hot_columns)

# Integer-encode the remaining categorical columns. A single encoder object is
# re-fitted for every column, so afterwards it only remembers the last one.
label_encode_columns = ['Manufacturer', 'Category', 'Color', 'Fuel type', 'Model']
label_encoder = LabelEncoder()

for col in label_encode_columns:
    encoded_values = label_encoder.fit_transform(data[col])
    data[col] = encoded_values

## ***--------------------------------  Handling Outliers  ---------------------------------***

In [11]:
# Split the frame by dtype. Both results are new DataFrames built from `data`
# as it is at this point in the notebook.
numerical_features, categorical_features = (
    data.select_dtypes(include = dtype_kind) for dtype_kind in ('number', 'object')
)

In [None]:
# One box plot per numerical feature, drawn before outliers are handled

for feature in numerical_features.columns:
    fig, ax = plt.subplots(figsize = (8, 4))
    sns.boxplot(data[feature], ax = ax)
    ax.set_title(f'Box Plot of {feature}')

In [None]:
def handle_outliers(df, columns, factor = 1.5):
    """Cap outliers of the given columns at the IQR fences, in place.

    For every column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``; values outside them are replaced by the nearest
    fence (capping, not row removal). NaN values are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Its columns are overwritten in place.
    columns : iterable of column labels
        Columns to cap. Any iterable of labels works; iterating a DataFrame
        yields its column labels, so a DataFrame may be passed as well.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        upper_bound = Q3 + factor * IQR
        lower_bound = Q1 - factor * IQR

        # clip() upcasts integer columns when a fence is fractional. Writing a
        # float fence into an integer column through .loc is a
        # dtype-incompatible assignment that newer pandas versions warn about.
        df[col] = df[col].clip(lower = lower_bound, upper = upper_bound)

    return df

# Apply outlier capping to Price and the other measured / count-like columns.
# The identifier ('ID') and the encoded categorical columns (Manufacturer,
# Model, Category, Color, Fuel type, Leather interior, Doors) are skipped on
# purpose: IQR fences on arbitrary category codes have no meaning and can merge
# distinct categories into one fractional code.
outlier_columns = ['Price', 'Levy', 'Prod. year', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags']
handle_outliers(data, outlier_columns)

# Summarise the capped columns instead of echoing the entire DataFrame.
data[outlier_columns].describe()

In [None]:
# One box plot per numerical feature, drawn after outliers have been handled

for feature in numerical_features.columns:
    fig, ax = plt.subplots(figsize = (8, 4))
    sns.boxplot(data[feature], ax = ax)
    ax.set_title(f'Box Plot of {feature}')

## ***-----------------------------------  Visualization  ------------------------------------***

In [None]:
# Price Distribution
# sns.displot is figure-level and always opens its own figure, so combining it
# with plt.figure() leaves an empty extra figure and ignores the requested
# size. The axes-level histplot draws into the axes created here instead.
fig, ax = plt.subplots(figsize = (6, 10))

sns.histplot(data["Price"], bins = 30, kde = True, edgecolor = 'black', ax = ax)

ax.set_title('Price Distribution', fontsize = 16, fontweight = 'bold')
ax.set_xlabel('Price', fontsize = 12)
ax.set_ylabel('Count', fontsize = 12)

ax.tick_params(axis = 'x', rotation = 40)
ax.set_xlim(0, 50000)

fig.tight_layout()
plt.show()

In [None]:
# Car Category Distribution: number of rows per 'Category' value
fig, ax = plt.subplots(figsize = (10, 6))

sns.countplot(x = 'Category', data = data, ax = ax)

ax.set_title('Car Category Distribution', fontsize = 16, fontweight = 'bold')
ax.set_xlabel('Category', fontsize = 12)
ax.set_ylabel('Count', fontsize = 12)

# Vertical tick labels
ax.tick_params(axis = 'x', rotation = 90)

fig.tight_layout()
plt.show()

In [None]:
# Fuel Type Distribution (five most frequent fuel types)
# NOTE: 'Fuel type' is label-encoded earlier in the notebook, so the wedge
# labels shown here are integer codes rather than fuel names.
TOP_FUEL_TYPES = 5

# Take the top entries directly instead of dropping hard-coded row positions,
# and name both columns explicitly so the frame looks the same regardless of
# how the installed pandas version labels value_counts() output.
fuel_control = (
    data['Fuel type']
    .value_counts()
    .head(TOP_FUEL_TYPES)
    .rename_axis('Fuel type')
    .reset_index(name = 'count')
)
colors = ['#FF6F61', '#6B5B95', '#88B04B', '#F7CAC9', '#92A8D1']
explode = [0.02, 0, 0, 0, 0]

# explode must match the number of wedges exactly, so both lists are trimmed
# in case fewer than five fuel types are present.
n_wedges = len(fuel_control)

fig, ax = plt.subplots(figsize = (8, 8))

ax.pie(fuel_control['count'],
       labels = fuel_control['Fuel type'],
       explode = explode[:n_wedges],
       colors = colors[:n_wedges],
       autopct = '%1.1f%%',
       pctdistance = 0.5,
       wedgeprops = {'edgecolor': 'black', 'antialiased': True},  # black outline around every wedge
       textprops = {'fontsize': 10, 'color': 'black'})
ax.set_title('Fuel Type Distribution', fontsize = 16, fontweight = 'bold')

plt.show()

In [None]:
# Average price per value of a few selected features, one line chart each
x_axis_columns = ['Category', 'Fuel type', 'Mileage', 'Manufacturer']

for i, x_axis_column in enumerate(x_axis_columns):

    # Mean price for every distinct value of the current feature
    x_avg_price = data.groupby(x_axis_column)['Price'].mean().reset_index()

    # Manufacturer tick labels are drawn vertically, all others at 45 degrees
    label_rotation = 90 if x_axis_column == 'Manufacturer' else 45

    fig, ax = plt.subplots(figsize = (10, 6))
    ax.plot(x_avg_price[x_axis_column], x_avg_price['Price'], linestyle = '--', marker = 'o')

    ax.set_title(f'{x_axis_column} vs Average Price', fontsize = 16, fontweight = 'bold')
    ax.set_xlabel(f'{x_axis_column}', fontsize = 12)
    ax.set_ylabel('Price', fontsize = 12)
    ax.tick_params(axis = 'x', rotation = label_rotation)

    plt.tight_layout()
    plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# The correlations are computed from the current `data`: `numerical_features`
# is a copy taken before outlier capping, so it no longer reflects the values
# used for modelling.
fig, ax = plt.subplots(figsize = (10, 6))
sns.heatmap(data.select_dtypes(include = 'number').corr(), annot = True, ax = ax)
ax.set_title('Correlation Heatmap of Numerical Features', fontsize = 16, fontweight = 'bold')
plt.show()

## ***------------------------  Model Training and Evaluation  --------------------------***

In [20]:
# Features / target.
# NOTE(review): 'ID' looks like a listing identifier rather than a car
# attribute, so it is excluded from the features to keep the model from
# fitting to row identity — confirm against the data dictionary.
X = data.drop(columns = ['Price', 'ID'])
y = data['Price']

In [None]:
# Two-stage split: 30% of the rows are held out first, then that hold-out is
# halved into validation and test sets (70% / 15% / 15% overall).
holdout_fraction = 0.3
test_share_of_holdout = 0.5
split_seed = 42

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size = holdout_fraction, random_state = split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size = test_share_of_holdout, random_state = split_seed
)

print(f"Train set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

# Row counts recorded by the author:
#   Training set   (X_train, y_train) : 70% of the total data (13246 rows)
#   Validation set (X_val, y_val)     : 15% of the total data (2839  rows)
#   Test set       (X_test, y_test)   : 15% of the total data (2839  rows)

In [22]:
# Standardise the features. The scaler statistics come from the training split
# only and are then reused, unchanged, for the validation and test splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Fit a random forest on the training split and score it on the validation split.
model = RandomForestRegressor(n_estimators=100, random_state=42)

model.fit(X_train, y_train)

y_val_pred = model.predict(X_val)

# Take the square root explicitly: the `squared` keyword of mean_squared_error
# is deprecated and no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
r2 = r2_score(y_val, y_val_pred)

# The reported value is the *root* mean squared error, in the units of Price.
print("Root Mean Squared Error:", rmse)
print("Model Score (R^2):", r2)