<a href="https://colab.research.google.com/github/Ahmedtarekyoussef/Machine-Learning-Models/blob/main/PredictCarPrice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Setup for the dataset**

In [1]:
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle (15).json


{'kaggle.json': b'{"username":"ahmedtarek122","key":"8649a23c261b9b7a8eaf365c22cb931a"}'}

In [3]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [4]:
! kaggle datasets download -d deepcontractor/car-price-prediction-challenge

car-price-prediction-challenge.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# Reading the csv file

import pandas as pd

df = pd.read_csv('car-price-prediction-challenge.zip')
df.head(1)
#df

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12


# **Data Inspection & Pre-processing**

In [6]:
# used to know how many columns and rows do we have
num_rows, num_cols = df.shape
print(f'The dataframe has {num_rows} rows and {num_cols} columns.')

The dataframe has 19237 rows and 18 columns.


In [7]:
#Used to know the datatypes of all the columns

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  object 
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

In [8]:
#Removing the "km" phrase from the mileage column to convert its data type to int

df['Mileage'] = df['Mileage'].str.replace('km', '')
df['Mileage'] = df['Mileage'].astype(int)
print(df['Mileage'].dtype)

int64


In [9]:
# Data cleaning for the "Doors" column

df['Doors'] = df['Doors'].str.replace('May', '5')
df['Doors'] = df['Doors'].str.replace('Mar', '3')
df['Doors'].value_counts()

04-5    18332
02-3      777
>5        128
Name: Doors, dtype: int64

In [10]:
# The ID and levy columns is usless so we will drop them
df = df.drop('ID', axis=1)
df = df.drop('Levy', axis=1)

In [11]:
df.tail(1)

Unnamed: 0,Price,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
19236,470,HYUNDAI,Sonata,2012,Sedan,Yes,Hybrid,2.4,186923,4.0,Automatic,Front,04-5,Left wheel,White,12


In [12]:
#used to make sure that there is no null values in all columns

print(df.isnull().sum())

Price               0
Manufacturer        0
Model               0
Prod. year          0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64


In [13]:
#To get a statistical veiw of the numerical data types

df.describe()

Unnamed: 0,Price,Prod. year,Mileage,Cylinders,Airbags
count,19237.0,19237.0,19237.0,19237.0,19237.0
mean,18555.93,2010.912824,1532236.0,4.582991,6.582627
std,190581.3,5.668673,48403870.0,1.199933,4.320168
min,1.0,1939.0,0.0,1.0,0.0
25%,5331.0,2009.0,70139.0,4.0,4.0
50%,13172.0,2012.0,126000.0,4.0,6.0
75%,22075.0,2015.0,188888.0,4.0,12.0
max,26307500.0,2020.0,2147484000.0,16.0,16.0


In [14]:
#after seeing these statistics, the least expensive car price is 1 dollar which does not make sense so we will deal with it
p=(df['Price'] < 500).sum()
print(p)
# Now we know that there is 1663 cars that have less thn 500 dollars as price value, this doesnot make sense and we will use linear regression to deal with these values (Imputation), but first we should do encoding

1663


In [15]:
#Labeled encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Manufacturer'] = le.fit_transform(df['Manufacturer'])
df['Model'] = le.fit_transform(df['Model'])
df['Category'] = le.fit_transform(df['Category'])
df['Fuel type'] = le.fit_transform(df['Fuel type'])
df['Engine volume'] = le.fit_transform(df['Engine volume'])
df['Gear box type'] = le.fit_transform(df['Gear box type'])
df['Doors'] = le.fit_transform(df['Doors'])
df['Wheel'] = le.fit_transform(df['Wheel'])
df['Color'] = le.fit_transform(df['Color'])

In [16]:
# One hot encoding

one_hot = pd.get_dummies(df['Leather interior'])
df = pd.concat([df, one_hot], axis=1)
df = df.drop('Leather interior', axis=1)
df = df.rename(columns={'No': 'No Leather interior'})
df = df.rename(columns={'Yes': 'Leather interior'})

one_hot = pd.get_dummies(df['Drive wheels'])
df = pd.concat([df, one_hot], axis=1)
df = df.drop('Drive wheels', axis=1)

In [17]:
# To make sure that after encoding, all columns have numeric datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Price                19237 non-null  int64  
 1   Manufacturer         19237 non-null  int64  
 2   Model                19237 non-null  int64  
 3   Prod. year           19237 non-null  int64  
 4   Category             19237 non-null  int64  
 5   Fuel type            19237 non-null  int64  
 6   Engine volume        19237 non-null  int64  
 7   Mileage              19237 non-null  int64  
 8   Cylinders            19237 non-null  float64
 9   Gear box type        19237 non-null  int64  
 10  Doors                19237 non-null  int64  
 11  Wheel                19237 non-null  int64  
 12  Color                19237 non-null  int64  
 13  Airbags              19237 non-null  int64  
 14  No Leather interior  19237 non-null  uint8  
 15  Leather interior     19237 non-null 

In [18]:
#Linear regression model to impute the prices of cars that have prices less than 500$

import pandas as pd
from sklearn.linear_model import LinearRegression

# create a new dataframe with only the rows where the price is less than 500
low_price_df = df[df['Price'] < 500]

# create a new dataframe with only the rows where the price is greater than or equal to 500
high_price_df = df[df['Price'] >= 500]

# fit a linear regression model to the high_price_df data
model = LinearRegression()
X = high_price_df.drop('Price', axis=1)
y = high_price_df['Price']
model.fit(X, y)

# predict the price values for the low_price_df data using the trained model
X_test = low_price_df.drop('Price', axis=1)
y_pred = model.predict(X_test)

# replace the low price values with the predicted values
df.loc[low_price_df.index, 'Price'] = y_pred


In [19]:
#Making sure that know there is no car which is recorded cheaper than 500

df = df[df['Price'] > 500]
p=(df['Price'] < 500).sum()
print(p)

0


In [20]:
#Making sure that not much cloumns where lost in the imputation process, only 72 (0.3%) records were lost which is a low number
num_rows, num_cols = df.shape
print(f'The dataframe has {num_rows} rows and {num_cols} columns.')

The dataframe has 19165 rows and 19 columns.


In [21]:
#Reprdering the columns into a more logical form
new_order=['Price','Manufacturer','Model','Color','Prod. year','Category','Fuel type','Engine volume','Mileage','Cylinders','Gear box type','Doors','Wheel','Airbags','Leather interior','No Leather interior','4x4','Front','Rear']
df=df[new_order]
df.tail()

Unnamed: 0,Price,Manufacturer,Model,Color,Prod. year,Category,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Doors,Wheel,Airbags,Leather interior,No Leather interior,4x4,Front,Rear
19232,8467.0,36,385,12,1999,1,0,37,300000,4.0,1,0,0,5,1,0,0,0,1
19233,15681.0,23,1334,11,2011,9,5,44,161600,4.0,2,1,0,8,1,0,0,1,0
19234,26108.0,23,1442,7,2010,4,1,36,116365,4.0,0,1,0,4,1,0,0,1,0
19235,5331.0,8,456,1,2007,4,1,36,51258,4.0,0,1,0,4,1,0,0,1,0
19236,8628.305885,23,1334,14,2012,9,2,44,186923,4.0,0,1,0,12,1,0,0,1,0


In [22]:
# To get a statistical view before modelling
df.describe()

Unnamed: 0,Price,Manufacturer,Model,Color,Prod. year,Category,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Doors,Wheel,Airbags,Leather interior,No Leather interior,4x4,Front,Rear
count,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0,19165.0
mean,20183.65,33.08312,862.417115,7.763423,2010.953352,6.273259,3.427446,41.510514,1537546.0,4.583981,0.537855,0.965823,0.07472,6.588155,0.727055,0.272945,0.211531,0.668823,0.119645
std,190873.2,17.777581,411.074905,5.365747,5.552149,2.789658,1.808056,15.303363,48494630.0,1.200998,0.897531,0.213876,0.262945,4.322585,0.445485,0.445485,0.408405,0.470649,0.324555
min,549.0,0.0,0.0,0.0,1943.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7840.0,21.0,537.0,1.0,2009.0,4.0,2.0,32.0,70296.0,4.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
50%,14834.0,32.0,834.0,7.0,2012.0,7.0,5.0,36.0,126000.0,4.0,0.0,1.0,0.0,6.0,1.0,0.0,0.0,1.0,0.0
75%,23521.0,54.0,1226.0,12.0,2015.0,9.0,5.0,46.0,189000.0,4.0,1.0,1.0,0.0,12.0,1.0,1.0,0.0,1.0,0.0
max,26307500.0,64.0,1589.0,15.0,2020.0,10.0,6.0,106.0,2147484000.0,16.0,3.0,2.0,1.0,16.0,1.0,1.0,1.0,1.0,1.0


# **Model number 1 (Simple decision tree)**

In [23]:
# creating a copy of the dataframe
df_encoded=df.copy()

In [24]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop('Price', axis=1)
y = df_encoded['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
from sklearn.tree import DecisionTreeRegressor 

tree_clf = DecisionTreeRegressor(random_state=1)
tree_clf.fit(X_train, y_train)
val_predictions = tree_clf.predict(X_test)

In [26]:
# calculating the MAE

from sklearn.metrics import mean_absolute_error
val_mae = mean_absolute_error(y_test, val_predictions)
val_mae=val_mae.round()
print(val_mae)

5777.0


# **Model number 2 (decision tree with predetermined best depth)**

In [27]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop('Price', axis=1)
y = df_encoded['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# A method to be used to know the MAEs of different sepths of decision trees

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [29]:
candidate_max_leaf_nodes = [5, 10, 15, 20, 25, 35, 45, 50, 60, 70, 80, 90, 100, 250, 350, 450, 500]
# Write loop to find the ideal tree size from candidate_max_leaf_nodes
i=0
x= get_mae(candidate_max_leaf_nodes[i],X_train, X_test, y_train, y_test)
temp=x
pos=0
i+=1
while i<6:
    x= get_mae(candidate_max_leaf_nodes[i],X_train, X_test, y_train, y_test)
    if x<temp:
        temp=x
        pos=i
    i+=1

# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
best_tree_size = candidate_max_leaf_nodes[pos]
print("the tree size that has the lowest MAE is: ",best_tree_size)

the tree size that has the lowest MAE is:  35


In [30]:
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)

# fit the final model and uncomment the next two lines
final_model.fit(X, y)
val_prediction = final_model.predict(X_test)

In [31]:
# calculating the MAE

from sklearn.metrics import mean_absolute_error
val_mae = mean_absolute_error(y_test, val_prediction)
val_mae=val_mae.round()
print(val_mae)

8120.0


# **Model number 3 (Random Forests)**

In [32]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop('Price', axis=1)
y = df_encoded['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
from sklearn.ensemble import RandomForestRegressor

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)

# fit your model
rf_model.fit(X_train, y_train)
melb_preds = rf_model.predict(X_test)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_mae = mean_absolute_error(y_test, melb_preds)
rf_val_mae=rf_val_mae.round()

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))

Validation MAE for Random Forest Model: 6693.0
