# `Used Cars Price Prediction`

## 1. Importing Libraries & Data Understanding

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder , StandardScaler , LabelEncoder , MinMaxScaler , RobustScaler
from category_encoders import BinaryEncoder
from imblearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score 
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay
from sklearn.model_selection import cross_validate , train_test_split , StratifiedKFold , GridSearchCV , RandomizedSearchCV
from sklearn import set_config
from xgboost import XGBRegressor
import joblib

In [None]:
df=pd.read_csv('vehicles.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.info()

## 2. Data Cleaning

#### 2.1 Check Duplicates

In [None]:
# Remove the `id` column before counting duplicates (presumably a per-row
# identifier that would make every row distinct -- confirm against the data).
df.drop(columns='id', inplace=True)
duplicate_mask = df.duplicated()
duplicate_mask.sum()

#### 2.2 Drop Unnecessary cols

In [None]:
# Columns removed here are not used anywhere later in the notebook.
unused_columns = ['url','region_url','VIN','county','image_url','description']
df.drop(columns=unused_columns, inplace=True)

#### 2.3 Check Nulls

In [None]:
df.isnull().mean()*100

In [None]:
df.drop(['size'],axis=1,inplace=True)

##### 2.3.2 Drop Null Values

In [None]:
# Discard every row that is missing a value in any of these columns.
required_columns = ['manufacturer','year','model','fuel','odometer','title_status','transmission']
df.dropna(subset=required_columns, inplace=True)

In [None]:
df.isnull().mean()*100

#### 2.4 Check the variation of each column

In [None]:
# Print the number of distinct (non-null) values of every object column.
object_cardinality = df.select_dtypes('object').nunique()
for column_name, distinct_count in object_cardinality.items():
    print(f'{column_name} -- {distinct_count}')

In [None]:
df['region'].unique()

In [None]:
df['model'].unique().tolist()

In [None]:
df.drop(['region','model'],axis=1,inplace=True)

In [None]:
# Re-run the cardinality report on the object columns that remain in df.
remaining_cardinality = df.select_dtypes('object').nunique()
for column_name, distinct_count in remaining_cardinality.items():
    print(f'{column_name} -- {distinct_count}')

In [None]:
df.dropna(subset=['posting_date'],axis=0,inplace=True)

In [None]:
df['posting_date'] =pd.to_datetime(df['posting_date'],utc=True)

In [None]:
df['Year']=df['posting_date'].dt.year
df['month'] = df['posting_date'].dt.month
df['day']=df['posting_date'].dt.day

In [None]:
df['Year'].unique()

In [None]:
df['month'].unique()

In [None]:
df['day'].unique()

In [None]:
df.drop(['posting_date','Year','month','day'],axis=1 , inplace=True)

In [None]:
df

In [None]:
df.info()

In [None]:
df['paint_color'].value_counts()

In [None]:
px.histogram(data_frame=df,x='paint_color')

In [None]:
df.drop('paint_color',axis=1,inplace=True)

In [None]:
df.info()

##### I will also drop lat and long because they are redundant when the region and state of the car are known

In [None]:
df.drop(['lat','long'],axis=1,inplace=True)

In [None]:
df.info()

### Dealing with null values

In [None]:
df['condition'].value_counts()

In [None]:
imputer=SimpleImputer(strategy='most_frequent')
df['condition'] = imputer.fit_transform(df[['condition']])

In [None]:
df['cylinders'].value_counts()

In [None]:
imputer=SimpleImputer(strategy='most_frequent')
df['cylinders'] = imputer.fit_transform(df[['cylinders']])

In [None]:
df['drive'].value_counts()

In [None]:
df['type'].value_counts()

##### FWD is a drivetrain configuration where the engine's power is transmitted to the front wheels of the vehicle, pulling the vehicle forward. This two-wheel drive setup is commonly found in compact cars, sedans, and smaller SUVs, with the engine sitting just above the wheels it’s powering. FWD vehicles offer several benefits that make them an attractive choice for many buyers.
###### All-Wheel Drive (AWD or 4WD) is a drivetrain configuration where power is distributed to all four wheels of the vehicle. AWD is commonly found in SUVs and crossovers, but even sedans are now offering the option for better traction when road conditions are suboptimal.

In [None]:
# Give the frame a clean 0..n-1 index (rows were dropped earlier).
df.reset_index(drop=True,inplace=True)

# Fill the drivetrain of sedans with 'fwd' only where it is missing.
# The previous per-row loop overwrote `drive` for every sedan, discarding
# drivetrains that were actually recorded in the listing (e.g. 'rwd' or
# '4wd'). This step belongs to the null-handling section, so known values
# are left untouched. A single boolean-mask assignment also replaces the
# slow row-by-row `iat` loop.
sedan_without_drive = (df['type'] == 'sedan') & df['drive'].isna()
df.loc[sedan_without_drive, 'drive'] = 'fwd'

In [None]:
imputer= SimpleImputer(strategy='constant',fill_value='4wd')
df['drive'] = imputer.fit_transform(df[['drive']])

In [None]:
df.info()

In [None]:
imputer =SimpleImputer(strategy='most_frequent')
df['type'] = imputer.fit_transform(df[['type']])

In [None]:
df['year'] = pd.to_numeric(df['year'],downcast='integer')
df['odometer'] = pd.to_numeric(df['odometer'],downcast='integer')

## 3. EDA

#### 3.1 The distribution for each column

##### 3.1.1 let's start with price 

### As we can see from <a href="https://www.motor1.com/features/308149/most-expensive-new-cars-ever/">This Website</a>, the most expensive car costs around 30 million, so we can consider prices above 200,000 to be outliers
### and we can see from <a href="https://www.hotcars.com/world-cheapest-new-cars/#minghong-s1-pro---1-265">This Website</a> that the cheapest car costs around 975, so we can also consider prices below 900 to be outliers

In [None]:
# Remove listings priced above 200000 or below 900.
price_is_outlier = df['price'].gt(200000) | df['price'].lt(900)
df.drop(index=df.index[price_is_outlier], inplace=True)

In [None]:
px.box(data_frame=df,x='price')

##### 3.1.2 Year

In [None]:
px.histogram(data_frame = df, x= 'year')

In [None]:
# Remove vehicles whose model year is earlier than 1960.
built_before_1960 = df['year'].lt(1960)
df.drop(index=df.index[built_before_1960], inplace=True)

In [None]:
px.histogram(data_frame = df, x= 'year',nbins=61)

##### 3.1.3 Odometer

In [None]:
px.box(data_frame=df,x='odometer')

### As we can see from <a href="https://www.caranddriver.com/research/a32758625/how-many-miles-does-a-car-last/">This Website</a> Standard cars in this day and age are expected to keep running up to 200,000 miles, while cars with electric engines are expected to last for up to 300,000 miles.

In [None]:
# Remove listings whose odometer reading exceeds 300000.
odometer_too_high = df['odometer'].gt(300000)
df.drop(index=df.index[odometer_too_high], inplace=True)

In [None]:
px.box(data_frame=df,x='odometer')

### As we can see from <a href="https://newyork.craigslist.org/search/cta?max_auto_miles=1&min_auto_miles=0#search=1~gallery~0~0">This Website</a>, an odometer value of zero means the odometer reading was not provided in the listing, so rows where the odometer equals zero will be dropped

In [None]:
# Remove listings whose odometer reading is exactly zero.
odometer_is_zero = df['odometer'].eq(0)
df.drop(index=df.index[odometer_is_zero], inplace=True)

In [None]:
px.box(data_frame=df,x='odometer')

##### 3.1.4 Manufacturer

In [None]:
px.histogram(data_frame=df,x='manufacturer')

##### 3.1.5 Condition

In [None]:
px.histogram(data_frame=df,x='condition')

In [None]:
# Rows were dropped above, so restore a contiguous 0..n-1 index.
df.reset_index(drop=True,inplace=True)

# Fold the 'new' condition into 'like new'.
# One boolean-mask assignment replaces the per-row `iat` loop, which ran in
# Python and rebuilt the column list on every iteration.
df.loc[df['condition'] == 'new', 'condition'] = 'like new'

In [None]:
# Keep the index contiguous (a no-op if it was just reset).
df.reset_index(drop=True,inplace=True)

# Fold the 'salvage' condition into 'fair'.
# One boolean-mask assignment replaces the per-row `iat` loop, which ran in
# Python and rebuilt the column list on every iteration.
df.loc[df['condition'] == 'salvage', 'condition'] = 'fair'

In [None]:
px.histogram(data_frame=df,x='condition')

##### 3.1.5 cylinders

In [None]:
df['cylinders'].value_counts()

In [None]:
px.histogram(data_frame=df,x='cylinders',text_auto=True)

##### 3.1.6 Fuel

In [None]:
df['fuel'].value_counts()

In [None]:
px.histogram(data_frame=df,x='fuel',text_auto=True)

##### 3.1.7 Title Status

In [None]:
df['title_status'].value_counts()

###### 3.1.7.1  Meaning of each value in the column
<li>Clean : A clean title means that there are no liens against the vehicle, but it means more than that.</li>
<li>lien : When a vehicle suffers damage, the insurance company evaluates the cost of repairing the damage and compares it to the value of the vehicle. Sometimes the damage to the vehicle costs more to repair than its value or more than a set percentage of the value. This is what we mean when we say that a vehicle was “totaled.” The insurance will only pay up to the value of the vehicle, not the amount needed to repair it. Once the owner has received payment, the vehicle becomes the property of the insurance company. After deeming a car a total loss, the insurance company usually reports it to the local DMV (Department of Motor Vehicles). Although the process differs in some states, this usually results in the car getting a salvage title. In addition to having no liens, a vehicle with a clean title also has no salvage title.</li>
<li>salvage : Many people believe that a totaled car has suffered damage beyond repair. In reality, the salvage classification depends on the cost of repairs in comparison to the vehicle’s value. Some cars suffer so much damage that they are “unrepairable”. As such, they can only legally be sold for parts or as scrap. A salvage vehicle is a good choice for some people, but not for everyone. These salvage cars do not have the higher prices of cars with a clean title. If you have access to cheap repairs or can do them yourself, you probably can restore the vehicle to drivable condition. Once you repair the vehicle, you may apply for a new “rebuilt” title. However, the vehicle will never have a clean title.</li>
<li>parts_only : “parts only” likely means the seller simply doesn’t think the car can be repaired, but it’s possible that the car was issued a junk title, meaning it cannot legally be driven ever again.</li>
<li>missing : the title certificate is missing from the seller</li>

In [None]:
# Average of every numeric column per title status.
# numeric_only=True keeps the text columns (manufacturer, fuel, ...) out of
# the aggregation; without it, recent pandas versions raise a TypeError when
# asked to average string columns.
df.groupby('title_status').mean(numeric_only=True)

In [None]:
px.histogram(data_frame=df,x='title_status')

##### 3.1.8 Transmission

In [None]:
df['transmission'].value_counts()

In [None]:
px.histogram(data_frame=df,x='transmission')

###### other : this means that the customer did not specify either automatic or manual, but gave something like CVT (continuously variable transmission), which is a type of automatic transmission, or did not mention the transmission type at all

##### 3.1.9 Drive

In [None]:
df['drive'].value_counts()

In [None]:
px.histogram(data_frame=df,x='drive')

##### 3.1.10 Type

In [None]:
df['type'].value_counts()

In [None]:
px.histogram(data_frame=df,x='type')

##### 3.1.11 State

In [None]:
df['state'].value_counts()

In [None]:
px.histogram(data_frame=df,x='state')

In [None]:
df.drop('state',axis=1,inplace=True)

## 4.Modeling

In [None]:
# Target is the listing price; every other remaining column is a feature.
y = df['price']
x = df.drop(columns='price')

In [None]:
# Print the number of distinct (non-null) values of every object feature.
feature_cardinality = x.select_dtypes('object').nunique()
for feature_name, distinct_count in feature_cardinality.items():
    print(f'{feature_name} -- {distinct_count}')

In [None]:
# Encoder for the low-cardinality categorical columns: dense one-hot output
# with the first level of each feature dropped.
# scikit-learn renamed the `sparse` argument to `sparse_output` and later
# removed the old name, so try the new spelling first and fall back to the
# old one on installations that do not know it yet.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False , drop = 'first')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False , drop = 'first')
small_cat_df = make_pipeline(one_hot_encoder)

# Encoder for the higher-cardinality categorical columns.
big_cat_df = make_pipeline(BinaryEncoder())

In [None]:
# Route each categorical column to its encoder; columns not listed here are
# passed through unchanged.
one_hot_columns = ['condition','cylinders','fuel','title_status','transmission','drive']
binary_columns = ['manufacturer','type']
Transform = ColumnTransformer(
    transformers=[
        ('small_cat', small_cat_df, one_hot_columns),
        ('big_cat', big_cat_df, binary_columns),
    ],
    remainder='passthrough',
)

##### 4.1 Linear Regression

In [None]:
# Linear-regression pipeline: column encoding (Transform) -> degree-1
# polynomial expansion -> standardisation -> ordinary least squares.
# NOTE(review): with degree=1 the polynomial step presumably adds nothing but
# a constant column -- confirm whether a higher degree was intended.
LR = make_pipeline(Transform ,
                   PolynomialFeatures(degree=1),
                  StandardScaler(),
                  LinearRegression(n_jobs=10))

In [None]:
# 5-fold cross-validation of the linear pipeline, scored with R^2 on both the
# training and the held-out part of each fold.
scores = cross_validate(
    LR,
    x,
    y,
    scoring='r2',
    cv=5,
    return_train_score=True,
)

In [None]:
scores['train_score'].mean()

In [None]:
scores['test_score'].mean()

##### 4.2 DecisionTree

In [None]:
DT = make_pipeline(Transform ,
                   PolynomialFeatures(degree=1),
                  StandardScaler(),
                  DecisionTreeRegressor(min_samples_leaf=5,splitter='best',min_samples_split=10))

In [None]:
# Grid for the decision-tree pipeline: only the tree depth is searched.
DT_params = [{'decisiontreeregressor__max_depth': [15, 16, 17]}]

In [None]:
DTR = GridSearchCV(estimator=DT , param_grid=DT_params , scoring='r2' , 
                        cv = 5 , return_train_score=True)

In [None]:
DTR.fit(x , y)

In [None]:
DTR.best_estimator_

In [None]:
DTR.best_score_

In [None]:
DTR.cv_results_['mean_train_score']

In [None]:
DTR.cv_results_['mean_test_score']

##### 4.3 RandomForest

In [None]:
RF = make_pipeline(Transform ,
                   PolynomialFeatures(degree=1),
                  StandardScaler(),
                  RandomForestRegressor(max_depth=14,min_samples_split=7,min_samples_leaf=3))

In [None]:
# 5-fold cross-validation of the random-forest pipeline, scored with R^2 on
# both the training and the held-out part of each fold.
scores = cross_validate(
    RF,
    x,
    y,
    scoring='r2',
    cv=5,
    return_train_score=True,
)

In [None]:
scores['train_score'].mean()

In [None]:
scores['test_score'].mean()

#### 4.4 XGBoost

In [None]:
XGB = make_pipeline(Transform ,
                   PolynomialFeatures(degree=1),
                  StandardScaler(),
                  XGBRegressor())

In [None]:
# 5-fold cross-validation of the XGBoost pipeline, scored with R^2 on both
# the training and the held-out part of each fold.
scores = cross_validate(
    XGB,
    x,
    y,
    scoring='r2',
    cv=5,
    return_train_score=True,
)

In [None]:
scores['train_score'].mean()

In [None]:
scores['test_score'].mean()

In [None]:
XGB.fit(x,y)

In [None]:
# Predict the price of one hand-written listing with the fitted pipeline.
# The column labels come from `x`, the feature frame the model was trained
# on, rather than from `df.columns[1:]`, which is only correct while `price`
# happens to be the first column of `df`.
sample_listing = pd.DataFrame(
    data=[[2014,'gmc','good','8 cylinders','gas',57923,'clean','other','4wd','pickup']],
    columns=x.columns,
)
XGB.predict(sample_listing)[0]

In [None]:
joblib.dump(XGB,'XGB')

In [None]:
# Persist the cleaned frame as CSV text. The file name carries no `.csv`
# extension, and `index` is not specified.
# NOTE(review): confirm whether any consumer expects this exact file name and
# whether the row index should be written.
df.to_csv('Cleaned_Vehicles_Data')