<a href="https://colab.research.google.com/github/Abdi-dotcom/Machine-Learning-Project/blob/main/20029423_AML_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats.mstats import winsorize

# sklearn basics
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, StratifiedKFold
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# IMPORTS FOR ADVANCED ML
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans
from sklearn.inspection import permutation_importance, partial_dependence, PartialDependenceDisplay

sns.set(
    { "figure.figsize": (10, 6) },
    style='ticks',
    color_codes=True,
    font_scale=0.8
)
sns.set_theme(style="whitegrid")
# Improves plot display in Jupyter Notebook
%config InlineBackend.figure_formats = set(('retina', 'svg'))

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# Location of the adverts CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/Colab Notebooks/adverts_cleaned.csv'

# Read the CSV into the working DataFrame used by the following cells.
car = pd.read_csv(filepath_or_buffer=file_path)


In [4]:
# Preview the first five rows (head() defaults to n=5).
car.head()

Unnamed: 0,mileage,standard_colour,standard_make,standard_model,vehicle_condition,year_of_registration,price,body_type,crossover_car_and_van,fuel_type,...,price_per_mile,condition_encoded,crossover_flag,mileage_per_year,Model_Count,Age_x_Mileage,model_popularity,popularity_class,price_bucket,New_fuel_type
0,0.0,Grey,Volvo,XC90,NEW,2020,73970,SUV,False,Petrol Plug-in Hybrid,...,73970.0,1,0,0.0,1665,0.0,0.258347,Medium,luxury/High-Performance,Hybrid
1,108230.0,Blue,Jaguar,XF,USED,2011,7000,Saloon,False,Diesel,...,0.064676,0,0,7730.714286,1948,162.288324,0.167452,Medium,Budget Friendly,Diesel
2,7800.0,Grey,SKODA,Yeti,USED,2017,14000,SUV,False,Petrol,...,1.794642,0,0,975.0,823,71.696058,0.084755,Low,Budget Friendly,Petrol
3,45000.0,Brown,Vauxhall,Mokka,USED,2016,7995,Hatchback,False,Diesel,...,0.177663,0,0,5000.0,1325,96.42996,0.09871,Low,Budget Friendly,Diesel
4,64000.0,Grey,Land Rover,Range Rover Sport,USED,2014,26995,SUV,False,Diesel,...,0.42179,0,0,5818.181818,2904,121.733194,0.328669,High,Mid-Range,Diesel


In [5]:
# Print the frame's index range, column names, non-null counts and dtypes.
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402005 entries, 0 to 402004
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   mileage                402005 non-null  float64
 1   standard_colour        402005 non-null  object 
 2   standard_make          402005 non-null  object 
 3   standard_model         402005 non-null  object 
 4   vehicle_condition      402005 non-null  object 
 5   year_of_registration   402005 non-null  int64  
 6   price                  402005 non-null  int64  
 7   body_type              402005 non-null  object 
 8   crossover_car_and_van  402005 non-null  bool   
 9   fuel_type              402005 non-null  object 
 10  price_winsorized       402005 non-null  int64  
 11  mileage_winsorized     402005 non-null  float64
 12  vehicle_age            402005 non-null  int64  
 13  log_mileage            402005 non-null  float64
 14  price_per_mile         402005 non-nu

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max), rounded to two decimals.
car.describe().round(decimals=2)

Unnamed: 0,mileage,year_of_registration,price,price_winsorized,mileage_winsorized,vehicle_age,log_mileage,price_per_mile,condition_encoded,crossover_flag,mileage_per_year,Model_Count,Age_x_Mileage,model_popularity
count,402005.0,402005.0,402005.0,402005.0,402005.0,402005.0,402005.0,402005.0,402005.0,402005.0,402005.0,402005.0,402005.0,402005.0
mean,37743.8,2015.36,17341.97,16435.67,37509.97,9.64,9.35,1450.27,0.08,0.0,3382.36,3303.32,96.78,0.24
std,34831.27,4.39,46437.46,14463.69,33799.68,4.39,2.82,7379.39,0.27,0.07,2507.22,3125.76,58.35,0.16
min,0.0,1933.0,120.0,1000.0,0.0,5.0,0.0,0.01,0.0,0.0,0.0,1.0,0.0,0.0
25%,10480.0,2014.0,7495.0,7495.0,10480.0,7.0,9.26,0.15,0.0,0.0,1447.0,932.0,62.13,0.12
50%,28630.0,2017.0,12600.0,12600.0,28630.0,8.0,10.26,0.47,0.0,0.0,3109.12,2171.0,86.71,0.19
75%,56877.0,2018.0,20000.0,20000.0,56877.0,11.0,10.95,1.78,0.0,0.0,4916.67,5275.0,124.33,0.34
max,999999.0,2020.0,9999999.0,96201.0,144000.0,92.0,11.88,96201.0,1.0,1.0,28800.0,11583.0,980.89,0.66


In [7]:
# Number of distinct values in each column.
car.nunique()

Unnamed: 0,0
mileage,80652
standard_colour,22
standard_make,110
standard_model,1168
vehicle_condition,2
year_of_registration,71
price,30578
body_type,16
crossover_car_and_van,2
fuel_type,9
