In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the raw car-price table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/car_price_dataset.csv")
df.head(n=5)

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
0,Kia,Rio,2020,4.2,Diesel,Manual,289944,3,5,8501
1,Chevrolet,Malibu,2012,2.0,Hybrid,Automatic,5356,2,3,12092
2,Mercedes,GLA,2020,4.2,Diesel,Automatic,231440,4,2,11171
3,Audi,Q5,2023,2.0,Electric,Manual,160971,2,1,11780
4,Volkswagen,Golf,2003,2.6,Hybrid,Semi-Automatic,286618,3,3,2867


In [4]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
df.dtypes

Brand            object
Model            object
Year              int64
Engine_Size     float64
Fuel_Type        object
Transmission     object
Mileage           int64
Doors             int64
Owner_Count       int64
Price             int64
dtype: object

In [6]:
# Keep only the 64-bit integer and float columns; the object (string)
# columns are left out of the numeric working frame.
numeric_df = df.select_dtypes(
    include=["int64", "float64"],
    exclude=None,
)
numeric_df.head(n=5)

Unnamed: 0,Year,Engine_Size,Mileage,Doors,Owner_Count,Price
0,2020,4.2,289944,3,5,8501
1,2012,2.0,5356,2,3,12092
2,2020,4.2,231440,4,2,11171
3,2023,2.0,160971,2,1,11780
4,2003,2.6,286618,3,3,2867


In [7]:
# Per-column sample variance (ddof=1), used to compare the spread of the columns.
numeric_df.var(axis=0, ddof=1)

Year           4.757825e+01
Engine_Size    1.320946e+00
Mileage        7.451548e+09
Doors          1.232315e+00
Owner_Count    2.024023e+00
Price          9.688259e+06
dtype: float64

### Using Log normalization

In [8]:
# Append natural-log versions of the two high-variance columns.
# Both logs are computed from the existing columns before the frame is rebound,
# so the original Mileage and Price columns stay in place.
numeric_df = numeric_df.assign(
    Mileage_log=np.log(numeric_df["Mileage"]),
    Price_log=np.log(numeric_df["Price"]),
)
numeric_df.head(n=5)

Unnamed: 0,Year,Engine_Size,Mileage,Doors,Owner_Count,Price,Mileage_log,Price_log
0,2020,4.2,289944,3,5,8501,12.577443,9.047939
1,2012,2.0,5356,2,3,12092,8.585973,9.400299
2,2020,4.2,231440,4,2,11171,12.352076,9.321076
3,2023,2.0,160971,2,1,11780,11.98898,9.374158
4,2003,2.6,286618,3,3,2867,12.565906,7.961021


In [9]:
# Recompute the per-column sample variance (ddof=1) now that the log columns exist,
# so the raw and log-transformed spreads can be compared side by side.
numeric_df.var(axis=0, ddof=1)

Year           4.757825e+01
Engine_Size    1.320946e+00
Mileage        7.451548e+09
Doors          1.232315e+00
Owner_Count    2.024023e+00
Price          9.688259e+06
Mileage_log    1.004414e+00
Price_log      1.706055e-01
dtype: float64

## Feature Scaling

In [10]:
# StandardScaler is imported with the other dependencies in the notebook's
# first (imports) cell, so every dependency is declared in one place.
# Preview the numeric frame (raw columns plus the log columns) before scaling.
numeric_df.head()

Unnamed: 0,Year,Engine_Size,Mileage,Doors,Owner_Count,Price,Mileage_log,Price_log
0,2020,4.2,289944,3,5,8501,12.577443,9.047939
1,2012,2.0,5356,2,3,12092,8.585973,9.400299
2,2020,4.2,231440,4,2,11171,12.352076,9.321076
3,2023,2.0,160971,2,1,11780,11.98898,9.374158
4,2003,2.6,286618,3,3,2867,12.565906,7.961021


In [11]:
# Pull out just the two raw, large-scale columns as their own frame.
numeric_df_subset = numeric_df.loc[:, ["Mileage", "Price"]]
numeric_df_subset.head(n=5)

Unnamed: 0,Mileage,Price
0,289944,8501
1,5356,12092
2,231440,11171
3,160971,11780
4,286618,2867


#### Using StandardScaler to scale the features

In [18]:
# Create the scaler with its default behaviour spelled out:
# centre each column on its mean and divide by its standard deviation.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
numeric_df.head(n=5)

Unnamed: 0,Year,Engine_Size,Mileage,Doors,Owner_Count,Price,Mileage_log,Price_log
0,2020,4.2,289944,3,5,8501,12.577443,9.047939
1,2012,2.0,5356,2,3,12092,8.585973,9.400299
2,2020,4.2,231440,4,2,11171,12.352076,9.321076
3,2023,2.0,160971,2,1,11780,11.98898,9.374158
4,2003,2.6,286618,3,3,2867,12.565906,7.961021


In [19]:
# Take a fresh numeric-only view of the raw frame (integer and float columns),
# i.e. without the log columns that were added to numeric_df.
n_df = df.select_dtypes(
    include=["int", "float"],
    exclude=None,
)
n_df.head(n=5)

Unnamed: 0,Year,Engine_Size,Mileage,Doors,Owner_Count,Price
0,2020,4.2,289944,3,5,8501
1,2012,2.0,5356,2,3,12092
2,2020,4.2,231440,4,2,11171
3,2023,2.0,160971,2,1,11780
4,2003,2.6,286618,3,3,2867


In [22]:
# Standardise every column of n_df and wrap the scaler's output back into a
# DataFrame that carries the same column names AND the same row labels.
# Passing index= matters: without it the result silently gets a fresh
# 0..n-1 index, which would misalign with n_df/df if their index is ever
# not the default one (e.g. after filtering or a train/test split).
# NOTE(review): Price is scaled here together with the other columns, and the
# scaler is fitted on all rows — if Price is the prediction target and a
# train/test split follows, confirm this is intended (possible leakage).
n_df_scaled = pd.DataFrame(
    scaler.fit_transform(n_df),
    columns=n_df.columns,
    index=n_df.index,
)
n_df_scaled.head()

Unnamed: 0,Year,Engine_Size,Mileage,Doors,Owner_Count,Price
0,1.226021,1.043657,1.630075,-0.447821,1.412122,-0.113083
1,0.066156,-0.870607,-1.666895,-1.348689,0.006256,1.040674
2,1.226021,1.043657,0.952303,0.453046,-0.696677,0.744764
3,1.66097,-0.870607,0.135915,-1.348689,-1.39961,0.940431
4,-1.238692,-0.348535,1.591543,-0.447821,0.006256,-1.923238
