In [None]:
import pandas as pd

# Load the listings table from 'merc.csv' (path is relative to the working directory).
df = pd.read_csv('merc.csv')
# Last expression of the cell: show (n_rows, n_columns) of the loaded frame.
df.shape

(13119, 9)

In [None]:
# Preview the first rows of the loaded frame to see which columns contain gaps.
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,SLK,2005.0,5200,Automatic,63000.0,Petrol,325.0,,1.8
1,S Class,2017.0,34948,Automatic,27000.0,Hybrid,20.0,61.4,
2,,2016.0,49948,Automatic,6200.0,Petrol,555.0,28.0,5.5
3,G Class,2016.0,61948,Automatic,16000.0,Petrol,325.0,30.4,4.0
4,G Class,2016.0,73948,Automatic,4000.0,Petrol,325.0,30.1,4.0


In [None]:
# Count of missing entries in each column (column-wise reduction over the NaN mask).
df.isnull().sum(axis=0)

model           3794
year            3896
price              0
transmission       0
mileage         3877
fuelType           0
tax             3814
mpg             3903
engineSize      3832
dtype: int64

In [None]:
# Share of missing entries per column: the mean of a boolean mask is
# the number of True values divided by the number of rows.
df.isna().mean()

model           0.289199
year            0.296974
price           0.000000
transmission    0.000000
mileage         0.295526
fuelType        0.000000
tax             0.290723
mpg             0.297507
engineSize      0.292095
dtype: float64

In [None]:
# Rows in which five or more of the fields are missing.
nan_per_row = df.isna().sum(axis=1)
df_miss_row = df.loc[nan_per_row >= 5]
df_miss_row.shape

(133, 9)

In [None]:
# Keep only the rows that have fewer than five missing fields.
keep_mask = df.isna().sum(axis=1) < 5
df = df.loc[keep_mask]
df.shape

(12986, 9)

In [None]:
# Sanity check: after filtering, no row should have five or more missing fields,
# so this selection is expected to be empty.
df.loc[df.isna().sum(axis=1).ge(5)]

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize


In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split is reproducible between runs.
random_state = 42

# The whole frame (including the `price` column) is split row-wise.
X_train, X_test = train_test_split(df, test_size=0.2, random_state=random_state)

# The splits are row subsets of `df`, and the following cells assign imputed
# columns into them in place. Take explicit copies so those assignments act on
# independent frames and cannot raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

In [None]:
from sklearn.impute import SimpleImputer

# Columns whose missing values are replaced with a literal 0.
zero_cols = ['mileage', 'tax']
imp_const_0 = SimpleImputer(strategy='constant', fill_value=0)

# Fit on the training split only; the fitted imputer is applied to X_test in a later cell.
imp_const_0.fit(X_train[zero_cols])
X_train[zero_cols] = imp_const_0.transform(X_train[zero_cols])
X_train.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
8498,GLC Class,2019.0,45299,Semi-Auto,2792.0,Diesel,145.0,40.4,
9102,,2018.0,19099,Semi-Auto,12321.0,Petrol,145.0,,1.5
2400,A Class,2019.0,17702,Automatic,0.0,Diesel,145.0,68.9,1.5
8294,,,21749,Semi-Auto,27178.0,Diesel,160.0,51.4,
6506,E Class,2018.0,35950,Automatic,0.0,Diesel,0.0,42.8,3.0


In [None]:
# Confirm that the zero-filled columns no longer contain missing values.
X_train.loc[:, zero_cols].isna().sum()

mileage    0
tax        0
dtype: int64

In [None]:
# Categorical column(s) whose gaps are replaced by the placeholder label 'unknown'.
unk_cols = ['model']
imp_const_unknown = SimpleImputer(strategy='constant', fill_value='unknown')

# Fit on the training split only; the fitted imputer is applied to X_test in a later cell.
imp_const_unknown.fit(X_train[unk_cols])
X_train[unk_cols] = imp_const_unknown.transform(X_train[unk_cols])
X_train.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
8498,GLC Class,2019.0,45299,Semi-Auto,2792.0,Diesel,145.0,40.4,
9102,unknown,2018.0,19099,Semi-Auto,12321.0,Petrol,145.0,,1.5
2400,A Class,2019.0,17702,Automatic,0.0,Diesel,145.0,68.9,1.5
8294,unknown,,21749,Semi-Auto,27178.0,Diesel,160.0,51.4,
6506,E Class,2018.0,35950,Automatic,0.0,Diesel,0.0,42.8,3.0


In [None]:
# Numeric columns whose gaps are filled with the column median of the training split.
med_cols = ['mpg', 'engineSize']
imp_median = SimpleImputer(strategy='median')

# The medians are learned here and reused unchanged for X_test in a later cell.
imp_median.fit(X_train[med_cols])
X_train[med_cols] = imp_median.transform(X_train[med_cols])
X_train.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
8498,GLC Class,2019.0,45299,Semi-Auto,2792.0,Diesel,145.0,40.4,2.0
9102,unknown,2018.0,19099,Semi-Auto,12321.0,Petrol,145.0,56.5,1.5
2400,A Class,2019.0,17702,Automatic,0.0,Diesel,145.0,68.9,1.5
8294,unknown,,21749,Semi-Auto,27178.0,Diesel,160.0,51.4,2.0
6506,E Class,2018.0,35950,Automatic,0.0,Diesel,0.0,42.8,3.0


In [None]:
# Confirm that the median-filled columns no longer contain missing values.
X_train.loc[:, med_cols].isna().sum()

mpg           0
engineSize    0
dtype: int64

In [None]:
from sklearn.impute import KNNImputer

# Numeric features handed to the KNN imputer (both as neighbours and as targets).
num_columns = [
    'year',
    'mileage',
    'tax',
    'mpg',
    'engineSize',
]

# Each missing value is estimated from the 3 nearest rows.
# NOTE(review): the columns are passed unscaled, so large-magnitude features such
# as mileage presumably dominate the neighbour distance — confirm whether scaling
# before imputation is wanted.
imp_knn = KNNImputer(n_neighbors=3)

In [None]:
# Learn the neighbour model on the training split, then fill the remaining gaps.
# If the earlier imputation cells have run, `year` is the only one of these
# columns that can still contain NaN at this point.
imp_knn.fit(X_train[num_columns])
X_train[num_columns] = imp_knn.transform(X_train[num_columns])
X_train.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
8498,GLC Class,2019.0,45299,Semi-Auto,2792.0,Diesel,145.0,40.4,2.0
9102,unknown,2018.0,19099,Semi-Auto,12321.0,Petrol,145.0,56.5,1.5
2400,A Class,2019.0,17702,Automatic,0.0,Diesel,145.0,68.9,1.5
8294,unknown,2017.333333,21749,Semi-Auto,27178.0,Diesel,160.0,51.4,2.0
6506,E Class,2018.0,35950,Automatic,0.0,Diesel,0.0,42.8,3.0


In [None]:
# Final check: every column of the training split should now report zero missing values.
X_train.isnull().sum(axis=0)

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [None]:
# Apply the imputers fitted on X_train to the test split, in the same order as
# for training. Only transform() is called, so nothing is learned from X_test.
for fitted_imputer, target_cols in (
    (imp_const_0, zero_cols),
    (imp_const_unknown, unk_cols),
    (imp_median, med_cols),
    (imp_knn, num_columns),
):
    X_test[target_cols] = fitted_imputer.transform(X_test[target_cols])

In [None]:
# Persist both imputed splits as CSV files; the row index is not written.
for split_frame, out_path in (
    (X_train, 'merc_train.csv'),
    (X_test, 'merc_test.csv'),
):
    split_frame.to_csv(out_path, index=False)