# Step 2 Handling Missing Values

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats.mstats import winsorize

import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

%matplotlib inline
# %matplotlib notebook

plt.rcParams["figure.figsize"] = (10,6)
# plt.rcParams['figure.dpi'] = 100

sns.set_style("whitegrid")
pd.set_option('display.float_format', lambda x: '%.3f' % x)

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 150

In [2]:
df=pd.read_csv("Capstone_new.csv")

In [3]:
df.head(1)

Unnamed: 0,make_model,body_type,price,vat,km,hp,Type,Previous Owners,Next Inspection,Inspection new,Warranty,Paint Type,Nr. of Doors,Nr. of Seats,Gearing Type,Displacement,Cylinders,Weight,Drive chain,Fuel,CO2 Emission,Emission Class,Comfort & Convenience,Entertainment & Media,Extras,Safety & Security,Gears,age,Upholstery_type,Upholstery_color,Consumption_combined,Consumption_city,Consumption_country
0,Audi A1,Sedans,15770,VAT deductible,56013.0,66.0,Used,2.0,06/2021,Yes,,Metallic,5.0,5.0,Automatic,1422.0,3.0,1220.0,front,Diesel,99.0,Euro 6,"Air conditioning,Armrest,Automatic climate con...","Bluetooth,Hands-free equipment,On-board comput...","Alloy wheels,Catalytic Converter,Voice Control","ABS,Central door lock,Daytime running lights,D...",,2016.0,Cloth,Black,3.8,4.3,3.5


In [4]:
# Names of all columns that contain at least one missing value, in column order.
list_miss = df.columns[df.isnull().any()].tolist()
list_miss

['body_type',
 'vat',
 'km',
 'hp',
 'Type',
 'Previous Owners',
 'Next Inspection',
 'Inspection new',
 'Warranty',
 'Paint Type',
 'Nr. of Doors',
 'Nr. of Seats',
 'Displacement',
 'Cylinders',
 'Weight',
 'Drive chain',
 'CO2 Emission',
 'Emission Class',
 'Comfort & Convenience',
 'Entertainment & Media',
 'Extras',
 'Safety & Security',
 'Gears',
 'age',
 'Upholstery_type',
 'Upholstery_color',
 'Consumption_combined',
 'Consumption_city',
 'Consumption_country']

**Firstly we should fill the most helpful columns!**

In [5]:
def first_overview(x, data=None):
    """Print a missing-value summary for a single column.

    Parameters
    ----------
    x : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``df``.

    Prints the column name, the number and percentage of nulls, and the
    value counts (NaN included). Returns None.
    """
    frame = df if data is None else data
    n_nulls = frame[x].isnull().sum()
    print("column_name",x)
    print("**************")
    print("number_of_nulls", n_nulls)
    # shape[0] is the number of rows; shape has no third entry, so shape[2] raised IndexError.
    print("per_of nulls", "%", round(n_nulls*100/frame.shape[0],2))
    print(frame[x].value_counts(dropna=False))

### 1) Age

In [6]:
# The raw "age" column holds a calendar year (e.g. 2016.0 in the first row shown above);
# subtracting it from 2019 turns it into an age in years (2019 is presumably the data-collection year).
# NOTE(review): the column is overwritten, so re-running this cell on its own flips the values back.
df["age"]=2019-df["age"]

In [7]:
df.groupby("Type").age.value_counts(dropna=False)

Type            age  
Demonstration   0.000     632
                1.000     130
                2.000      27
                NaN         5
                3.000       2
Employee's car  1.000     765
                0.000     127
                2.000      99
                3.000      17
                NaN         3
New             NaN      1547
                0.000     100
                1.000       3
Pre-registered  0.000    1187
                1.000     161
                2.000       9
                NaN         6
                3.000       1
Used            3.000    3653
                1.000    3463
                2.000    3138
                0.000     807
                NaN        35
Name: age, dtype: int64

In [8]:
df["age"].value_counts(dropna=False)

1.000    4522
3.000    3674
2.000    3273
0.000    2853
NaN      1597
Name: age, dtype: int64

In [9]:
# A car listed with age 0 but more than 5000 km on the clock is treated as one year old.
df.loc[(df["age"] == 0.0) & (df["km"] > 5000), "age"] = 1

### I assumed that if "km" is more than 5000, the age cannot be 0

**Note: for cars of Type "Used", an age of either 0 or 1 is plausible here.**

In [10]:
# Temporarily mark unknown ages with "-" so they form their own group in the groupby below.
# Assign the result back: a chained in-place fillna does not update df under pandas Copy-on-Write.
df['age'] = df['age'].fillna('-')

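**By default, groupby() excludes rows whose grouping key is "NaN", so describe() would show nothing for the cars with a missing age. If the column you group by contains "NaN", you should temporarily replace those values with something else. In our case, we have replaced the "NaN" values with "-" so that the missing-age group appears in the "describe" output. (On pandas 1.1 and later, `groupby("age", dropna=False)` gives the same result without a placeholder.)**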

In [11]:
df.groupby("age").km.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,2337.0,792.502,1460.188,1.0,10.0,18.0,850.0,5000.0
1.0,4853.0,17445.066,11234.899,1.0,9000.0,16680.0,24772.0,136000.0
2.0,3272.0,41754.941,28295.748,1.0,21541.75,34752.0,54805.5,317000.0
3.0,3674.0,77442.521,39170.143,10.0,48000.0,72914.5,99950.0,291800.0
-,759.0,934.497,7416.244,0.0,5.0,10.0,10.0,89982.0


In [12]:
# Turn the "-" placeholder back into real missing values and restore the float dtype.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['age'] = df['age'].replace('-', np.nan).astype(float)

In [13]:
# Cars of Type "New" with an unknown age are given age 0.
new_cars = df["Type"] == "New"
df.loc[new_cars, "age"] = df.loc[new_cars, "age"].fillna(0)

In [14]:
df.age.value_counts(dropna=False)

1.000    4891
0.000    4031
3.000    3674
2.000    3273
NaN        50
Name: age, dtype: int64

In [15]:
# Mark the remaining unknown ages with "-" again so the km-band rules below can target them.
# Assign the result back: a chained in-place fillna does not update df under pandas Copy-on-Write.
df['age'] = df['age'].fillna('-')

In [16]:
df.groupby("age").km.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,3050.0,610.19,1320.133,0.0,10.0,10.0,150.0,5000.0
1.0,4853.0,17445.066,11234.899,1.0,9000.0,16680.0,24772.0,136000.0
2.0,3272.0,41754.941,28295.748,1.0,21541.75,34752.0,54805.5,317000.0
3.0,3674.0,77442.521,39170.143,10.0,48000.0,72914.5,99950.0,291800.0
-,46.0,15223.457,26537.973,1.0,10.0,640.5,20451.0,89982.0


In [17]:
# Mileage bands used by the next cell to impute the remaining unknown ages.
# Rows whose km is NaN satisfy none of the four masks, because comparisons with NaN are False.
cond1=(df["km"]<10000)
cond2=((df["km"]>=10000) &(df["km"]<30000))
cond3=((df["km"]>=30000) &(df["km"]<50000))
cond4=(df["km"]>=50000)

In [18]:
# Replace the "-" placeholder with the age implied by each mileage band (0, 1, 2 or 3 years).
for km_band, imputed_age in ((cond1, 0), (cond2, 1), (cond3, 2), (cond4, 3)):
    df.loc[km_band, 'age'] = df.loc[km_band, 'age'].replace('-', imputed_age)

In [19]:
df.age.value_counts(dropna=False)

1.0    4897
0.0    4062
3.0    3679
2.0    3277
-         4
Name: age, dtype: int64

In [20]:
df.groupby("age").km.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,3081.0,614.251,1326.434,0.0,10.0,10.0,150.0,6100.0
1.0,4859.0,17445.694,11229.33,1.0,9000.0,16692.0,24763.5,136000.0
2.0,3276.0,41748.577,28279.313,1.0,21572.5,34752.0,54755.0,317000.0
3.0,3679.0,77450.063,39145.119,10.0,48000.0,72945.0,99950.0,291800.0
-,0.0,,,,,,,


In [21]:
# The few rows still marked "-" (no usable km either) get the most frequent real age.
# The mode is taken over the real ages only, so the placeholder can never be picked,
# and the column is cast back to float because the placeholder had made it object dtype.
observed_age = df.loc[df["age"] != "-", "age"]
df["age"] = df["age"].replace("-", observed_age.mode()[0]).astype(float)

In [22]:
df.age.value_counts(dropna=False)

1.000    4901
0.000    4062
3.000    3679
2.000    3277
Name: age, dtype: int64

### 2) VAT

In [23]:
df.vat.value_counts(dropna=False)

VAT deductible      10980
NaN                  4513
Price negotiable      426
Name: vat, dtype: int64

In [24]:
df['vat'] = df['vat'].fillna('VAT not deductible')

In [25]:
df["vat"].value_counts(dropna=False)

VAT deductible        10980
VAT not deductible     4513
Price negotiable        426
Name: vat, dtype: int64

 ### 3) Type

In [26]:
df['Type'].value_counts(dropna=False)

Used              11096
New                1650
Pre-registered     1364
Employee's car     1011
Demonstration       796
NaN                   2
Name: Type, dtype: int64

In [27]:
df['Type'] = df['Type'].fillna('Used')

### 4) Km

In [28]:
df["km"].value_counts(dropna=False)

10.000       1045
NaN          1024
1.000         367
5.000         170
50.000        148
             ... 
67469.000       1
43197.000       1
10027.000       1
35882.000       1
57.000          1
Name: km, Length: 6690, dtype: int64

In [29]:
# Impute missing mileage with the mean km of cars that share the same age and Type.
# Assign the result back: a chained in-place fillna does not update df under pandas Copy-on-Write.
df["km"] = df["km"].fillna(df.groupby(["age", "Type"])["km"].transform("mean"))

In [30]:
df["km"].isnull().sum()

0

In [31]:
df["km"].value_counts(dropna=False)

10.000       1045
16.987        848
1.000         367
5.000         170
50.000        148
             ... 
53433.000       1
67469.000       1
43197.000       1
10027.000       1
57.000          1
Name: km, Length: 6699, dtype: int64

In [32]:
#df.km=(round((df["km"]/1000),0)*1000)

### 5) Body Type

In [33]:
# "Other" carries no information, so treat it as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0; the result is assigned
# back because a chained in-place replace does not update df under pandas Copy-on-Write.
df["body_type"] = df["body_type"].replace("Other", np.nan)

In [34]:
df.body_type.value_counts(dropna=False)

Sedans           7903
Station wagon    3553
Compact          3153
Van               783
NaN               350
Transporter        88
Off-Road           56
Coupe              25
Convertible         8
Name: body_type, dtype: int64

**We do NOT have any information about "other". So we have assigned them to "NaN" to use "fillna" function**

In [35]:
df.columns

Index(['make_model', 'body_type', 'price', 'vat', 'km', 'hp', 'Type',
       'Previous Owners', 'Next Inspection', 'Inspection new', 'Warranty',
       'Paint Type', 'Nr. of Doors', 'Nr. of Seats', 'Gearing Type',
       'Displacement', 'Cylinders', 'Weight', 'Drive chain', 'Fuel',
       'CO2 Emission', 'Emission Class', 'Comfort & Convenience',
       'Entertainment & Media', 'Extras', 'Safety & Security', 'Gears', 'age',
       'Upholstery_type', 'Upholstery_color', 'Consumption_combined',
       'Consumption_city', 'Consumption_country'],
      dtype='object')

In [36]:
# Fill missing body types with the most frequent body type of the same make_model.
# A make_model with no observed body type yields NaN instead of raising on an empty mode().
df["body_type"] = df["body_type"].fillna(
    df.groupby("make_model")["body_type"].transform(
        lambda s: s.mode().iloc[0] if s.notna().any() else np.nan
    )
)

In [37]:
df.body_type.value_counts(dropna=False)

Sedans           8005
Station wagon    3678
Compact          3242
Van               817
Transporter        88
Off-Road           56
Coupe              25
Convertible         8
Name: body_type, dtype: int64

### 6) Previous Owners

In [38]:
df.groupby(["age","km","Type"])["Previous Owners"].value_counts(dropna=False).head(500)

age    km        Type            Previous Owners
0.000  0.000     New             0.000               16
                                 NaN                  3
       1.000     Employee's car  NaN                  2
                 New             NaN                142
                                 0.000               13
                 Pre-registered  1.000               83
                                 NaN                 76
                 Used            NaN                 13
                                 1.000               11
       2.000     New             NaN                  2
                 Pre-registered  NaN                  2
                                 1.000                2
       3.000     Demonstration   NaN                  3
                                 1.000                1
                 New             NaN                  4
                 Pre-registered  1.000               24
                                 NaN                  1

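**I think that if the km is very low, the car is effectively new: a few kilometres can be accumulated by test drives alone. (The prose originally mentioned 5 km; the cells below use a threshold of 15 km.)**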

In [39]:
# New cars with fewer than 15 km have had no previous owner.
df.loc[(df["Type"] == "New") & (df["km"] < 15), "Previous Owners"] = 0

In [40]:
df["Previous Owners"].value_counts(dropna=False)

1.000    8100
NaN      6242
0.000     792
2.000     766
3.000      17
4.000       2
Name: Previous Owners, dtype: int64

In [41]:
# Any car with fewer than 15 km is assigned 0 previous owners, whatever its Type.
# NOTE(review): this also overwrites values that were NOT missing — the value counts before and
# after this cell show "1.000" dropping from 8100 to 7466. If only gaps should be filled, add
# `& df["Previous Owners"].isnull()` to the mask, as the following cell does. Confirm the intent.
df.loc[df[df["km"]<15].index,"Previous Owners"]=0.0

In [42]:
df["Previous Owners"].value_counts(dropna=False)

1.000    7466
NaN      5711
0.000    1957
2.000     766
3.000      17
4.000       2
Name: Previous Owners, dtype: int64

In [43]:
# Cars with 15 <= km < 150000 and an unknown owner count are assumed to have had one owner.
needs_one_owner = df["km"].ge(15) & df["km"].lt(150000) & df["Previous Owners"].isnull()
df.loc[needs_one_owner, "Previous Owners"] = 1.0

In [44]:
df["Previous Owners"].value_counts(dropna=False)

1.000    13137
0.000     1957
2.000      766
NaN         40
3.000       17
4.000        2
Name: Previous Owners, dtype: int64

In [45]:
# Fill the remaining gaps with the most frequent owner count among cars of the same age.
# The result is assigned back (a chained in-place fillna does not update df under pandas
# Copy-on-Write), and an age group with no observed value yields NaN instead of raising.
df["Previous Owners"] = df["Previous Owners"].fillna(
    df.groupby("age")["Previous Owners"].transform(
        lambda s: s.mode().iloc[0] if s.notna().any() else np.nan
    )
)

In [46]:
df["Previous Owners"].value_counts(dropna=False)

1.000    13177
0.000     1957
2.000      766
3.000       17
4.000        2
Name: Previous Owners, dtype: int64

### 7) Warranty

In [47]:
# Mark missing warranties with "-" so they appear as their own group in the summary below.
# Assign the result back: a chained in-place fillna does not update df under pandas Copy-on-Write.
df["Warranty"] = df["Warranty"].fillna("-")

In [48]:
df.groupby(['make_model', 'age', 'Warranty']).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,age,Warranty,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,0.0,12.0,33.0,23968.515,2803.633,20881.0,21000.0,23900.0,25800.0,28990.0
Audi A1,0.0,18.0,1.0,19995.0,,19995.0,19995.0,19995.0,19995.0,19995.0
Audi A1,0.0,24.0,99.0,23860.667,3347.693,14900.0,21890.0,22850.0,26890.0,32000.0
Audi A1,0.0,36.0,22.0,25204.818,3186.67,19900.0,22339.25,25400.0,28215.0,29179.0
Audi A1,0.0,48.0,17.0,24708.824,2316.263,19650.0,22800.0,25500.0,26200.0,28500.0
Audi A1,0.0,56.0,1.0,21760.0,,21760.0,21760.0,21760.0,21760.0,21760.0
Audi A1,0.0,60.0,11.0,21695.455,557.519,20990.0,21390.0,21490.0,21995.0,22990.0
Audi A1,0.0,-,541.0,23833.449,3422.677,15980.0,21406.0,22900.0,26990.0,35900.0
Audi A1,1.0,6.0,9.0,22166.667,743.303,20900.0,22400.0,22400.0,22400.0,22900.0
Audi A1,1.0,12.0,69.0,18463.913,2847.804,14220.0,16499.0,16980.0,20479.0,28960.0


**There are** **``too many nan values``** **and when we analyzed these nan values according to the price, age and make_model columns, we decided that this column does not have healthy data.**

In [49]:
df.drop("Warranty", axis=1, inplace=True)

### 8) Vat

In [50]:
# The NaNs in "vat" were already replaced with "VAT not deductible" in the earlier VAT section,
# so this only acts as a safeguard before grouping. The result is assigned back because a
# chained in-place fillna does not update df under pandas Copy-on-Write.
df["vat"] = df["vat"].fillna("-")

In [51]:
df.groupby(["make_model", "body_type", "vat"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,vat,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,Price negotiable,3.0,17631.667,1548.954,15950.0,16947.5,17945.0,18472.5,19000.0
Audi A1,Compact,VAT deductible,779.0,20018.999,4603.192,9950.0,16370.0,19990.0,22730.0,31990.0
Audi A1,Compact,VAT not deductible,257.0,16692.856,3377.345,11100.0,14990.0,15850.0,17900.0,29181.0
Audi A1,Coupe,VAT deductible,2.0,14925.0,1378.858,13950.0,14437.5,14925.0,15412.5,15900.0
Audi A1,Sedans,Price negotiable,78.0,16224.308,3545.634,10800.0,13912.5,15299.5,18112.5,33900.0
Audi A1,Sedans,VAT deductible,975.0,19370.279,4470.516,10000.0,15950.0,18900.0,22315.0,35900.0
Audi A1,Sedans,VAT not deductible,498.0,17650.865,4225.473,8999.0,14819.0,16495.0,20300.0,37900.0
Audi A1,Station wagon,VAT deductible,17.0,16747.706,2481.1,12950.0,15750.0,16290.0,16880.0,21450.0
Audi A1,Station wagon,VAT not deductible,4.0,22332.0,7763.736,13999.0,16579.75,23165.0,28917.25,28999.0
Audi A1,Van,VAT deductible,1.0,29000.0,,29000.0,29000.0,29000.0,29000.0,29000.0


**There does not appear to be a significant relationship between the vat and price columns. We can either drop this column or fill its missing values with the ffill/bfill method to maintain the current proportions.**

**I prefer to drop this column**

**Be informed that you can create different approaches to deal with missing values at** **``"vat"``** **column.**

In [52]:
df.drop("vat", axis=1, inplace=True)

### 9) Type

In [53]:
df['Type'] = df['Type'].fillna('Used')

### 10) Next Inspection 

In [54]:
df.drop(["Next Inspection"],axis=1,inplace=True)

### 11) Inspection New

In [55]:
df['Inspection new'] = df['Inspection new'].fillna('No')

### 12)  Paint Type

**According to the domain knowledge we know that; The colors do not affect the prices of cars directly, but their "paint types", such as whether they are metallic or not, can affect the price.**

In [56]:
# Mark missing paint types with "-" so they appear as their own group in the summary below.
# Assign the result back: a chained in-place fillna does not update df under pandas Copy-on-Write.
df["Paint Type"] = df["Paint Type"].fillna("-")

In [57]:
df.groupby(["make_model", "body_type", "age", 'Paint Type']).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,age,Paint Type,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Audi A1,Compact,0.0,-,137.0,22007.73,3249.081,14900.0,19850.0,21490.0,23032.0,29181.0
Audi A1,Compact,0.0,Metallic,200.0,24680.875,3154.727,17880.0,22018.5,23970.0,27780.0,31990.0
Audi A1,Compact,0.0,Uni/basic,2.0,19888.0,0.0,19888.0,19888.0,19888.0,19888.0,19888.0
Audi A1,Compact,1.0,-,64.0,18312.562,3335.138,14220.0,15907.5,16925.0,20720.0,28990.0
Audi A1,Compact,1.0,Metallic,241.0,19506.191,3258.856,13980.0,16559.0,18910.0,21950.0,28960.0
Audi A1,Compact,2.0,-,52.0,15750.115,1218.358,12490.0,15810.0,15850.0,15850.0,21490.0
Audi A1,Compact,2.0,Metallic,108.0,17001.352,2294.777,10999.0,15450.0,15954.5,18957.5,22150.0
Audi A1,Compact,2.0,Uni/basic,1.0,17900.0,,17900.0,17900.0,17900.0,17900.0,17900.0
Audi A1,Compact,3.0,-,58.0,14007.948,1921.949,10490.0,12942.5,13820.0,15423.75,18400.0
Audi A1,Compact,3.0,Metallic,174.0,14723.218,1881.559,9950.0,13665.0,14360.0,15732.5,18900.0


In [58]:
df.groupby(["make_model", "body_type", "age"])['Paint Type'].transform(lambda x: x.mode()[0])

0        Metallic
1        Metallic
2        Metallic
3        Metallic
4        Metallic
           ...   
15914    Metallic
15915    Metallic
15916    Metallic
15917    Metallic
15918    Metallic
Name: Paint Type, Length: 15919, dtype: object

**There does not appear to be a significant relationship between the Paint_Type and price columns. We can either drop this column or fill its missing values with the ffill/bfill method to maintain the current proportions.**

**I prefer to fill missing values** 

In [59]:
# Turn the "-" placeholder back into real missing values before filling.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0; the result is assigned
# back because a chained in-place replace does not update df under pandas Copy-on-Write.
df["Paint Type"] = df["Paint Type"].replace("-", np.nan)

In [60]:
# Fill missing paint types by propagating neighbouring values (forward, then backward):
# first within each (make_model, body_type) group, then within each make_model,
# and finally across the whole column for anything still missing.
# groupby/transform avoids rebinding the cond1/cond2 masks defined for the age imputation,
# and .ffill()/.bfill() replace the deprecated fillna(method=...) form.
for group_keys in (["make_model", "body_type"], ["make_model"]):
    df["Paint Type"] = df["Paint Type"].fillna(
        df.groupby(group_keys)["Paint Type"].transform(lambda s: s.ffill().bfill())
    )

df["Paint Type"] = df["Paint Type"].ffill().bfill()

In [61]:
df["Paint Type"].value_counts(dropna=False)

Metallic       15250
Uni/basic        637
Perl effect       32
Name: Paint Type, dtype: int64

### 13) Upholstery_type

In [62]:
# Mark missing upholstery types with "-" so they appear as their own group in the summary below.
# fillna avoids the np.NaN alias (removed in NumPy 2.0); the result is assigned back because a
# chained in-place call does not update df under pandas Copy-on-Write.
df["Upholstery_type"] = df["Upholstery_type"].fillna("-")

In [63]:
df.groupby(["make_model", "body_type", "age", "Upholstery_type"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,age,Upholstery_type,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Audi A1,Compact,0.0,-,36.0,21087.278,3159.173,16220.0,19078.0,20617.5,22900.0,29181.0
Audi A1,Compact,0.0,Cloth,288.0,23909.646,3306.663,14900.0,21390.0,22757.5,27380.0,29197.0
Audi A1,Compact,0.0,Part leather,3.0,28826.667,231.805,28560.0,28750.0,28940.0,28960.0,28980.0
Audi A1,Compact,0.0,Velour,12.0,21617.5,3732.406,17330.0,19600.0,21135.0,22490.0,31990.0
Audi A1,Compact,1.0,-,60.0,18654.833,2907.281,14500.0,16462.5,16900.0,21447.5,24450.0
Audi A1,Compact,1.0,Cloth,214.0,18917.051,3101.799,13980.0,16445.0,16980.0,21475.0,28960.0
Audi A1,Compact,1.0,Full leather,4.0,21500.0,1054.356,20400.0,20775.0,21425.0,22150.0,22750.0
Audi A1,Compact,1.0,Part leather,24.0,23279.083,2937.867,20685.0,21356.25,22170.0,23015.0,28990.0
Audi A1,Compact,1.0,Velour,2.0,15899.5,494.268,15550.0,15724.75,15899.5,16074.25,16249.0
Audi A1,Compact,1.0,alcantara,1.0,28960.0,,28960.0,28960.0,28960.0,28960.0,28960.0


**I have decided to combine upholstery_types to reduce the number of categories.**

In [64]:
# Collapse the upholstery categories into two families: cloth-like and leather-like.
# A mapping keeps each old label next to its new one; the result is assigned back because a
# chained in-place replace does not update df under pandas Copy-on-Write.
df["Upholstery_type"] = df["Upholstery_type"].replace({
    "Velour": "Cloth",
    "alcantara": "Part/Full Leather",
    "Part leather": "Part/Full Leather",
    "Full leather": "Part/Full Leather",
})

In [65]:
df["Upholstery_type"].value_counts(dropna=False)

Cloth                8483
-                    4871
Part/Full Leather    2565
Name: Upholstery_type, dtype: int64

In [66]:
df.groupby(["make_model", "body_type", "age", "Upholstery_type"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,age,Upholstery_type,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Audi A1,Compact,0.0,-,36.0,21087.278,3159.173,16220.0,19078.0,20617.5,22900.0,29181.0
Audi A1,Compact,0.0,Cloth,300.0,23817.96,3348.153,14900.0,21267.5,22720.0,27175.0,31990.0
Audi A1,Compact,0.0,Part/Full Leather,3.0,28826.667,231.805,28560.0,28750.0,28940.0,28960.0,28980.0
Audi A1,Compact,1.0,-,60.0,18654.833,2907.281,14500.0,16462.5,16900.0,21447.5,24450.0
Audi A1,Compact,1.0,Cloth,216.0,18889.111,3101.083,13980.0,16445.0,16980.0,21465.0,28960.0
Audi A1,Compact,1.0,Part/Full Leather,29.0,23229.586,2968.355,20400.0,21285.0,21950.0,22990.0,28990.0
Audi A1,Compact,2.0,-,34.0,15516.471,1658.151,10999.0,15000.0,15450.0,15900.0,20490.0
Audi A1,Compact,2.0,Cloth,119.0,16724.513,1932.321,13890.0,15679.5,15850.0,17632.5,22150.0
Audi A1,Compact,2.0,Part/Full Leather,8.0,19409.375,2917.487,12900.0,18650.0,20900.0,21061.25,21490.0
Audi A1,Compact,3.0,-,26.0,13831.577,1789.388,10490.0,13237.5,13840.0,13990.0,17900.0


**I haven't detected significant relationship to fill missing values.**

**I prefer to fill missing values with ffill/bfill method to maintain the current proportionality.**

In [67]:
# Turn the "-" placeholder back into real missing values before filling.
# Assign the result back: a chained in-place replace does not update df under pandas Copy-on-Write.
df["Upholstery_type"] = df["Upholstery_type"].replace("-", np.nan)

In [68]:
def fill(df, group_col1, group_col2, col_name, method): # method can be either "mode" or "mean" or "median" or "ffill"

    '''Fill the missing values of ``col_name`` in place, using a three-stage fallback.

    Stage 1 fills within each (group_col1, group_col2) group, stage 2 fills what is
    still missing within each group_col1 group, and stage 3 falls back to the whole
    column. Each stage's statistic is computed after the previous stage has been
    applied, so the result does not depend on the order in which groups appear.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[col_name]`` is overwritten with the filled column.
    group_col1, group_col2 : str
        Grouping columns (coarse key first, finer key second).
    col_name : str
        Column whose missing values are filled.
    method : {"mode", "mean", "median", "ffill"}
        "mode", "mean" and "median" fill with that statistic of the group;
        "ffill" propagates the previous value and then the next one (ffill + bfill).

    Returns
    -------
    None
        Prints the remaining NaN count and the value counts of the column.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously a typo was
        silently ignored and nothing was filled).

    Notes
    -----
    Rows whose grouping key is NaN belong to no group; they are handled by the
    later, coarser stages. A column with no observed value at all is left as is.
    '''

    def _first_mode(values):
        # Most frequent observed value, or NaN when the group has no observed value.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    def _ffill_bfill(values):
        # Forward-fill, then back-fill whatever is still missing at the start.
        return values.ffill().bfill()

    if method in ("mean", "median"):
        reducer = method
    elif method == "mode":
        reducer = _first_mode
    elif method == "ffill":
        reducer = _ffill_bfill
    else:
        raise ValueError(
            'method must be one of "mode", "mean", "median" or "ffill", got {!r}'.format(method)
        )

    # Stages 1 and 2: finest grouping first, then the coarse grouping.
    # dict.fromkeys drops a duplicated key if both grouping columns are the same.
    for keys in (list(dict.fromkeys([group_col1, group_col2])), [group_col1]):
        if not df[col_name].isnull().any():
            break
        group_fill = df.groupby(keys)[col_name].transform(reducer)
        # Assign back rather than fillna(inplace=True): the chained in-place form
        # does not update the frame under pandas Copy-on-Write.
        df[col_name] = df[col_name].fillna(group_fill)

    # Stage 3: whole-column fallback for anything the groups could not resolve.
    if df[col_name].isnull().any():
        if method == "ffill":
            df[col_name] = _ffill_bfill(df[col_name])
        else:
            if method == "mode":
                overall = _first_mode(df[col_name])
            else:
                overall = getattr(df[col_name], method)()
            if pd.notna(overall):
                df[col_name] = df[col_name].fillna(overall)

    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [69]:
fill(df, "make_model", "body_type", "Upholstery_type", "ffill")

Number of NaN :  0
------------------
Cloth                12238
Part/Full Leather     3681
Name: Upholstery_type, dtype: int64


In [70]:
df.drop("Upholstery_color", axis=1, inplace=True)

### 14) Number of Doors

In [71]:
def fill_mode(df, group_col1, group_col2, col_name):
    """Fill missing values of ``col_name`` in place with a group-wise mode.

    For every (``group_col1``, ``group_col2``) combination that still has
    missing values, the gaps are filled with the first mode found among, in
    order: that combination, the whole ``group_col1`` group, the whole column.
    When the column has no observed value at all, the gaps are left as NaN
    instead of raising ``IndexError``.

    Rows whose group key is NaN never match a group (``==`` against NaN is
    always False) and are therefore left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    group_col1, group_col2 : str
        Outer and inner grouping columns.
    col_name : str
        Column whose missing values are filled.

    Returns
    -------
    None
        Prints the remaining NaN count and the value counts of ``col_name``.
    """
    for group1 in df[group_col1].unique():
        cond1 = df[group_col1] == group1  # does not change inside the inner loop
        for group2 in df[group_col2].unique():
            cond2 = cond1 & (df[group_col2] == group2)
            # Nothing to fill for this combination: skip the mode scans entirely.
            if not df.loc[cond2, col_name].isnull().any():
                continue
            # Narrowest scope first; None stands for the whole column.
            for scope in (cond2, cond1, None):
                values = df[col_name] if scope is None else df.loc[scope, col_name]
                modes = values.mode()
                if not modes.empty:
                    df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(modes.iloc[0])
                    break

    print("Number of NaN : ", df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [72]:
fill_mode(df, "make_model", "body_type", "Nr. of Doors")

Number of NaN :  0
------------------
5.000    11787
4.000     3079
3.000      832
2.000      219
1.000        1
7.000        1
Name: Nr. of Doors, dtype: int64


### 15) Number of Seats

In [73]:
df.columns

Index(['make_model', 'body_type', 'price', 'km', 'hp', 'Type',
       'Previous Owners', 'Inspection new', 'Paint Type', 'Nr. of Doors',
       'Nr. of Seats', 'Gearing Type', 'Displacement', 'Cylinders', 'Weight',
       'Drive chain', 'Fuel', 'CO2 Emission', 'Emission Class',
       'Comfort & Convenience', 'Entertainment & Media', 'Extras',
       'Safety & Security', 'Gears', 'age', 'Upholstery_type',
       'Consumption_combined', 'Consumption_city', 'Consumption_country'],
      dtype='object')

**The number of seats varies by make_model and body_type, so I decided to fill the missing values with the mode of the related group.**

In [74]:
fill(df, "make_model", "body_type", "Nr. of Seats", "mode")

Number of NaN :  0
------------------
5.000    14308
4.000     1127
7.000      362
2.000      119
6.000        2
3.000        1
Name: Nr. of Seats, dtype: int64


### 16) Cylinders

In [75]:
# Temporary "-" placeholder so missing values show up as their own group in the summary below.
# Assign the result back: fillna(inplace=True) on a selected column is a silent no-op under
# pandas Copy-on-Write because it only modifies a temporary object.
df["Cylinders"] = df["Cylinders"].fillna("-")

In [76]:
df.groupby(["make_model", "body_type", "age", "Cylinders"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,age,Cylinders,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Audi A1,Compact,0.0,3.0,164.0,23833.323,3529.433,14900.0,21435.0,22875.0,27380.0,28980.0
Audi A1,Compact,0.0,4.0,18.0,25435.056,3358.39,20582.0,22453.0,25578.0,28785.0,29181.0
Audi A1,Compact,0.0,-,157.0,23086.076,3292.892,17780.0,20388.0,22290.0,25990.0,31990.0
Audi A1,Compact,1.0,3.0,152.0,19277.454,3230.954,13980.0,16585.25,18660.0,21949.25,28960.0
Audi A1,Compact,1.0,4.0,35.0,21460.8,2456.567,16100.0,20584.5,21780.0,22890.0,28880.0
Audi A1,Compact,1.0,-,118.0,18573.686,3352.531,14500.0,16445.0,16930.0,20448.5,28990.0
Audi A1,Compact,2.0,3.0,60.0,16555.55,1858.121,13990.0,15372.5,15900.0,17100.0,21400.0
Audi A1,Compact,2.0,4.0,31.0,17756.548,2534.626,13500.0,15565.0,17930.0,20223.5,21490.0
Audi A1,Compact,2.0,-,70.0,16132.371,1874.65,10999.0,15482.5,15850.0,15956.25,22150.0
Audi A1,Compact,3.0,3.0,75.0,14363.52,1942.498,9950.0,13365.0,13900.0,15545.0,18900.0


**Other features such as hp, weight and displacement have fewer missing values and give similar insight, so I decided to drop this column.**

In [77]:
df.drop("Cylinders", axis=1, inplace=True)

### 17) Drive chain

In [78]:
# Temporary "-" placeholder so missing values show up as their own group in the summary below.
# Assign the result back: fillna(inplace=True) on a selected column is a silent no-op under
# pandas Copy-on-Write because it only modifies a temporary object.
df["Drive chain"] = df["Drive chain"].fillna("-")

In [79]:
df.groupby(["make_model", "body_type", "Drive chain"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,Drive chain,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,-,352.0,17620.869,4226.116,10490.0,14990.0,15900.0,20885.75,29190.0
Audi A1,Compact,4WD,2.0,14790.0,1258.65,13900.0,14345.0,14790.0,15235.0,15680.0
Audi A1,Compact,front,685.0,20008.223,4511.348,9950.0,16430.0,19890.0,22690.0,31990.0
Audi A1,Coupe,-,2.0,14925.0,1378.858,13950.0,14437.5,14925.0,15412.5,15900.0
Audi A1,Sedans,-,561.0,17830.44,4362.321,8999.0,14900.0,16490.0,20700.0,37900.0
Audi A1,Sedans,4WD,1.0,15450.0,,15450.0,15450.0,15450.0,15450.0,15450.0
Audi A1,Sedans,front,989.0,19133.794,4441.969,10000.0,15838.0,18500.0,21999.0,32000.0
Audi A1,Station wagon,-,3.0,24593.0,7537.216,15890.0,22390.0,28890.0,28944.5,28999.0
Audi A1,Station wagon,front,18.0,16681.111,2493.673,12950.0,15000.0,16356.0,17300.0,21450.0
Audi A1,Van,front,1.0,29000.0,,29000.0,29000.0,29000.0,29000.0,29000.0


In [80]:
cond = (df['make_model'] == "Renault Duster") & (df["body_type"] == "Off-Road")

In [81]:
df.loc[cond, 'Drive chain'] = df.loc[cond, 'Drive chain'].replace('-', '4WD')

In [82]:
df["Drive chain"].value_counts(dropna=False)

front    8886
-        6826
4WD       203
rear        4
Name: Drive chain, dtype: int64

In [83]:
df["Drive chain"] = df["Drive chain"].replace('-', np.nan)

In [84]:
df["Drive chain"].value_counts(dropna=False)

front    8886
NaN      6826
4WD       203
rear        4
Name: Drive chain, dtype: int64

In [85]:
fill(df, "make_model", "body_type", "Drive chain", "mode")

Number of NaN :  0
------------------
front    15711
4WD        204
rear         4
Name: Drive chain, dtype: int64


### 18) Emission Class

In [86]:
df.drop("Emission Class", axis=1, inplace=True)

In [87]:
df.columns

Index(['make_model', 'body_type', 'price', 'km', 'hp', 'Type',
       'Previous Owners', 'Inspection new', 'Paint Type', 'Nr. of Doors',
       'Nr. of Seats', 'Gearing Type', 'Displacement', 'Weight', 'Drive chain',
       'Fuel', 'CO2 Emission', 'Comfort & Convenience',
       'Entertainment & Media', 'Extras', 'Safety & Security', 'Gears', 'age',
       'Upholstery_type', 'Consumption_combined', 'Consumption_city',
       'Consumption_country'],
      dtype='object')

### 19) Gears

In [88]:
# Temporary "-" placeholder so missing values show up as their own group in the summary below.
# Assign the result back: fillna(inplace=True) on a selected column is a silent no-op under
# pandas Copy-on-Write because it only modifies a temporary object.
df["Gears"] = df["Gears"].fillna("-")

In [89]:
df.groupby(["make_model", "body_type", "Gearing Type", "Gears"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,Gearing Type,Gears,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Audi A1,Compact,Automatic,5.0,3.0,22184.333,3421.302,18497.0,20648.5,22800.0,24028.0,25256.0
Audi A1,Compact,Automatic,6.0,6.0,21038.333,4282.039,16430.0,18725.0,20920.0,21060.0,28860.0
Audi A1,Compact,Automatic,7.0,199.0,22059.251,3918.116,13990.0,18970.0,21790.0,24365.0,29181.0
Audi A1,Compact,Automatic,8.0,1.0,16880.0,,16880.0,16880.0,16880.0,16880.0,16880.0
Audi A1,Compact,Automatic,-,253.0,21640.427,4965.5,13880.0,16975.0,20950.0,26980.0,29197.0
Audi A1,Compact,Manual,5.0,277.0,16329.469,3040.933,9950.0,13990.0,15900.0,16940.0,22990.0
Audi A1,Compact,Manual,6.0,77.0,20538.299,2061.004,12550.0,19588.0,20881.0,21990.0,22989.0
Audi A1,Compact,Manual,-,220.0,16756.727,3182.991,10490.0,14430.0,15880.0,19032.5,22990.0
Audi A1,Compact,Semi-automatic,7.0,3.0,24028.333,7208.44,17945.0,20047.5,22150.0,27070.0,31990.0
Audi A1,Coupe,Manual,5.0,1.0,13950.0,,13950.0,13950.0,13950.0,13950.0,13950.0


**The number of gears mostly varies by make_model, body_type and gearing type, so I decided to fill the missing values with the mode of the related group.**

In [90]:
# Turn the "-" placeholder back into NaN before imputing.
# Assign the result back: replace(inplace=True) on a selected column is a silent no-op under
# pandas Copy-on-Write because it only modifies a temporary object.
df["Gears"] = df["Gears"].replace("-", np.nan)

In [91]:
# Fill missing "Gears" with the mode of the narrowest group that has one:
# (make_model, body_type, Gearing Type) -> (make_model, body_type) -> make_model -> whole column.
# If the column has no observed value at all, the gaps are left as NaN instead of raising IndexError.
for group1 in df["make_model"].unique():
    cond1 = df["make_model"] == group1  # does not change inside the inner loops
    for group2 in df["body_type"].unique():
        cond2 = cond1 & (df["body_type"] == group2)
        for group3 in df["Gearing Type"].unique():
            cond3 = cond2 & (df["Gearing Type"] == group3)
            # Nothing to fill for this combination: skip the mode scans entirely.
            if not df.loc[cond3, "Gears"].isnull().any():
                continue
            # Narrowest scope first; None stands for the whole column.
            for scope in (cond3, cond2, cond1, None):
                values = df["Gears"] if scope is None else df.loc[scope, "Gears"]
                modes = values.mode()
                if not modes.empty:
                    df.loc[cond3, "Gears"] = df.loc[cond3, "Gears"].fillna(modes.iloc[0])
                    break

In [92]:
df["Gears"].value_counts(dropna=False)

6.000    8615
5.000    4256
7.000    2810
8.000     225
9.000       6
1.000       2
3.000       2
4.000       2
2.000       1
Name: Gears, dtype: int64

### 20)  hp

In [93]:
# Temporary "-" placeholder so missing values show up as their own group in the summary below.
# Assign the result back: fillna(inplace=True) on a selected column is a silent no-op under
# pandas Copy-on-Write because it only modifies a temporary object.
df["hp"] = df["hp"].fillna("-")

In [94]:
df.groupby(["make_model", "body_type","hp"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,hp,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,60.0,58.0,15189.828,1663.611,10900.0,14390.0,15774.5,16345.0,16978.0
Audi A1,Compact,66.0,162.0,15398.21,1934.301,10490.0,14042.5,15465.0,16445.0,23700.0
Audi A1,Compact,70.0,332.0,17983.262,3315.814,9950.0,15480.0,17447.0,21190.0,28990.0
Audi A1,Compact,71.0,32.0,20831.594,2410.84,15890.0,18937.5,21425.0,22462.25,25256.0
Audi A1,Compact,85.0,330.0,22604.103,4572.568,11100.0,19700.0,22497.0,26980.0,31990.0
Audi A1,Compact,86.0,1.0,14295.0,,14295.0,14295.0,14295.0,14295.0,14295.0
Audi A1,Compact,92.0,90.0,18029.056,3027.431,12550.0,15850.0,16935.0,20624.25,28880.0
Audi A1,Compact,93.0,2.0,21447.5,774.282,20900.0,21173.75,21447.5,21721.25,21995.0
Audi A1,Compact,110.0,20.0,23620.45,3777.344,15490.0,21299.5,23325.0,26932.5,28980.0
Audi A1,Compact,141.0,2.0,22495.0,841.457,21900.0,22197.5,22495.0,22792.5,23090.0


**The hp of a car mostly varies by make_model and body_type, so I decided to fill the missing values with the mode of the related group.**

In [95]:
# Turn the "-" placeholder back into NaN before imputing.
# Assign the result back: replace(inplace=True) on a selected column is a silent no-op under
# pandas Copy-on-Write because it only modifies a temporary object.
df["hp"] = df["hp"].replace(["-"], np.nan)

In [96]:
fill(df, "make_model", "body_type", "hp", "mode")

Number of NaN :  0
------------------
85.000     2543
66.000     2124
81.000     1403
100.000    1314
110.000    1113
70.000      890
125.000     711
51.000      696
55.000      589
118.000     550
92.000      466
121.000     392
147.000     380
77.000      353
56.000      294
54.000      276
103.000     253
87.000      232
165.000     194
88.000      177
60.000      160
162.000      98
74.000       81
96.000       72
71.000       59
101.000      47
67.000       40
154.000      39
122.000      35
119.000      30
164.000      27
135.000      24
52.000       22
82.000       22
1.000        20
78.000       20
294.000      18
146.000      18
141.000      16
57.000       10
120.000       8
104.000       8
112.000       7
191.000       7
155.000       6
117.000       6
184.000       5
65.000        4
90.000        4
76.000        4
168.000       3
98.000        3
149.000       3
80.000        3
93.000        3
167.000       2
228.000       2
53.000        2
143.000       2
150.000       2
14

### 21) Displacement_cc - Weight_kg

In [97]:
fill(df, "make_model", "body_type", "Displacement", "mode")

Number of NaN :  0
------------------
1598.000     4871
999.000      2446
1398.000     1326
1399.000      753
1229.000      678
1956.000      670
1461.000      638
1490.000      559
1422.000      467
1197.000      355
898.000       351
1395.000      320
1968.000      301
1149.000      288
1600.000      258
1618.000      212
1798.000      210
1498.000      196
1400.000      136
1248.000      110
1997.000      103
1364.000      102
1500.000       96
998.000        72
2000.000       68
1000.000       56
1200.000       51
1300.000       38
1998.000       25
1.000          22
2480.000       20
1984.000       18
1397.000       11
899.000        11
900.000         9
160.000         6
1499.000        5
929.000         5
1596.000        4
997.000         4
1199.000        3
1599.000        3
139.000         3
1396.000        3
1589.000        2
995.000         2
1495.000        2
1580.000        1
890.000         1
1995.000        1
1333.000        1
1100.000        1
54.000          1
1533.000

In [98]:
fill(df, "make_model", "body_type", "Weight", "mode")

Number of NaN :  0
------------------
1163.000    1582
1360.000    1419
1487.000     966
1135.000     837
1425.000     744
1180.000     694
1273.000     656
1165.000     603
1503.000     561
1734.000     556
1087.000     291
1335.000     242
1365.000     211
1199.000     205
1350.000     156
1119.000     153
1355.000     136
1260.000     127
1280.000     127
1275.000     112
1278.000     110
1255.000     108
1200.000     107
1522.000     103
1659.000     102
1195.000      96
1120.000      93
1403.000      91
1701.000      87
1250.000      85
1685.000      83
1441.000      82
1308.000      80
1285.000      80
1613.000      75
1110.000      75
1279.000      72
1364.000      70
1345.000      67
1733.000      65
1325.000      64
1141.000      64
1209.000      64
1071.000      64
1230.000      63
1845.000      56
1090.000      54
1052.000      53
1154.000      52
1664.000      52
1513.000      51
1065.000      50
1237.000      49
1440.000      46
1088.000      46
1205.000      46
1265.000  

### 22) CO2 Emission

In [99]:
# Temporary "-" placeholder so missing values show up as their own group in the summary below.
# Assign the result back: fillna(inplace=True) on a selected column is a silent no-op under
# pandas Copy-on-Write because it only modifies a temporary object.
df["CO2 Emission"] = df["CO2 Emission"].fillna("-")

In [100]:
df.groupby(["make_model", "body_type", "Fuel", "CO2 Emission"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,Fuel,CO2 Emission,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Audi A1,Compact,Benzine,97.0,77.0,15115.636,1746.383,10900.0,13820.0,15444.0,16669.0,18775.0
Audi A1,Compact,Benzine,98.0,23.0,17077.565,3010.233,13999.0,15290.0,16100.0,16900.0,25256.0
Audi A1,Compact,Benzine,100.0,3.0,15903.333,1859.901,14220.0,14905.0,15590.0,16745.0,17900.0
Audi A1,Compact,Benzine,102.0,106.0,17401.34,3100.205,9950.0,15772.5,16950.0,19889.25,29150.0
Audi A1,Compact,Benzine,103.0,30.0,21233.033,1794.551,18350.0,19225.0,22189.5,22448.5,23550.0
Audi A1,Compact,Benzine,104.0,92.0,18070.522,3280.132,11445.0,15727.25,17175.0,21390.0,22730.0
Audi A1,Compact,Benzine,105.0,12.0,22462.333,620.017,20980.0,22490.0,22510.0,22936.75,22990.0
Audi A1,Compact,Benzine,106.0,26.0,22064.192,1048.442,18880.0,22010.0,22400.0,22687.25,22990.0
Audi A1,Compact,Benzine,107.0,7.0,16771.429,2243.126,14470.0,14920.0,15830.0,18685.0,19890.0
Audi A1,Compact,Benzine,108.0,39.0,23943.795,3148.983,17330.0,21160.0,23700.0,26582.5,29190.0


In [101]:
# Turn the "-" placeholder back into NaN before imputing.
# Uses np.nan (the np.NaN alias was removed in NumPy 2.0) and assigns the result back, because
# replace(inplace=True) on a selected column is a silent no-op under pandas Copy-on-Write.
df["CO2 Emission"] = df["CO2 Emission"].replace("-", np.nan)

In [102]:
# Fill missing CO2 Emission with the most frequent value among cars that share
# make_model and Displacement; groups with no observed value stay NaN.
df['CO2 Emission'] = df['CO2 Emission'].fillna(
    df.groupby(['make_model', 'Displacement'])['CO2 Emission']
      .transform(lambda grp: np.nan if grp.mode().empty else grp.mode()[0])
)

In [103]:
# Fill the remaining missing CO2 Emission values with the most frequent value of the
# car's body_type; groups with no observed value stay NaN.
df['CO2 Emission'] = df['CO2 Emission'].fillna(
    df.groupby(['body_type'])['CO2 Emission']
      .transform(lambda grp: np.nan if grp.mode().empty else grp.mode()[0])
)

In [104]:
# Last-resort fallback. The preceding cell already applied the body_type-level mode, so
# repeating that same fill cannot change anything. Use the overall most frequent value
# for any rows that are still missing; skip when the column has no observed value at all.
co2_overall_modes = df['CO2 Emission'].mode()
if not co2_overall_modes.empty:
    df['CO2 Emission'] = df['CO2 Emission'].fillna(co2_overall_modes.iloc[0])

In [105]:
df["CO2 Emission"].value_counts(dropna=False)

120.000    1116
97.000      836
99.000      753
114.000     594
119.000     581
102.000     532
104.000     501
85.000      464
107.000     458
103.000     445
124.000     426
128.000     414
106.000     377
117.000     362
108.000     362
127.000     302
126.000     288
110.000     286
150.000     282
113.000     272
118.000     270
140.000     245
111.000     237
139.000     236
109.000     234
129.000     215
141.000     209
105.000     205
135.000     204
92.000      198
112.000     186
130.000     180
123.000     178
143.000     171
145.000     167
134.000     165
95.000      161
116.000     157
136.000     152
98.000      151
133.000     145
153.000     139
137.000     133
125.000     132
149.000     117
147.000     113
101.000     105
132.000     100
115.000      91
122.000      83
121.000      82
138.000      75
93.000       66
168.000      58
142.000      58
100.000      56
187.000      53
148.000      48
131.000      48
144.000      40
154.000      40
146.000      37
94.000  

In [106]:
df.columns

Index(['make_model', 'body_type', 'price', 'km', 'hp', 'Type',
       'Previous Owners', 'Inspection new', 'Paint Type', 'Nr. of Doors',
       'Nr. of Seats', 'Gearing Type', 'Displacement', 'Weight', 'Drive chain',
       'Fuel', 'CO2 Emission', 'Comfort & Convenience',
       'Entertainment & Media', 'Extras', 'Safety & Security', 'Gears', 'age',
       'Upholstery_type', 'Consumption_combined', 'Consumption_city',
       'Consumption_country'],
      dtype='object')

### 23) Comfort_Convenience-Entertainment_Media-Extras-Safety_Security

In [107]:
fill(df, "make_model", "body_type", "Comfort & Convenience", "mode")

Number of NaN :  0
------------------
Air conditioning,Electrical side mirrors,Hill Holder,Power windows                                                                                                                                                                                                                                                                                                                                                                                                                                   388
Air conditioning,Armrest,Automatic climate control,Cruise control,Electrical side mirrors,Leather steering wheel,Light sensor,Lumbar support,Multi-function steering wheel,Navigation system,Park Distance Control,Parking assist system sensors front,Parking assist system sensors rear,Power windows,Rain sensor,Seat heating,Start-stop system                                                                                                                                       

In [108]:
fill(df, "make_model", "body_type", "Entertainment & Media", "mode")

Number of NaN :  0
------------------
Bluetooth,Hands-free equipment,On-board computer,Radio,USB                                                        1738
Bluetooth,Hands-free equipment,MP3,On-board computer,Radio,USB                                                    1134
Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,USB                                          1010
On-board computer                                                                                                  615
Radio                                                                                                              558
Bluetooth,Hands-free equipment,On-board computer,Radio                                                             515
On-board computer,Radio                                                                                            487
Bluetooth,CD player,Hands-free equipment,On-board computer,Radio,USB                                               466
Bluetooth,

In [109]:
fill(df, "make_model", "body_type", "Extras", "mode")

Number of NaN :  0
------------------
Alloy wheels                                                                                                                                   5786
Alloy wheels,Touch screen                                                                                                                       697
Roof rack                                                                                                                                       596
Alloy wheels,Voice Control                                                                                                                      582
Alloy wheels,Touch screen,Voice Control                                                                                                         544
Alloy wheels,Roof rack                                                                                                                          529
Alloy wheels,Sport seats                                                  

In [110]:
fill(df, "make_model", "body_type", "Safety & Security", "mode")

Number of NaN :  0
------------------
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Fog lights,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control                                                                                                                                      729
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control                                                                                                                                                 480
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Fog lights,Immobilizer,Isofix,LED Daytime Running Lights,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction contro

### 24) Consumption Combined

In [111]:
df['Consumption_combined'].value_counts(dropna=False)

NaN       2033
5.400      770
3.900      733
4.000      713
5.100      657
4.400      623
5.600      618
4.700      602
3.800      585
4.800      546
5.000      545
4.500      523
5.200      454
4.200      435
4.600      426
4.900      393
5.500      380
5.300      380
3.700      369
5.900      369
4.100      342
5.700      342
6.000      331
4.300      307
3.300      307
3.500      288
6.200      216
3.600      194
6.300      181
6.100      175
5.800      164
6.600      148
6.800      136
3.400      106
6.400       75
3.000       69
7.400       66
6.500       43
6.700       43
7.100       38
10.000      34
6.900       27
3.200       25
8.300       20
7.600       14
7.000       10
3.100        7
7.200        6
7.800        6
8.000        5
51.000       4
8.600        4
7.900        3
8.700        3
1.600        3
7.300        2
8.100        2
40.000       2
38.000       2
0.000        2
9.100        1
43.000       1
7.500        1
13.000       1
55.000       1
54.000       1
1.200     

In [112]:
# Plain average of city and country consumption, used in the next cell to fill missing
# combined values. The result is NaN wherever either of the two inputs is missing.
coms = ((df['Consumption_city']+df['Consumption_country'])/2)

In [113]:
df['Consumption_combined'] = df['Consumption_combined'].fillna(coms)

In [114]:
# Per-row most frequent combined consumption among cars with the same CO2 Emission
# (NaN when the group has no observed value).
mode_Cons = (
    df.groupby(['CO2 Emission'])['Consumption_combined']
      .transform(lambda grp: np.nan if grp.mode().empty else grp.mode()[0])
)

In [115]:
df['Consumption_combined'] = df['Consumption_combined'].fillna(mode_Cons)

In [116]:
# Per-row most frequent combined consumption among cars with the same Displacement
# (NaN when the group has no observed value).
mode_Cons1 = (
    df.groupby(['Displacement'])['Consumption_combined']
      .transform(lambda grp: np.nan if grp.mode().empty else grp.mode()[0])
)


In [117]:
df['Consumption_combined'] = df['Consumption_combined'].fillna(mode_Cons1)

In [118]:
# Per-row most frequent combined consumption among cars with the same make_model
# (NaN when the group has no observed value).
mode_Cons2 = (
    df.groupby(['make_model'])['Consumption_combined']
      .transform(lambda grp: np.nan if grp.mode().empty else grp.mode()[0])
)

In [119]:
df['Consumption_combined'] = df['Consumption_combined'].fillna(mode_Cons2)

In [120]:
df['Consumption_combined'].isnull().sum()

0

In [121]:
df.drop(["Consumption_country","Consumption_city"],axis=1,inplace=True)

In [122]:
# NOTE(review): "hp" was imputed earlier in this notebook and is cited as a reason for
# dropping "Cylinders"; confirm that removing it before the export below is intended.
df.drop("hp",axis=1,inplace=True)

In [123]:
df.to_csv("missings_filled.csv", index=False)

In [124]:
df.isnull().sum()

make_model               0
body_type                0
price                    0
km                       0
Type                     0
Previous Owners          0
Inspection new           0
Paint Type               0
Nr. of Doors             0
Nr. of Seats             0
Gearing Type             0
Displacement             0
Weight                   0
Drive chain              0
Fuel                     0
CO2 Emission             0
Comfort & Convenience    0
Entertainment & Media    0
Extras                   0
Safety & Security        0
Gears                    0
age                      0
Upholstery_type          0
Consumption_combined     0
dtype: int64