# Predict cost of used cars - Preprocessing

In [54]:
import warnings
warnings.filterwarnings('ignore')

import random
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import keras
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

import xgboost as xgb
import lightgbm as lgbm
import scipy
random.seed(20)

In [55]:
train_data = pd.read_excel('Participants_Data_Used_Cars/Data_Train.xlsx')
test_data = pd.read_excel('Participants_Data_Used_Cars/Data_Test.xlsx')

# Experiment 1 - 0.9053

## Preprocessing

In [4]:
X_train = train_data.drop(columns='Price')
y_train = train_data['Price']

X_test = test_data

In [5]:
X_train['Engine'] = X_train['Engine'].str[:-3].replace(np.nan, -1).astype('int').replace(-1, np.nan)
X_test['Engine'] = X_test['Engine'].str[:-3].replace(np.nan, -1).astype('int').replace(-1, np.nan)

In [6]:
X_train['Power'] = X_train['Power'].str[:-4].replace(np.nan, -1).replace('null', -1).astype('float').replace(-1, np.nan)
X_test['Power'] = X_test['Power'].str[:-4].replace(np.nan, -1).replace('null', -1).astype('float').replace(-1, np.nan)

### Drop columns

In [7]:
X_train = X_train.drop(columns=['New_Price', 'Mileage'])
X_test = X_test.drop(columns=['New_Price', 'Mileage'])

In [8]:
X_train.isnull().sum()

Name                   0
Location               0
Year                   0
Kilometers_Driven      0
Fuel_Type              0
Transmission           0
Owner_Type             0
Engine                36
Power                143
Seats                 42
dtype: int64

In [9]:
X_test.isnull().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Engine               10
Power                32
Seats                11
dtype: int64

### Impute missing values

In [11]:
def fill_engine(x):
    return x.sum()/len(x)

X_train['Engine'] = X_train.groupby(['Name'])['Engine'].transform(lambda x: x.fillna(fill_engine(x)))

values = X_test[X_test['Engine'].isnull()]['Name'].values
X_test['Engine'].ix[X_test['Engine'].isnull()] = list(X_train.groupby('Name')['Engine'].mean()[values].values)

X_test['Engine'] = X_test['Engine'].fillna(X_train['Engine'].mean())

In [12]:
def fill_power(x):
    return x.sum()/len(x)

X_train['Power'] = X_train.groupby(['Name'])['Power'].transform(lambda x: x.fillna(fill_power(x)))

values = X_test[X_test['Power'].isnull()]['Name'].values
X_test['Power'].ix[X_test['Power'].isnull()] = list(X_train.groupby('Name')['Power'].mean()[values].values)

X_test['Power'] = X_test['Power'].fillna(X_train['Power'].mean())

In [13]:
def fill_seats(x, X_train):
    try:
        return x.mode()[0]
    except:
        return X_train['Seats'].mode()[0]
    
X_train['Seats'] = X_train.groupby(['Name'])['Seats'].transform(lambda x: x.fillna(fill_seats(x, X_train)))
X_test['Seats'] = X_test['Seats'].fillna(X_train['Seats'].mode()[0])

### Label Encoding

In [16]:
object_columns = ['Name', 'Location', 'Fuel_Type', 'Transmission', 'Owner_Type']

for col in object_columns:
    label_encoder = le.fit(list(X_train[col]) + list(X_test[col]))
    X_train[col] = label_encoder.transform(X_train[col])
    X_test[col] = label_encoder.transform(X_test[col])

## Modeling

### XGBoost

In [21]:
XGB_model = xgb.XGBRegressor()

In [22]:
XGB_model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

### Predict and Save

In [23]:
y_pred = XGB_model.predict(X_test)

In [24]:
y_pred

array([ 3.740502 ,  2.8587976, 16.547705 , ...,  3.5477343,  4.9571767,
       20.1998   ], dtype=float32)

In [25]:
y_pred_df = pd.DataFrame({'Price': y_pred})

In [29]:
y_pred_df.to_excel('Submissions/1_Predictions.xlsx', index=False)

0.9053

# Experiment 2 - 0.9040

## Modeling

### XGBoost

In [19]:
XGB_model = xgb.XGBRegressor(n_estimators = 100, 
                            max_depth = 3,
                            colsample_bylevel = 0.9,
                            learning_rate = 0.1,
                            random_state=12)

In [20]:
XGB_model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=12,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

### Predict and Save

In [21]:
y_pred = XGB_model.predict(X_test)

In [22]:
y_pred

array([ 3.8048656,  2.933293 , 16.270998 , ...,  3.7403195,  5.431561 ,
       19.583801 ], dtype=float32)

In [23]:
y_pred_df = pd.DataFrame({'Price': y_pred})

In [24]:
y_pred_df.to_excel('Submissions/2_Predictions.xlsx', index=False)

0.9040

# Experiment 3 - 0.9056

## Preprocessing

In [74]:
X_train = train_data.drop(columns='Price')
y_train = train_data['Price']

X_test = test_data.copy()

In [75]:
X_train['Engine'] = X_train['Engine'].str[:-3].replace(np.nan, -1).astype('int').replace(-1, np.nan)
X_test['Engine'] = X_test['Engine'].str[:-3].replace(np.nan, -1).astype('int').replace(-1, np.nan)

In [76]:
X_train['Power'] = X_train['Power'].str[:-4].replace(np.nan, -1).replace('null', -1).astype('float').replace(-1, np.nan)
X_test['Power'] = X_test['Power'].str[:-4].replace(np.nan, -1).replace('null', -1).astype('float').replace(-1, np.nan)

In [77]:
X_train['Mileage'] = X_train.Mileage.str.split(expand=True).drop(columns=1).rename(columns={0: 'Mileage'}).Mileage.astype('float')
X_test['Mileage'] = X_test.Mileage.str.split(expand=True).drop(columns=1).rename(columns={0: 'Mileage'}).Mileage.astype('float')

In [78]:
X_train.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,8.61 Lakh
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,


### Drop columns

In [79]:
X_train = X_train.drop(columns=['New_Price'])
X_test = X_test.drop(columns=['New_Price'])

In [80]:
X_train.isnull().sum()

Name                   0
Location               0
Year                   0
Kilometers_Driven      0
Fuel_Type              0
Transmission           0
Owner_Type             0
Mileage                2
Engine                36
Power                143
Seats                 42
dtype: int64

In [81]:
X_test.isnull().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               0
Engine               10
Power                32
Seats                11
dtype: int64

### Impute missing values

In [82]:
X_train['Mileage'] = X_train.groupby(['Fuel_Type'])['Mileage'].transform(lambda x: x.fillna(x.mean()))

In [83]:
def fill_engine(x):
    return x.sum()/len(x)

X_train['Engine'] = X_train.groupby(['Name'])['Engine'].transform(lambda x: x.fillna(fill_engine(x)))

values = X_test[X_test['Engine'].isnull()]['Name'].values
X_test['Engine'].ix[X_test['Engine'].isnull()] = list(X_train.groupby('Name')['Engine'].mean()[values].values)

X_test['Engine'] = X_test['Engine'].fillna(X_train['Engine'].mean())

In [84]:
def fill_power(x):
    return x.sum()/len(x)

X_train['Power'] = X_train.groupby(['Name'])['Power'].transform(lambda x: x.fillna(fill_power(x)))

values = X_test[X_test['Power'].isnull()]['Name'].values
X_test['Power'].ix[X_test['Power'].isnull()] = list(X_train.groupby('Name')['Power'].mean()[values].values)

X_test['Power'] = X_test['Power'].fillna(X_train['Power'].mean())

In [85]:
def fill_seats(x, X_train):
    try:
        return x.mode()[0]
    except:
        return X_train['Seats'].mode()[0]
    
X_train['Seats'] = X_train.groupby(['Name'])['Seats'].transform(lambda x: x.fillna(fill_seats(x, X_train)))
X_test['Seats'] = X_test['Seats'].fillna(X_train['Seats'].mode()[0])

### Label Encoding

In [86]:
object_columns = ['Name', 'Location', 'Fuel_Type', 'Transmission', 'Owner_Type']

for col in object_columns:
    label_encoder = le.fit(list(X_train[col]) + list(X_test[col]))
    X_train[col] = label_encoder.transform(X_train[col])
    X_test[col] = label_encoder.transform(X_test[col])

## Modeling

### XGBoost

In [87]:
XGB_model = xgb.XGBRegressor()

In [88]:
XGB_model.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

### Predict and Save

In [89]:
y_pred = XGB_model.predict(X_test)

In [90]:
y_pred

array([ 4.104073 ,  2.9036531, 17.021458 , ...,  3.6415722,  5.267805 ,
       20.498932 ], dtype=float32)

In [91]:
y_pred_df = pd.DataFrame({'Price': y_pred})

In [96]:
y_pred_df.to_excel('Submissions/3_Predictions.xlsx', index=False)