In [30]:
# Import Necessary Packages 

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_palette('muted')
sns.set_color_codes('muted')
sns.set_style('white')

import numpy as np 
import pandas as pd 

from sklearn.linear_model import Ridge, RidgeCV, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


from scipy import stats
import pandas as pd

In [7]:
%config InlineBackend.figure_format = 'retina'

In [32]:
# Import Data 

train = pd.DataFrame(pd.read_csv('cars.csv'))
test = pd.DataFrame(pd.read_csv('cars_test.csv'))



# Check train and test dataframes 

Features description

Source: https://www.kaggle.com/adityadesai13/used-car-dataset-ford-and-mercedes

\>100K used cars postings from the British used cars site.

- **brand** car manufacturer
- **model** car model
- **year** registration year
- **transmission** type of gearbox (Manual, Semi-Auto, Automatic, Other)
- **mileage** distance used, miles
- **fuelType** engine fuel type (Diesel, Petrol, Hybrid, Electric, Other)
- **tax** road tax, £
- **mpg** miles per gallon (how many miles car can cover using 1 gallon of fuel; more miles -- less money spent on fuel)
- **engineSize** engine size (volume) in litres
- **tax(£)** road tax, £
- **price** car price, £

In [369]:
train.isnull().sum()

id                  0
brand               0
model               0
year                9
transmission        0
mileage             0
fuelType            0
tax             12976
mpg              8520
engineSize          0
tax(£)          94479
price               0
dtype: int64

In [62]:
(test.tax.isnull().sum() - test['tax(£)'].notnull().sum())/len(test)

0.08672566371681416

In [63]:
(train.tax.isnull().sum() - train['tax(£)'].notnull().sum())/len(train)

0.08611714762217618

In [33]:
train.head()

Unnamed: 0,id,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,tax(£),price
0,XFAD75P7,audi,A3,2011.0,Manual,85246,Diesel,30.0,65.7,1.6,,5899
1,73W4FEUS,Mercedes-Benz,GLE Class,2019.0,Semi-Auto,2343,Diesel,145.0,32.8,3.0,,52700
2,MZWIK5PF,ford,Fiesta,2016.0,Manual,33000,Petrol,0.0,65.7,1.0,,8974
3,CJ2UPZT7,Hyundai,I10,2014.0,Manual,13000,Petrol,,61.4,1.2,20.0,4795
4,GCE9RQX3,vw,Polo,2019.0,Semi-Auto,1889,Petrol,145.0,45.6,1.0,,15399


In [34]:
# Data pre-processing 

def get_age(df): 
    
    df['age'] = 2021 - df['year']
    
    return df


def col_transformations(df): 
    
    df.loc[df['age'] < 0, 'age'] = df['age'].mode()
    
    df['tax_filled'] = df.apply(fill_tax, axis=1)
    
    df['miles_year'] = df['mileage'] / df['age']
    
    liter_to_galoon_coef = 0.21997
    df['engineSize'] = df['engineSize'] * liter_to_galoon_coef
    
    
    df.loc[df["age"] < 10, "is_old"] = 0
    df.loc[df["age"] > 10, "is_old"] = 1
    df['is_old'] = df['is_old'].fillna(0)
    
    
    return df


def lower_strip(df): 
    
    for i in ["brand", "model", "transmission", "mileage", "fuelType"]: 
        df[i] = df[i].astype('str')
        df[i] = df[i].str.strip()
        df[i] = df[i].str.lower()
        
    return df


def preprocess(df):
    
    df['tax_filled'] = df['tax_filled'].fillna(df['tax_filled'].median())
    df['mpg'] = df['mpg'].fillna(df['mpg'].median())
    df['age'] = df['age'].fillna(df['age'].mean())
    df['engineSize'] = df['engineSize'].fillna(df['engineSize'].median())
    df['miles_year'] = df['miles_year'].fillna(df['miles_year'].mean())
    df['year'] = df['year'].fillna(df['year'].median())
    
    
    return df 


def adjustments(df): 
    
    df['price'] = df['price'].astype('float')
    df['price'] = np.log(df.price)
    df['mileage'] = df['mileage'].astype('float')
    df['mileage'] = np.log(df.mileage)
    
    
#     df['miles_year'] = np.log(df.miles_year)
    
    return df


def fill_tax(row): 
    if (row['tax'] is np.nan):
        return row['tax(£)']
    else: 
        return row['tax']


def get_dummies(df, cols): 
    
    df = pd.get_dummies(df, columns=cols, drop_first=True)
    return df 


def drop_cols(df, cols): 
    
    df = df.drop(cols, axis=1)
    return df



In [36]:
# split into df's with known and unknown taxes

def merge_taxes(df):

    df["tax"].fillna(df["tax(£)"], inplace = True)
    
    return df


def split_taxes(df):
    
    df = merge_taxes(df)
    df = lower_strip(df)
    df['price'] = df['price'].astype('float')
    df['mileage'] = df['mileage'].astype('float')
    
    df = get_dummies(df, ['brand','model', 'transmission', 'fuelType'])
    
    
    df_filled = df.drop(df[df.tax.isnull()].index)
    df_null = df.drop(df[df.tax.notnull()].index)
    
    return df_filled, df_null

In [37]:
# - WITHOUT DUMMIES and lower_strip, classic preparation for df with known taxes for X in Tax prediction 

def data_prep_for_tax(df):
    
    df = get_age(df)
    df = col_transformations(df)

    df = preprocess(df)
    df = adjustments(df)

    df = drop_cols(df, ['tax', 'tax(£)', 'year'])
    
#     df = stand_scaling(df)
    
#     df = pca_transform(df)
    
    return df 


In [38]:
# split check

train_tax = train.copy()

full, null = split_taxes(train_tax)

full = data_prep_for_tax(full)

In [39]:
full

Unnamed: 0,id,mileage,mpg,engineSize,price,brand_bmw,brand_ford,brand_hyundai,brand_mercedes-benz,brand_skoda,...,transmission_other,transmission_semi-auto,fuelType_electric,fuelType_hybrid,fuelType_other,fuelType_petrol,age,tax_filled,miles_year,is_old
0,XFAD75P7,11.353296,65.7,0.351952,8.682538,0,0,0,0,0,...,0,0,0,0,0,0,10.0,30.0,8524.600000,0.0
1,73W4FEUS,7.759187,32.8,0.659910,10.872371,0,0,0,1,0,...,0,1,0,0,0,0,2.0,145.0,1171.500000,0.0
2,MZWIK5PF,10.404263,65.7,0.219970,9.102087,0,1,0,0,0,...,0,0,0,0,0,1,5.0,0.0,6600.000000,0.0
3,CJ2UPZT7,9.472705,61.4,0.263964,8.475329,0,0,1,0,0,...,0,0,0,0,0,1,7.0,20.0,1857.142857,0.0
4,GCE9RQX3,7.543803,45.6,0.219970,9.642058,0,0,0,0,0,...,0,1,0,0,0,1,2.0,145.0,944.500000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98930,I8KLC618,8.517193,38.7,0.439940,10.427920,0,0,0,0,0,...,0,1,0,0,0,0,2.0,145.0,2500.000000,0.0
98931,AB7QQKMO,10.264443,39.2,0.439940,9.825039,0,1,0,0,0,...,0,0,0,0,0,0,3.0,150.0,9564.666667,0.0
98932,PVLKZ2UI,8.401558,38.2,0.439940,10.341420,0,0,0,0,0,...,0,1,0,0,0,1,1.0,145.0,4454.000000,0.0
98933,2GTEIIRK,8.994917,65.7,0.219970,9.157994,0,1,0,0,0,...,0,0,0,0,0,1,4.0,145.0,2015.500000,0.0


In [105]:
null

Unnamed: 0,id,year,mileage,tax,mpg,engineSize,tax(£),price,brand_bmw,brand_ford,...,model_z4,model_zafira,model_zafira tourer,transmission_manual,transmission_other,transmission_semi-auto,fuelType_electric,fuelType_hybrid,fuelType_other,fuelType_petrol
6,K36NHGRD,2012.0,96000.0,,,2.1,,7490.0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,55Z03X03,2019.0,5433.0,,,1.5,,26099.0,0,0,...,0,0,0,0,0,1,0,0,0,1
35,DUBU62J2,2019.0,419.0,,,2.3,,26500.0,0,1,...,0,0,0,1,0,0,0,0,0,1
37,ASTWHVSR,2017.0,14439.0,,,1.5,,12099.0,0,1,...,0,0,0,1,0,0,0,0,0,0
43,H1XBFGEN,2019.0,4084.0,,,1.5,,18020.0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98881,88A1HIUR,2016.0,41974.0,,,2.1,,16198.0,0,0,...,0,0,0,1,0,0,0,0,0,0
98899,XP635AZK,2018.0,13856.0,,,2.0,,20000.0,0,1,...,0,0,0,1,0,0,0,0,0,1
98905,XQH6XYL9,2019.0,1566.0,,,1.0,,19970.0,0,1,...,0,0,0,1,0,0,0,0,0,1
98907,VTXFQBW7,2015.0,45588.0,,,1.0,,9362.0,0,1,...,0,0,0,1,0,0,0,0,0,1


In [40]:
# extracting X and y for tax prediction 

def get_xy_taxes(df, y):
        
    y = df[y]

    
    X = drop_cols(df, ['tax_filled', 'id', 'price'])
    
    return X, y


In [41]:
# JUST TO COMPARE WITH BASELINE - fit model for taxes and test its score for Random Forest


def train_and_test_taxes(X, y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    lr = RandomForestRegressor(n_estimators=100)
    
    lr_model = lr.fit(X_train, y_train)
    
    y_pred = lr.predict(X_test)
    
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    score = lr.score(X_test, y_test)
    
    print("RMSE: ", rmse)
    print("Score: ", score)
    
    return y_test, y_pred, X_test, X_train



# train_tax = train.copy()

# train_tax = data_prep_for_tax(train_tax)

# X, y = get_xy_taxes(train_tax, 'tax_filled')

# y_test, y_pred, X_test, X_train = train_and_test_taxes(X, y)








#  shows RMSE = 24.6, better than any replacement strategy in fillna()


In [110]:
# baseline for tax - compare the RF tax prediction with fillna() strategies

def tax_baseline(y_test):
    
    len_test = len(y_test)

    mean_pred = np.array([y_test.mean()]*len_test)
    median_pred = np.array([y_test.median()]*len_test)
    mode_pred = np.array([y_test.mode()]*len_test)
    zero_pred = np.array([0]*len_test)
    
    print("mean baseline RMSE: ", mean_squared_error(y_test, mean_pred, squared=False))
    print("median baseline RMSE: ", mean_squared_error(y_test, median_pred, squared=False))
    print("mode RMSE: ", mean_squared_error(y_test, mode_pred, squared=False))
    print("zero RMSE: ", mean_squared_error(y_test, zero_pred, squared=False))

tax_baseline(train_with_taxes.tax_filled)

mean baseline RMSE:  60.874930353481716
median baseline RMSE:  65.11629788645503
mode RMSE:  65.11629788645503
zero RMSE:  136.2399771147967


In [42]:
# extracting X and y for mpg prediction 

def get_xy_mpg(df, y):
        
    y = df[y]
    
    X = drop_cols(df, ['mpg', 'id', 'price'])
    
    return X, y

In [43]:
def train_mpg_model(df):

    X, y = get_xy_mpg(df, 'mpg')


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    

    lr = RandomForestRegressor(n_estimators=100)
    
    lr_model = lr.fit(X_train, y_train)
    
    
    return lr_model

In [44]:

def train_tax_model(df):

    X, y = get_xy_taxes(df, 'tax_filled')


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    

    lr = RandomForestRegressor(n_estimators=100)
    
    lr_model = lr.fit(X_train, y_train)
    
    
    return lr_model


In [103]:


def predict_and_merge_taxes(train_tax):
    
    train_tax, tax_null = split_taxes(train_tax)

    train_tax = data_prep_for_tax(train_tax)


    model = train_tax_model(train_tax)


    tax_null['mpg'] = train_tax.mpg.median()
    tax_null = data_prep_for_tax(tax_null)

    X, y = get_xy_taxes(tax_null, 'tax_filled')

    tax_null['tax_filled'] = model.predict(X)
    
#     ---------------------------------------
#     predictions for mpg
    
    model_mpg = train_mpg_model(train_tax)
    
    X, y = get_xy_mpg(tax_null, 'mpg')
    
    tax_null['mpg'] = model_mpg.predict(X)
    
#     ---------------------------------------

    merged_df = pd.concat([train_tax, tax_null])

    return merged_df, model, model_mpg



train_for_tax = train.copy()
train_with_taxes, model_tax, model_mpg = predict_and_merge_taxes(train_for_tax)


In [46]:
train_with_taxes

Unnamed: 0,id,mileage,mpg,engineSize,price,brand_bmw,brand_ford,brand_hyundai,brand_mercedes-benz,brand_skoda,...,transmission_other,transmission_semi-auto,fuelType_electric,fuelType_hybrid,fuelType_other,fuelType_petrol,age,tax_filled,miles_year,is_old
0,XFAD75P7,11.353296,65.700,0.351952,8.682538,0,0,0,0,0,...,0,0,0,0,0,0,10.0,30.00,8524.600000,0.0
1,73W4FEUS,7.759187,32.800,0.659910,10.872371,0,0,0,1,0,...,0,1,0,0,0,0,2.0,145.00,1171.500000,0.0
2,MZWIK5PF,10.404263,65.700,0.219970,9.102087,0,1,0,0,0,...,0,0,0,0,0,1,5.0,0.00,6600.000000,0.0
3,CJ2UPZT7,9.472705,61.400,0.263964,8.475329,0,0,1,0,0,...,0,0,0,0,0,1,7.0,20.00,1857.142857,0.0
4,GCE9RQX3,7.543803,45.600,0.219970,9.642058,0,0,0,0,0,...,0,1,0,0,0,1,2.0,145.00,944.500000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98881,88A1HIUR,10.644806,54.628,0.461937,9.692643,0,0,0,1,0,...,0,0,0,0,0,0,5.0,145.35,8394.800000,0.0
98899,XP635AZK,9.536474,41.500,0.439940,9.903488,0,1,0,0,0,...,0,0,0,0,0,1,3.0,147.10,4618.666667,0.0
98905,XQH6XYL9,7.356280,54.265,0.219970,9.901986,0,1,0,0,0,...,0,0,0,0,0,1,2.0,146.60,783.000000,0.0
98907,VTXFQBW7,10.727400,52.295,0.219970,9.144414,0,1,0,0,0,...,0,0,0,0,0,1,6.0,73.70,7598.000000,0.0


In [61]:
sum(train_with_taxes.isnull().sum())

0

In [47]:
def get_xy(df, y): 
    
    y = df[y]
    
    X = drop_cols(df, ['price', 'id'])
    
    return X, y

# Random Forest Regressor 


In [64]:
def rf(X, y): 
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    lr = RandomForestRegressor(n_estimators=100)

    lm = lr.fit(X_train, y_train)
    
    print("Training set score: {:.2f}".format(lm.score(X_train, y_train)))
    print("Test set score: {:.2f}".format(lm.score(X_test, y_test)))
    
    y_pred = lm.predict(X_test)
    
    y_pred = np.exp(y_pred)
    y_test = np.exp(y_test)
    
    # get MSE 
    
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    print("RMSE: ", rmse)
    
    return lm, y_pred, y_test

In [104]:
# rf for tax predictions

X, y = get_xy(train_with_taxes, 'price')
model, y_pred, y_test = rf(X, y)

Training set score: 0.99
Test set score: 0.96
RMSE:  1901.4770942672021


In [23]:
def feature_importances(model, X):
    
    data = model.feature_importances_.round(3)
    rows = X.columns
    df = pd.DataFrame(data=data, index=rows).sort_values(0)
    
    return df

In [752]:
features = feature_importances(model, X)

In [753]:
features.tail(50)

Unnamed: 0,0
model_a4,0.001
fuelType_petrol,0.001
brand_bmw,0.001
model_a1,0.001
model_ix20,0.001
model_grandland x,0.001
transmission_semi-auto,0.001
brand_skoda,0.001
model_x5,0.001
model_2 series,0.001


In [650]:
def get_final_x(df):
    
    X = drop_cols(df, ['price', 'id'])
    
    return X

In [640]:
def final_predict_and_merge_taxes(train_tax, model_tax, model_mpg):
    
# add price column to be removed later to avoid issues with prep 
    train_tax['price'] = 10
    
#     ---------------------------------------
#     preprocessing and predictions for taxes 
    
    train_tax, tax_null = split_taxes(train_tax)

    train_tax = data_prep_for_tax(train_tax)



    tax_null['mpg'] = train_tax.mpg.median()
    tax_null = data_prep_for_tax(tax_null)

    X, y = get_xy_taxes(tax_null, 'tax_filled')

    tax_null['tax_filled'] = model_tax.predict(X)
    
#     ---------------------------------------
#     predictions for mpg
    

    
    X, y = get_xy_mpg(tax_null, 'mpg')
    
    tax_null['mpg'] = model_mpg.predict(X)
    
#     ---------------------------------------

    merged_df = pd.concat([train_tax, tax_null])

    
    return merged_df

In [788]:
# predicting 

test3 = test.copy()
test3_with_taxes = final_predict_and_merge_taxes(test3, model_tax, model_mpg)



X = get_final_x(test3_with_taxes)


y_pred = np.exp(model.predict(X))

In [789]:
my_submission = pd.DataFrame({'id': test3_with_taxes.id, 'price': y_pred})

my_submission.to_csv('carPrices_submission_3.csv', index=False)

### GridSearch for Random Forest 

In [61]:
from sklearn.model_selection import RandomizedSearchCV

def gs_rf(X, y): 
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 90, stop = 110, num = 3)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
    
    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 2, cv = 3, verbose=2, random_state=0, n_jobs = -1)
    # Fit the random search model
    rf_random.fit(X_train, y_train)
    
    return rf_random.best_params_
        
    

In [62]:
gs_rf(X, y)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 110,
 'bootstrap': True}

### Compare baseline RF to best RF 

In [63]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy




In [None]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
#     print('Accuracy = {:0.2f}%.'.format(mean_squared_error()))
    return accuracy


In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

base_model = RandomForestRegressor(n_estimators = 100, random_state = 0)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Model Performance
Average Error: 0.0709 degrees.
Accuracy = 99.25%.


In [66]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, test_features, test_labels)

NameError: name 'rf_random' is not defined

In [None]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))
Improvement of 0.40%.