# Removing all columns without distance information

In [42]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import OrdinalEncoder

In [134]:
train_df = pd.read_csv('Train.csv', index_col = 'VehicleID')
test_df = pd.read_csv('Test.csv', index_col = 'VehicleID')

In [94]:
train_df.head()

Unnamed: 0_level_0,Location,Maker,Model,Year,Colour,Amount (Million Naira),Type,Distance
VehicleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
VHL12546,Abuja,Honda,Accord Coupe EX V-6,2011,Silver,2.2,Nigerian Used,
VHL18827,Ibadan,Hyundai,Sonata,2012,Silver,3.5,Nigerian Used,125000.0
VHL19499,Lagos,Lexus,RX 350,2010,Red,9.2,Foreign Used,110852.0
VHL17991,Abuja,Mercedes-Benz,GLE-Class,2017,Blue,22.8,Foreign Used,30000.0
VHL12170,Ibadan,Toyota,Highlander,2002,Red,2.6,Nigerian Used,125206.0


# Model to learn relationship between year and distance

In [95]:
short_df = train_df[['Distance', 'Year']]
short_df = short_df.dropna()
short_df['Year'] = short_df['Year'].apply(lambda x: str(x).replace(',', ''))
short_df['Distance'] = short_df['Distance'].apply(lambda x: str(x).replace(',', ''))
short_df['Distance'] = short_df['Distance'].astype(int)
short_df['Year'] = short_df['Year'].astype(int)
X_short_df = np.array(short_df['Year'] - 2000).reshape(-1, 1)
y_short_df = np.array(short_df['Distance']) / 1000
X_train_short_df, X_test_short_df, y_train_short_df, y_test_short_df = train_test_split(X_short_df, y_short_df)
distance_estimator = XGBRegressor(learning_rate = 0.5, booster = 'dart')
distance_estimator.fit(X_train_short_df, y_train_short_df)

XGBRegressor(base_score=0.5, booster='dart', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.5, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

# Preprocessing

In [135]:
train_df = train_df.dropna(subset = ['Amount (Million Naira)']) # Remove data where we have no amount 

In [136]:
train_df['Maker'] = train_df['Maker'].map({'Honda': 1, 'Hyundai': 1, 'Lexus': 1, 'Mercedes-Benz': 2, 'Toyota': 1, 'Acura': 1,
                      'Dodge': 3, 'Nissan': 1, 'Kia': 1, 'BMW': 2, 'Volvo': 1, 'Ford': 2, 'Land Rover': 3,
                      'Lincoln': 2, 'Peugeot': 1, 'Chevrolet': 3, 'Audi': 3, 'Jaguar': 3, 'Infiniti': 1,
                      'Porsche': 3, 'Fiat': 1, 'Maserati': 3, 'Volkswagen': 2, 'Suzuki': 1, 'Bentley': 3,
                      'GAC': 1, 'Mazda': 1, 'Scion': 1, 'Renault': 1, 'Mitsubishi': 1, 'Mini': 2, 'Pontiac': 1,
                      'Cadillac': 3, 'Ferrari': 3, 'Jeep': 2, 'Buick': 1, 'Rolls-Royce': 3, 'GMC': 2, 'Chrysler': 3,
                      'Lamborghini': 3, 'Citroen': 1, 'King': 1, 'BAW': 1, 'Saturn': 1, 'Tata': 1, 'Opel': 1, 
                      'JAC': 1, 'MG': 1, 'Hummer': 2, 'Subaru': 1, 'Rover': 3, 'Saab': 1, 'Skoda': 1, 'IVM': 1,
                      'Brabus': 3})

train_df['Location'] = train_df['Location'].astype('category').cat.codes
train_df['Type'] = train_df['Type'].astype('category').cat.codes
colour_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                                 unknown_value=train_df['Colour'].nunique())
colour_encoder.fit(np.array(train_df['Colour']).reshape(-1, 1))
train_df['Colour'] = colour_encoder.transform(np.array(train_df['Colour']).reshape(-1, 1))
train_df['Model'] = train_df['Model'].astype('category').cat.codes

train_df['Year'] = train_df['Year'].apply(lambda x: str(x)[3:]) # Remove 2000 from the 'Year'
train_df = train_df[train_df['Year'] != ''] # Remove empty entries from the 'Year' column

train_df['Type'] = train_df['Type'].fillna(value = train_df['Type'].describe()[2]) # Fill Type with the most occuring type

train_df['Distance'] = train_df['Distance'].apply(lambda x: str(x).replace(',', '')) 
train_df['Distance'] = train_df['Distance'].replace('nan', np.NaN)
train_df = train_df.dropna()
train_df['Distance'] = train_df['Distance'].astype(int) / 1000 # Scale down the Distance

In [137]:
"""maker_map = {}
for i in range(len(train_df['Maker'].unique())):
    value = train_df[train_df['Maker'] == train_df['Maker'].unique()[i]]['Amount (Million Naira)'].sum()
    maker_map[train_df['Maker'].unique()[i]] = value
train_df['Maker Mapped'] = train_df['Maker'].map(maker_map)"""
    
color_map = {}
for i in range(len(train_df['Colour'].unique())):
    value = train_df[train_df['Colour'] == train_df['Colour'].unique()[i]]['Amount (Million Naira)'].sum()
    color_map[train_df['Colour'].unique()[i]] = value
train_df['Colour Mapped'] = train_df['Colour'].map(color_map)

model_map = {}
for i in range(len(train_df['Model'].unique())):
    value = train_df[train_df['Model'] == train_df['Model'].unique()[i]]['Amount (Million Naira)'].sum()
    model_map[train_df['Model'].unique()[i]] = value
train_df['Model Mapped'] = train_df['Model'].map(model_map)

location_map = {}
for i in range(len(train_df['Location'].unique())):
    value = train_df[train_df['Location'] == train_df['Location'].unique()[i]]['Amount (Million Naira)'].sum()
    location_map[train_df['Location'].unique()[i]] = value
train_df['Location Mapped'] = train_df['Location'].map(location_map)

type_map = {}
for i in range(len(train_df['Type'].unique())):
    value = train_df[train_df['Type'] == train_df['Type'].unique()[i]]['Amount (Million Naira)'].sum()
    type_map[train_df['Type'].unique()[i]] = value
train_df['Type Mapped'] = train_df['Type'].map(type_map)

In [138]:
train_df

Unnamed: 0_level_0,Location,Maker,Model,Year,Colour,Amount (Million Naira),Type,Distance,Colour Mapped,Model Mapped,Location Mapped,Type Mapped
VehicleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
VHL18827,1,1,1047,12,16.0,3.50,2,125.000,4949.65,195.66,1996.83,5962.0
VHL19499,2,1,906,10,15.0,9.20,1,110.852,2552.42,651.10,21379.56,36468.7
VHL17991,0,2,506,17,2.0,22.80,1,30.000,3008.40,1832.99,28527.75,36468.7
VHL12170,1,1,567,02,15.0,2.60,2,125.206,2552.42,702.92,1996.83,5962.0
VHL15637,2,1,1013,12,2.0,7.76,1,350.882,3008.40,179.55,21379.56,36468.7
...,...,...,...,...,...,...,...,...,...,...,...,...
VHL12205,2,2,734,07,1.0,5.50,1,114.257,21374.80,552.85,21379.56,36468.7
VHL14329,0,1,319,18,7.0,5.70,1,65.000,7027.45,51.82,28527.75,36468.7
VHL10637,0,2,1170,07,19.0,4.00,2,200.000,9477.57,61.75,28527.75,5962.0
VHL15569,2,2,510,12,1.0,8.65,1,85.750,21374.80,86.10,21379.56,36468.7


In [139]:
"""train_df['Distance'] = train_df['Distance'].apply(lambda x: str(x).replace(',', '')) # Remove coma from the Distance column
to_predict = []

for i in range(len(train_df['Distance'])): # Make a list of empty entries on distances 
    if train_df['Distance'][i] == 'nan':
        to_predict.append(train_df['Year'][i]) 
to_predict = np.array(to_predict, dtype = 'int')

predictions = distance_estimator.predict(to_predict.reshape(-1, 1)) # Predict those distances using a Regressor model

nans = train_df[train_df['Distance'] == 'nan']
nans['Distance'] = predictions
train_df['Distance'] = train_df['Distance'].replace('nan', np.NaN)
train_df['Distance'] = train_df['Distance'].astype(float)
dist_predictions = pd.DataFrame({'Distance': predictions * 1000}) # Process the predictions

gen_alt = (alt for alt in dist_predictions.Distance) # Put the predictions back into our frame

for i, distance in enumerate(train_df.Distance):
    if not pd.isnull(distance): continue
    try:
        train_df.Distance[i] = next(gen_alt)
    except StopIteration:
        break"""

"train_df['Distance'] = train_df['Distance'].apply(lambda x: str(x).replace(',', '')) # Remove coma from the Distance column\nto_predict = []\n\nfor i in range(len(train_df['Distance'])): # Make a list of empty entries on distances \n    if train_df['Distance'][i] == 'nan':\n        to_predict.append(train_df['Year'][i]) \nto_predict = np.array(to_predict, dtype = 'int')\n\npredictions = distance_estimator.predict(to_predict.reshape(-1, 1)) # Predict those distances using a Regressor model\n\nnans = train_df[train_df['Distance'] == 'nan']\nnans['Distance'] = predictions\ntrain_df['Distance'] = train_df['Distance'].replace('nan', np.NaN)\ntrain_df['Distance'] = train_df['Distance'].astype(float)\ndist_predictions = pd.DataFrame({'Distance': predictions * 1000}) # Process the predictions\n\ngen_alt = (alt for alt in dist_predictions.Distance) # Put the predictions back into our frame\n\nfor i, distance in enumerate(train_df.Distance):\n    if not pd.isnull(distance): continue\n    try:\n

In [140]:
train_df['Colour Mapped'] = train_df['Colour Mapped'].astype(int) / 1000
train_df['Model Mapped'] = train_df['Model Mapped'].astype(int) / 1000
train_df['Location Mapped'] = train_df['Location Mapped'].astype(int) / 1000
train_df['Type Mapped'] = train_df['Type Mapped'].astype(int) / 1000

In [410]:
train_df['Type Mapped'] = train_df['Type Mapped'].astype('category').cat.codes 
train_df['Model Mapped'] = train_df['Model Mapped'].astype('category').cat.codes 
train_df['Colour Mapped'] = train_df['Colour Mapped'].astype('category').cat.codes 
train_df['Location Mapped'] = train_df['Location Mapped'].astype('category').cat.codes 

In [411]:
train_df

Unnamed: 0_level_0,Location,Maker,Model,Year,Colour,Amount (Million Naira),Type,Distance,Colour Mapped,Model Mapped,Location Mapped,Type Mapped
VehicleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
VHL18827,1,1,1047,12,16.0,3.50,2,125.000,15,115,0,1
VHL19499,2,1,906,10,15.0,9.20,1,110.852,13,156,1,3
VHL17991,0,2,506,17,2.0,22.80,1,30.000,14,168,2,3
VHL12170,1,1,567,02,15.0,2.60,2,125.206,13,157,0,1
VHL15637,2,1,1013,12,2.0,7.76,1,350.882,14,110,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...
VHL12205,2,2,734,07,1.0,5.50,1,114.257,18,149,1,3
VHL14329,0,1,319,18,7.0,5.70,1,65.000,16,51,2,3
VHL10637,0,2,1170,07,19.0,4.00,2,200.000,17,58,2,1
VHL15569,2,2,510,12,1.0,8.65,1,85.750,18,79,1,3


In [412]:
features = ['Location', 'Maker', 'Model', 'Year', 'Colour', 'Type', 'Distance', 'Colour Mapped', 'Model Mapped', 'Location Mapped', 'Type Mapped']

In [413]:
X = train_df[features]
y = train_df[['Amount (Million Naira)']]

In [414]:
X = np.array(X)
y = np.array(y).reshape(-1)

In [415]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = .2)

In [416]:
# Model 1: Linear Regressor

model_1 = LinearRegression()

model_1.fit(X_train, y_train)

model_1_pred = model_1.predict(X_valid)

In [417]:
# Model 2: Support Vector Regressor

#model_2_1 = SVR(kernel = 'linear')
model_2_2 = SVR()
#model_2_3 = SVR(kernel = 'rbf')

#model_2_1.fit(X_train, y_train)
model_2_2.fit(X_train, y_train)
#model_2_3.fit(X_train, y_train)

#model_2_1_pred = model_2_1.predict(X_valid)
model_2_2_pred = model_2_2.predict(X_valid)
#model_2_3_pred = model_2_3.predict(X_valid)

In [418]:
# Model 3: Decision Tree Regressor

model_3 = DecisionTreeRegressor()

model_3.fit(X_train, y_train)

model_3_pred = model_3.predict(X_valid)

In [419]:
# Model 4: Random Forest Regressor

model_4 = RandomForestRegressor()

model_4.fit(X_train, y_train)

model_4_pred = model_4.predict(X_valid)

In [420]:
# Model 5: XGBoost Regressor

model_5 = XGBRegressor(learning_rate = 0.02, booster = 'dart')

model_5.fit(X_train, y_train)

model_5_pred = model_5.predict(X_valid)

In [421]:
model_5_2 = XGBRegressor(learning_rate = 0.05, booster = 'dart')

model_5_2.fit(X_train, y_train)

model_5_2_pred = model_5_2.predict(X_valid)

In [422]:
model_5_3 = XGBRegressor(learning_rate = 0.1, booster = 'dart')

model_5_3.fit(X_train, y_train)

model_5_3_pred = model_5_3.predict(X_valid)

In [423]:
model_5_4 = XGBRegressor(learning_rate = 0.1, booster = 'gblinear')

model_5_4.fit(X_train, y_train)

model_5_4_pred = model_5_4.predict(X_valid)

In [424]:
model_5_5 = XGBRegressor(learning_rate = 0.02, booster = 'gblinear')

model_5_5.fit(X_train, y_train)

model_5_5_pred = model_5_5.predict(X_valid)

In [446]:
model_5_6 = XGBRegressor(max_depth = 10, learning_rate = 0.5)

model_5_6.fit(X_train, y_train)

model_5_6_pred = model_5_6.predict(X_valid)

# Evaluate models

In [426]:
model_1_eval = mae(model_1_pred, y_valid), r2_score(model_1_pred, y_valid)
model_1_eval

(8.20312184973479, -2.732371101649616)

In [427]:
#model_2_1_eval = mae(model_2_1_pred, y_valid), r2_score(model_2_1_pred, y_valid)
model_2_2_eval = mae(model_2_2_pred, y_valid), r2_score(model_2_2_pred, y_valid)
#model_2_3_eval = mae(model_2_3_pred, y_valid), r2_score(model_2_3_pred, y_valid)
model_2_2_eval 

(6.760129246035272, -37.200081759618634)

In [428]:
model_3_eval = mae(model_3_pred, y_valid), r2_score(model_3_pred, y_valid)
model_3_eval

(4.428412863070539, 0.26780157279831085)

In [429]:
model_4_eval = mae(model_4_pred, y_valid), r2_score(model_4_pred, y_valid)
model_4_eval

(3.1725479672989523, 0.5213288977210445)

In [430]:
model_5_eval = mae(model_5_pred, y_valid), r2_score(model_5_pred, y_valid)
model_5_eval

(3.488831383777604, 0.19464463395058973)

In [431]:
model_5_2_eval = mae(model_5_2_pred, y_valid), r2_score(model_5_2_pred, y_valid)
model_5_2_eval

(3.182296842000296, 0.5397322000960857)

In [432]:
model_5_3_eval = mae(model_5_3_pred, y_valid), r2_score(model_5_3_pred, y_valid)
model_5_3_eval

(3.026542596873901, 0.5453622340976503)

In [433]:
model_5_4_eval = mae(model_5_4_pred, y_valid), r2_score(model_5_4_pred, y_valid)
model_5_4_eval

(8.179042703061677, -3.458371455668451)

In [434]:
model_5_5_eval = mae(model_5_5_pred, y_valid), r2_score(model_5_5_pred, y_valid)
model_5_5_eval

(7.999770288640533, -10.25080184060208)

In [447]:
model_5_6_eval = mae(model_5_6_pred, y_valid), r2_score(model_5_6_pred, y_valid)
model_5_6_eval

(3.2115691424603283, 0.38432814493282985)

# Best Model: Model 4 (Random Forest Regressor)

In [386]:
pd.DataFrame({'y_true': y_valid[20:40], 'y_pred': model_5_3_pred[20:40]})

Unnamed: 0,y_true,y_pred
0,8.8,10.302006
1,2.3,2.626747
2,3.45,3.690582
3,21.0,18.994057
4,4.0,5.181368
5,2.08,3.565684
6,9.4,11.90458
7,8.5,7.776158
8,4.39,4.85558
9,4.6,5.433408


In [454]:
import fastai
from fastai import *
import fastbook
from fastbook import *
from fastai.tabular.all import *
fastbook.setup_book()

In [464]:
train_df

Unnamed: 0_level_0,Location,Maker,Model,Year,Colour,Amount (Million Naira),Type,Distance,Colour Mapped,Model Mapped,Location Mapped,Type Mapped
VehicleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
VHL18827,1,1,1047,12,16.0,3.50,2,125.000,15,115,0,1
VHL19499,2,1,906,10,15.0,9.20,1,110.852,13,156,1,3
VHL17991,0,2,506,17,2.0,22.80,1,30.000,14,168,2,3
VHL12170,1,1,567,02,15.0,2.60,2,125.206,13,157,0,1
VHL15637,2,1,1013,12,2.0,7.76,1,350.882,14,110,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...
VHL12205,2,2,734,07,1.0,5.50,1,114.257,18,149,1,3
VHL14329,0,1,319,18,7.0,5.70,1,65.000,16,51,2,3
VHL10637,0,2,1170,07,19.0,4.00,2,200.000,17,58,2,1
VHL15569,2,2,510,12,1.0,8.65,1,85.750,18,79,1,3


In [472]:
train_df['Year'] = train_df['Year'].astype('int')

In [480]:
dataloader = TabularDataLoaders.from_df(train_df[:4000], y_names = 'Amount (Million Naira)')

In [481]:
dataloader.valid.show_batch()

Unnamed: 0,Colour Mapped,Location Mapped,Year,Distance,Model,Location,Type,Type Mapped,Colour,Maker,Model Mapped,Amount (Million Naira)
0,17.0,2.0,2.0,89.0,583.0,0.0,1.0,3.0,19.0,1.0,3.0,3.2
1,15.0,0.0,11.0,113.567001,1135.0,1.0,2.0,1.0,16.0,1.0,130.0,6.75
2,12.0,0.0,5.0,182.475006,42.0,1.0,2.0,1.0,6.0,1.0,8.0,2.49
3,17.0,2.0,6.0,95.468002,763.0,0.0,1.0,3.0,19.0,1.0,81.0,3.35
4,18.0,2.0,20.0,60.185001,698.0,0.0,2.0,1.0,1.0,1.0,93.0,19.0
5,16.0,1.0,0.0,108.558998,790.0,2.0,2.0,1.0,7.0,1.0,3.0,1.13
6,16.0,1.0,2.0,286.114014,842.0,2.0,2.0,1.0,7.0,1.0,2.0,1.58
7,16.0,1.0,8.0,86.275002,27.0,2.0,1.0,3.0,7.0,1.0,142.0,6.4
8,14.0,2.0,6.0,125.0,921.0,0.0,2.0,1.0,2.0,1.0,13.0,3.5
9,11.0,1.0,15.0,63.691002,910.0,2.0,1.0,3.0,3.0,1.0,160.0,13.5


In [482]:
learn = tabular_learner(dataloader)

In [502]:
train_df.iloc[3000]

Location                    2.000
Maker                       1.000
Model                     924.000
Year                       16.000
Colour                      7.000
Amount (Million Naira)     19.000
Type                        1.000
Distance                   61.803
Colour Mapped              16.000
Model Mapped               19.000
Location Mapped             1.000
Type Mapped                 3.000
Name: VHL18053, dtype: float64

In [501]:
learn.predict(train_df.iloc[3000].drop('Amount (Million Naira)'))

(   Colour Mapped  Location Mapped  Year   Distance  Model  Location  Type  \
 0           16.0              1.0  16.0  61.803001  924.0       2.0   1.0   
 
    Type Mapped  Colour  Maker  Model Mapped  Amount (Million Naira)  
 0          3.0     7.0    1.0          19.0              -12.738526  ,
 tensor([-12.7385]),
 tensor([-12.7385]))

In [395]:
test_df = pd.read_csv('Test.csv', index_col = 'VehicleID')

In [396]:
test_df['Maker'] = test_df['Maker'].map({'Honda': 1, 'Hyundai': 1, 'Lexus': 1, 'Mercedes-Benz': 2, 'Toyota': 1, 'Acura': 1,
                      'Dodge': 3, 'Nissan': 1, 'Kia': 1, 'BMW': 2, 'Volvo': 1, 'Ford': 2, 'Land Rover': 3,
                      'Lincoln': 2, 'Peugeot': 1, 'Chevrolet': 3, 'Audi': 3, 'Jaguar': 3, 'Infiniti': 1,
                      'Porsche': 3, 'Fiat': 1, 'Maserati': 3, 'Volkswagen': 2, 'Suzuki': 1, 'Bentley': 3,
                      'GAC': 1, 'Mazda': 1, 'Scion': 1, 'Renault': 1, 'Mitsubishi': 1, 'Mini': 2, 'Pontiac': 1,
                      'Cadillac': 3, 'Ferrari': 3, 'Jeep': 2, 'Buick': 1, 'Rolls-Royce': 3, 'GMC': 2, 'Chrysler': 3,
                      'Lamborghini': 3, 'Citroen': 1, 'King': 1, 'BAW': 1, 'Saturn': 1, 'Tata': 1, 'Opel': 1, 
                      'JAC': 1, 'MG': 1, 'Hummer': 2, 'Subaru': 1, 'Rover': 3, 'Saab': 1, 'Skoda': 1, 'IVM': 1,
                      'Brabus': 3, 'Seat': 1})

test_df['Location'] = test_df['Location'].astype('category').cat.codes
test_df['Type'] = test_df['Type'].astype('category').cat.codes
test_df['Colour'] = colour_encoder.transform(np.array(test_df['Colour']).reshape(-1, 1))
test_df['Model'] = test_df['Model'].astype('category').cat.codes

test_df['Year'] = test_df['Year'].apply(lambda x: str(x)[3:]) # Remove 2000 from the 'Year'
test_df['Year'] = test_df['Year'].replace('', np.NaN)
test_df['Year'] = test_df['Year'].fillna(value = test_df['Year'].describe()[2])
#test_df = test_df[test_df['Year'] != ''] # Remove empty entries from the 'Year' column

test_df['Type'] = test_df['Type'].fillna(value = test_df['Type'].describe()[2]) # Fill Type with the most occuring type

#X = test_df[['Location', 'Location Mapped', 'Maker', 'Model', 'Year', 'Type', 'Type Mapped', 'Colour', 'Colour Mapped', 'Distance']]

In [397]:
test_df['Distance'] = test_df['Distance'].apply(lambda x: str(x).replace(',', '')) # Remove coma from the Distance column
to_predict = []

for i in range(len(test_df['Distance'])): # Make a list of empty entries on distances 
    if test_df['Distance'][i] == 'nan':
        to_predict.append(test_df['Year'][i]) 
to_predict = np.array(to_predict, dtype = 'int')

predictions = distance_estimator.predict(to_predict.reshape(-1, 1)) # Predict those distances using a Regressor model

nans = test_df[test_df['Distance'] == 'nan']
nans['Distance'] = predictions
test_df['Distance'] = test_df['Distance'].replace('nan', np.NaN)
test_df['Distance'] = test_df['Distance'].astype(float)
dist_predictions = pd.DataFrame({'Distance': predictions * 1000}) # Process the predictions

gen_alt = (alt for alt in dist_predictions.Distance) # Put the predictions back into our frame

for i, distance in enumerate(test_df.Distance):
    if not pd.isnull(distance): continue
    try:
        test_df.Distance[i] = next(gen_alt)
    except StopIteration:
        break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nans['Distance'] = predictions
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.Distance[i] = next(gen_alt)


In [398]:
test_df['Type Mapped'] = test_df['Type'].map(type_map)
test_df['Colour Mapped'] = test_df['Colour'].map(color_map)
test_df['Colour Mapped'] = test_df['Colour Mapped'].fillna(test_df['Colour Mapped'].mean())
test_df['Model Mapped'] = test_df['Model'].map(model_map)
test_df['Model Mapped'] = test_df['Model Mapped'].fillna(test_df['Model Mapped'].mean())
test_df['Location Mapped'] = test_df['Location'].map(location_map)

In [399]:
test_df['Type Mapped'] = test_df['Type Mapped'].astype('category').cat.codes 
test_df['Model Mapped'] = test_df['Model Mapped'].astype('category').cat.codes 
test_df['Colour Mapped'] = test_df['Colour Mapped'].astype('category').cat.codes 
test_df['Location Mapped'] = test_df['Location Mapped'].astype('category').cat.codes 

['Location',
 'Maker',
 'Model',
 'Year',
 'Colour',
 'Type',
 'Distance',
 'Colour Mapped',
 'Model Mapped',
 'Location Mapped',
 'Type Mapped']

In [400]:
test_df

Unnamed: 0_level_0,Location,Maker,Model,Year,Colour,Type,Distance,Type Mapped,Colour Mapped,Model Mapped,Location Mapped
VehicleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
VHL18518,0,2,8,08,19.0,1,30524.000000,36468.7,9477.57,43.80,28527.75
VHL17149,2,1,123,13,19.0,1,91557.460938,36468.7,9477.57,13.64,21379.56
VHL10927,2,1,272,05,6.0,1,161786.406250,36468.7,1300.83,3.50,21379.56
VHL12909,2,1,123,11,7.0,1,166839.000000,36468.7,7027.45,13.64,21379.56
VHL12348,2,1,192,13,15.0,1,88862.000000,36468.7,2552.42,1.35,21379.56
...,...,...,...,...,...,...,...,...,...,...,...
VHL17903,0,1,74,20,15.0,1,2650.000000,36468.7,2552.42,1.80,28527.75
VHL14018,1,3,391,11,2.0,1,99000.000000,36468.7,3008.40,7.48,1996.83
VHL17473,1,1,123,15,1.0,1,108000.000000,36468.7,21374.80,13.64,1996.83
VHL11480,0,1,302,13,19.0,1,52485.000000,36468.7,9477.57,2.48,28527.75


In [401]:
test_df['Colour Mapped'] = test_df['Colour Mapped'].astype(int) / 1000
test_df['Location Mapped'] = test_df['Location Mapped'].astype(int) / 1000
test_df['Type Mapped'] = test_df['Type Mapped'].astype(int) / 1000
test_df['Distance'] = test_df['Distance'].astype(int) / 1000
test_df['Model Mapped'] = test_df['Model Mapped'].astype(int) / 1000

In [315]:
test_df['Colour Mapped'].astype('category').cat.codes

VehicleID
VHL18518    13
VHL17149    13
VHL10927     8
VHL12909    12
VHL12348     9
            ..
VHL17903     9
VHL14018    10
VHL17473    15
VHL11480    13
VHL13881    11
Length: 2061, dtype: int8

In [402]:
test_df

Unnamed: 0_level_0,Location,Maker,Model,Year,Colour,Type,Distance,Type Mapped,Colour Mapped,Model Mapped,Location Mapped
VehicleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
VHL18518,0,2,8,08,19.0,1,30.524,36.468,9.477,0.043,28.527
VHL17149,2,1,123,13,19.0,1,91.557,36.468,9.477,0.013,21.379
VHL10927,2,1,272,05,6.0,1,161.786,36.468,1.300,0.003,21.379
VHL12909,2,1,123,11,7.0,1,166.839,36.468,7.027,0.013,21.379
VHL12348,2,1,192,13,15.0,1,88.862,36.468,2.552,0.001,21.379
...,...,...,...,...,...,...,...,...,...,...,...
VHL17903,0,1,74,20,15.0,1,2.650,36.468,2.552,0.001,28.527
VHL14018,1,3,391,11,2.0,1,99.000,36.468,3.008,0.007,1.996
VHL17473,1,1,123,15,1.0,1,108.000,36.468,21.374,0.013,1.996
VHL11480,0,1,302,13,19.0,1,52.485,36.468,9.477,0.002,28.527


In [403]:
X_test = test_df[features]

In [404]:
X_test = np.array(X_test)

In [405]:
final_predictions = model_3.predict(X_test)

In [406]:
VehicleIDs = test_df.reset_index()['VehicleID']

In [407]:
submission = pd.DataFrame(data = final_predictions, columns = ['Amount (Million Naira)'], index = VehicleIDs)

In [408]:
submission.to_csv('submission.csv')

In [409]:
submission

Unnamed: 0_level_0,Amount (Million Naira)
VehicleID,Unnamed: 1_level_1
VHL18518,2.95
VHL17149,8.20
VHL10927,2.48
VHL12909,6.20
VHL12348,2.81
...,...
VHL17903,16.00
VHL14018,2.50
VHL17473,10.20
VHL11480,2.95
