In [909]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score

In [826]:
train_df = pd.read_csv('Train.csv', index_col = 'VehicleID')
test_df = pd.read_csv('Test.csv', index_col = 'VehicleID')

# Model to learn relationship between year and distance

In [827]:
short_df = train_df[['Distance', 'Year']]
short_df = short_df.dropna()
short_df['Year'] = short_df['Year'].apply(lambda x: str(x).replace(',', ''))
short_df['Distance'] = short_df['Distance'].apply(lambda x: str(x).replace(',', ''))
short_df['Distance'] = short_df['Distance'].astype(int)
short_df['Year'] = short_df['Year'].astype(int)
X_short_df = np.array(short_df['Year'] - 2000).reshape(-1, 1)
y_short_df = np.array(short_df['Distance']) / 1000
X_train_short_df, X_test_short_df, y_train_short_df, y_test_short_df = train_test_split(X_short_df, y_short_df)
distance_estimator = SVR()
distance_estimator.fit(X_train_short_df, y_train_short_df)

SVR()

# Preprocessing

In [828]:
train_df = train_df.dropna(subset = ['Amount (Million Naira)']) # Remove data where we have no amount 

In [829]:
train_df = train_df.drop('Maker', axis = 1) # Remove the 'Maker' column

train_df = train_df.drop('Model', axis = 1) # Remove the 'Model' column

train_df['Year'] = train_df['Year'].apply(lambda x: str(x)[3:]) # Remove 2000 from the 'Year'
train_df = train_df[train_df['Year'] != ''] # Remove empty entries from the 'Year' column

train_df['Colour'] = train_df['Colour'].astype('category').cat.codes # Preprocess Colour

train_df['Type'] = train_df['Type'].fillna(value = train_df['Type'].describe()[2]) # Fill Type with the most occuring type

In [830]:
train_df['Distance'] = train_df['Distance'].apply(lambda x: str(x).replace(',', '')) # Remove coma from the Distance column
to_predict = []

for i in range(len(train_df['Distance'])): # Make a list of empty entries on distances 
    if train_df['Distance'][i] == 'nan':
        to_predict.append(train_df['Year'][i]) 
to_predict = np.array(to_predict, dtype = 'int')

predictions = distance_estimator.predict(to_predict.reshape(-1, 1)) # Predict those distances using a Regressor model

nans = train_df[train_df['Distance'] == 'nan']
nans['Distance'] = predictions
train_df['Distance'] = train_df['Distance'].replace('nan', np.NaN)
train_df['Distance'] = train_df['Distance'].astype(float)
dist_predictions = pd.DataFrame({'Distance': predictions * 1000}) # Process the predictions

gen_alt = (alt for alt in dist_predictions.Distance) # Put the predictions back into our frame

for i, distance in enumerate(train_df.Distance):
    if not pd.isnull(distance): continue
    try:
        train_df.Distance[i] = next(gen_alt)
    except StopIteration:
        break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nans['Distance'] = predictions
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.Distance[i] = next(gen_alt)


In [846]:
train_df = pd.concat([pd.get_dummies(train_df['Location']), train_df], axis = 1) # Preprocess Type
train_df = train_df.drop('Location', axis = 1) # Remove the categorical 'Location' column

train_df = pd.concat([pd.get_dummies(train_df['Type']), train_df], axis = 1) # Preprocess Location
train_df = train_df.drop('Type', axis = 1) # Remove the categorical 'Type' column

In [852]:
train_df['Distance'] = train_df['Distance'] / 1000 # Scale down the Distance

In [872]:
X = train_df[['Brand New', 'Foreign Used', 'Nigerian Used', 'Abuja', 'Ibadan', 'Lagos', 'Year', 'Colour', 'Distance']]
y = train_df[['Amount (Million Naira)']]

In [881]:
X = np.array(X)
y = np.array(y).reshape(-1)

# Import Models

In [1038]:
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [883]:
# Model 1: Linear Regressor

model_1 = LinearRegression()

model_1.fit(X, y)

model_1_pred = model_1.predict(X)

In [884]:
# Model 2: Support Vector Regressor

model_2 = SVR()

model_2.fit(X, y)

model_2_pred = model_2.predict(X)

In [885]:
# Model 3: RANSAC Regressor

model_3 = RANSACRegressor()

model_3.fit(X, y)

model_3_pred = model_3.predict(X)

In [886]:
# Model 4: Decision Tree Regressor

model_4 = DecisionTreeRegressor()

model_4.fit(X, y)

model_4_pred = model_4.predict(X)

In [887]:
# Model 5: Huber Regressor

model_5 = HuberRegressor()

model_5.fit(X, y)

model_5_pred = model_5.predict(X)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [888]:
# Model 6: Random Forest Regressor

model_6 = RandomForestRegressor()

model_6.fit(X, y)

model_6_pred = model_6.predict(X)

In [1039]:
# Model 7: XGBoost Regressor

model_7 = XGBRegressor()

model_7.fit(X, y)

model_7_pred = model_7.predict(X)

# Evaluate models

In [910]:
model_1_eval = mae(model_1_pred, y), r2_score(model_1_pred, y)
model_1_eval

(7.875000135895567, -1.2012739573903626)

In [911]:
model_2_eval = mae(model_2_pred, y), r2_score(model_2_pred, y)
model_2_eval

(6.212852897765491, -12.258436338434581)

In [912]:
model_3_eval = mae(model_3_pred, y), r2_score(model_3_pred, y)
model_3_eval

(7.179277116207951, -21.441343632164454)

In [913]:
model_4_eval = mae(model_4_pred, y), r2_score(model_4_pred, y)
model_4_eval

(1.9119914710884038, 0.7045853219661491)

In [914]:
model_5_eval = mae(model_5_pred, y), r2_score(model_5_pred, y)
model_5_eval

(6.743606041428859, -6.869522363489231)

In [915]:
model_6_eval = mae(model_6_pred, y), r2_score(model_6_pred, y)
model_6_eval

(3.306169134887186, 0.5614248860743154)

In [1040]:
model_7_eval = mae(model_7_pred, y), r2_score(model_7_pred, y)
model_7_eval

(3.5783997620326775, 0.6365910737704159)

# Best Model: Model 4 (Random Forest Regressor)

In [1042]:
pd.DataFrame({'y_true': y[100:200], 'y_pred': model_4_pred[100:200]})

Unnamed: 0,y_true,y_pred
0,52.00,52.000000
1,2.48,2.480000
2,16.55,18.891667
3,45.00,45.000000
4,2.40,2.245000
...,...,...
95,6.25,8.080625
96,5.80,5.550000
97,10.00,10.000000
98,3.15,3.150000


In [1010]:
test_df = pd.read_csv('Test.csv', index_col = 'VehicleID')

In [1011]:
test_df = test_df.drop('Maker', axis = 1) # Remove the 'Maker' column

test_df = test_df.drop('Model', axis = 1) # Remove the 'Model' column

test_df['Year'] = test_df['Year'].apply(lambda x: str(x)[3:]) # Remove 2000 from the 'Year'
test_df['Year'] = test_df['Year'].replace('', int(test_df['Year'].mode()))
test_df['Year'] = test_df['Year'].fillna(test_df['Year'].mode())

test_df['Colour'] = test_df['Colour'].astype('category').cat.codes # Preprocess Colour

test_df['Type'] = test_df['Type'].fillna(value = test_df['Type'].describe()[2]) # Fill Type with the most occuring type

In [1012]:
test_df['Distance'] = test_df['Distance'].apply(lambda x: str(x).replace(',', '')) # Remove coma from the Distance column
to_predict = []

for i in range(len(test_df['Distance'])): # Make a list of empty entries on distances 
    if test_df['Distance'][i] == 'nan':
        to_predict.append(test_df['Year'][i]) 
to_predict = np.array(to_predict, dtype = 'int')

predictions = distance_estimator.predict(to_predict.reshape(-1, 1)) # Predict those distances using a Regressor model

nans = test_df[test_df['Distance'] == 'nan']
nans['Distance'] = predictions
test_df['Distance'] = test_df['Distance'].replace('nan', np.NaN)
test_df['Distance'] = test_df['Distance'].astype(float)
dist_predictions = pd.DataFrame({'Distance': predictions * 1000}) # Process the predictions

gen_alt = (alt for alt in dist_predictions.Distance) # Put the predictions back into our frame

for i, distance in enumerate(test_df.Distance):
    if not pd.isnull(distance): continue
    try:
        test_df.Distance[i] = next(gen_alt)
    except StopIteration:
        break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nans['Distance'] = predictions
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.Distance[i] = next(gen_alt)


In [1013]:
test_df = pd.concat([pd.get_dummies(test_df['Location']), test_df], axis = 1) # Preprocess Type
test_df = test_df.drop('Location', axis = 1) # Remove the categorical 'Location' column

test_df = pd.concat([pd.get_dummies(test_df['Type']), test_df], axis = 1) # Preprocess Location
test_df = test_df.drop('Type', axis = 1) # Remove the categorical 'Type' column

In [1014]:
test_df['Distance'] = test_df['Distance'] / 1000 # Scale down the Distance

In [1015]:
X_test = test_df[['Brand New', 'Foreign Used', 'Nigerian Used', 'Abuja', 'Ibadan', 'Lagos', 'Year', 'Colour', 'Distance']]

In [1016]:
X_test = np.array(X_test)

In [1017]:
final_predictions = model_4.predict(X_test)

In [1018]:
VehicleIDs = test_df.reset_index()['VehicleID']

In [1019]:
submission = pd.DataFrame(data = final_predictions, columns = ['Amount (Million Naira)'], index = VehicleIDs)

In [1022]:
submission.to_csv('submission.csv')