# STEP 1: IMPORTING OUR LIBRARIES

In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# STEP 2: LOADING OUR DATASET

In [3]:
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")

# To check our train dataset
display(train.head())

Unnamed: 0,VehicleID,Location,Maker,Model,Year,Colour,Amount (Million Naira),Type,Distance
0,VHL12546,Abuja,Honda,Accord Coupe EX V-6,2011,Silver,2.2,Nigerian Used,
1,VHL18827,Ibadan,Hyundai,Sonata,2012,Silver,3.5,Nigerian Used,125000.0
2,VHL19499,Lagos,Lexus,RX 350,2010,Red,9.2,Foreign Used,110852.0
3,VHL17991,Abuja,Mercedes-Benz,GLE-Class,2017,Blue,22.8,Foreign Used,30000.0
4,VHL12170,Ibadan,Toyota,Highlander,2002,Red,2.6,Nigerian Used,125206.0


In [4]:
#To check our test dataset
display(test.head())

Unnamed: 0,VehicleID,Location,Maker,Model,Year,Colour,Type,Distance
0,VHL18518,Abuja,BMW,323i,2008,White,Foreign Used,30524.0
1,VHL17149,Lagos,Toyota,Camry,2013,White,Foreign Used,
2,VHL10927,Lagos,Toyota,Highlander Limited V6,2005,Gold,Foreign Used,
3,VHL12909,Lagos,Toyota,Camry,2011,Gray,Foreign Used,166839.0
4,VHL12348,Lagos,Lexus,ES 350 FWD,2013,Red,Foreign Used,88862.0


In [5]:
#Information on our train dataset

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7205 entries, 0 to 7204
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   VehicleID               7205 non-null   object 
 1   Location                7205 non-null   object 
 2   Maker                   7205 non-null   object 
 3   Model                   7205 non-null   object 
 4   Year                    7184 non-null   object 
 5   Colour                  7205 non-null   object 
 6   Amount (Million Naira)  7188 non-null   float64
 7   Type                    7008 non-null   object 
 8   Distance                4845 non-null   object 
dtypes: float64(1), object(8)
memory usage: 506.7+ KB


In [6]:
#Information on our test dataset
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2061 entries, 0 to 2060
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   VehicleID  2061 non-null   object 
 1   Location   2061 non-null   object 
 2   Maker      2061 non-null   object 
 3   Model      2061 non-null   object 
 4   Year       2059 non-null   object 
 5   Colour     2061 non-null   object 
 6   Type       2007 non-null   object 
 7   Distance   1385 non-null   float64
dtypes: float64(1), object(7)
memory usage: 128.9+ KB


In [7]:
#To get some descriptive statistics on our train dataset
train.describe()

Unnamed: 0,Amount (Million Naira)
count,7188.0
mean,11.847999
std,25.318922
min,0.45
25%,3.5
50%,5.65
75%,11.6625
max,456.0


In [8]:
#To get some descriptive statistics on our test dataset
test.describe()

Unnamed: 0,Distance
count,1385.0
mean,103800.668592
std,105986.234512
min,1.0
25%,52352.0
50%,82000.0
75%,120398.0
max,985216.0


# STEP 3: Cleaning our dataset

In [9]:
train.columns

Index(['VehicleID', 'Location', 'Maker', 'Model', 'Year', 'Colour',
       'Amount (Million Naira)', 'Type', 'Distance'],
      dtype='object')

In [10]:
test.columns

Index(['VehicleID', 'Location', 'Maker', 'Model', 'Year', 'Colour', 'Type',
       'Distance'],
      dtype='object')

In [11]:
#Check for missing value on our train dataset
train.isnull().sum()

VehicleID                    0
Location                     0
Maker                        0
Model                        0
Year                        21
Colour                       0
Amount (Million Naira)      17
Type                       197
Distance                  2360
dtype: int64

In [12]:
#Check for missing value on our test dataset
test.isnull().sum()

VehicleID      0
Location       0
Maker          0
Model          0
Year           2
Colour         0
Type          54
Distance     676
dtype: int64

In [13]:
#Remove train rows for missing values and reset numbering

train = train[~train["Amount (Million Naira)"].isnull()].reset_index(drop=True)
train = train[~train["Year"].isnull()].reset_index(drop=True)
train = train[~train["Type"].isnull()].reset_index(drop=True)
train = train[~train["Distance"].isnull()].reset_index(drop=True)

In [14]:
train.isnull().sum()

VehicleID                 0
Location                  0
Maker                     0
Model                     0
Year                      0
Colour                    0
Amount (Million Naira)    0
Type                      0
Distance                  0
dtype: int64

In [15]:
train = train.drop("VehicleID", axis=1)
test = test.drop("VehicleID", axis=1)


In [16]:
train.head()

Unnamed: 0,Location,Maker,Model,Year,Colour,Amount (Million Naira),Type,Distance
0,Ibadan,Hyundai,Sonata,2012,Silver,3.5,Nigerian Used,125000
1,Lagos,Lexus,RX 350,2010,Red,9.2,Foreign Used,110852
2,Abuja,Mercedes-Benz,GLE-Class,2017,Blue,22.8,Foreign Used,30000
3,Ibadan,Toyota,Highlander,2002,Red,2.6,Nigerian Used,125206
4,Lagos,Toyota,Sienna,2012,Blue,7.76,Foreign Used,350882


In [17]:
test.head()

Unnamed: 0,Location,Maker,Model,Year,Colour,Type,Distance
0,Abuja,BMW,323i,2008,White,Foreign Used,30524.0
1,Lagos,Toyota,Camry,2013,White,Foreign Used,
2,Lagos,Toyota,Highlander Limited V6,2005,Gold,Foreign Used,
3,Lagos,Toyota,Camry,2011,Gray,Foreign Used,166839.0
4,Lagos,Lexus,ES 350 FWD,2013,Red,Foreign Used,88862.0


In [18]:
#label encoding
cat_features = ["Location","Maker","Year","Colour","Type","Model","Distance"]

for cat_feature in cat_features:
  train[f"{cat_feature}_cat"] = train[cat_feature].astype('category')
  train[f"{cat_feature}_cat"] = train[f"{cat_feature}_cat"].cat.codes

cat_features = ["Location","Maker","Year","Colour","Type", "Model","Distance"]
    
for cat_feature in cat_features:
  test[f"{cat_feature}_cat"] = test[cat_feature].astype('category')
  test[f"{cat_feature}_cat"] = test[f"{cat_feature}_cat"].cat.codes

In [19]:
train.head()

Unnamed: 0,Location,Maker,Model,Year,Colour,Amount (Million Naira),Type,Distance,Location_cat,Maker_cat,Year_cat,Colour_cat,Type_cat,Model_cat,Distance_cat
0,Ibadan,Hyundai,Sonata,2012,Silver,3.5,Nigerian Used,125000,1,16,20,14,2,828,443
1,Lagos,Lexus,RX 350,2010,Red,9.2,Foreign Used,110852,2,25,18,13,1,718,222
2,Abuja,Mercedes-Benz,GLE-Class,2017,Blue,22.8,Foreign Used,30000,0,30,25,2,1,391,1334
3,Ibadan,Toyota,Highlander,2002,Red,2.6,Nigerian Used,125206,1,46,10,13,2,440,447
4,Lagos,Toyota,Sienna,2012,Blue,7.76,Foreign Used,350882,2,46,20,2,1,801,1468


In [20]:
display(test.head())

Unnamed: 0,Location,Maker,Model,Year,Colour,Type,Distance,Location_cat,Maker_cat,Year_cat,Colour_cat,Type_cat,Model_cat,Distance_cat
0,Abuja,BMW,323i,2008,White,Foreign Used,30524.0,0,2,14,16,1,8,131
1,Lagos,Toyota,Camry,2013,White,Foreign Used,,2,37,19,16,1,123,-1
2,Lagos,Toyota,Highlander Limited V6,2005,Gold,Foreign Used,,2,37,11,7,1,272,-1
3,Lagos,Toyota,Camry,2011,Gray,Foreign Used,166839.0,2,37,17,8,1,123,838
4,Lagos,Lexus,ES 350 FWD,2013,Red,Foreign Used,88862.0,2,20,19,12,1,192,528


In [21]:
# Dropping the reductant train features since we've done our Label encoding
train.drop(["Location","Maker","Year","Colour", "Type", "Model", "Distance"], axis=1, inplace=True)

In [22]:
train.head()

Unnamed: 0,Amount (Million Naira),Location_cat,Maker_cat,Year_cat,Colour_cat,Type_cat,Model_cat,Distance_cat
0,3.5,1,16,20,14,2,828,443
1,9.2,2,25,18,13,1,718,222
2,22.8,0,30,25,2,1,391,1334
3,2.6,1,46,10,13,2,440,447
4,7.76,2,46,20,2,1,801,1468


In [23]:
# Dropping the reductant test features since we've done our Label encoding
test.drop(["Location","Maker","Year","Colour", "Type", "Model","Distance"], axis=1, inplace=True)

In [24]:
test.head()

Unnamed: 0,Location_cat,Maker_cat,Year_cat,Colour_cat,Type_cat,Model_cat,Distance_cat
0,0,2,14,16,1,8,131
1,2,37,19,16,1,123,-1
2,2,37,11,7,1,272,-1
3,2,37,17,8,1,123,838
4,2,20,19,12,1,192,528


# STEP 4: Data Segmentation

In [25]:
y = train["Amount (Million Naira)"] 
x = train.drop("Amount (Million Naira)", axis=1)

In [26]:
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Step 5: Loading data into the Linear Regression model(Training our model)

In [27]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(x_train, y_train)

LinearRegression()

# STEP 6: Predictions

In [28]:
reg.predict(x_test)

array([21.04038626, -0.59634005,  1.35052696, ...,  6.62479944,
       24.12169985,  0.58434001])

# STEP 7: Evaluate our models

In [29]:
from sklearn.metrics import mean_absolute_error


y_pred = reg.predict(x_test)

print("MAE",mean_absolute_error(y_test,y_pred))

MAE 7.315803011727407


In [43]:
from sklearn.metrics import mean_squared_error

y_pred = reg.predict(x_test)
print("MSE", mean_squared_error(y_test,y_pred))

MSE 277.10423515398713


In [44]:
rmse=mean_squared_error(y_test,y_pred,squared =False)
rmse

16.64644812426925

# Prediction with test dataset

In [45]:
test_pred = reg.predict(test)

In [46]:
test_pred

array([ 7.61573884, 12.45751859,  1.81249891, ..., 18.00259799,
       16.49643809,  9.97411524])


# Export our submission file

In [47]:
sub = pd.read_csv("SampleSubmission.csv")
sub.head()

Unnamed: 0,VehicleID,Amount (Million Naira)
0,VHL18518,1.0
1,VHL17149,1.0
2,VHL10927,1.0
3,VHL12909,1.0
4,VHL12348,1.0


In [48]:
sub["Amount (Million Naira)"] = test_pred

In [49]:
sub

Unnamed: 0,VehicleID,Amount (Million Naira)
0,VHL18518,7.615739
1,VHL17149,12.457519
2,VHL10927,1.812499
3,VHL12909,8.300821
4,VHL12348,13.106978
...,...,...
2056,VHL17903,27.144549
2057,VHL14018,14.931807
2058,VHL17473,18.002598
2059,VHL11480,16.496438


In [50]:
sub.to_csv("FAIC_HACKATHON.csv", index= False)