## Import libraries

In [1]:
# this code imports libraries into the program
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
# this code ignores all error messages the table would have returned
import warnings
warnings.filterwarnings("ignore")

## Load Data

In [2]:
# Loading train data
train = pd.read_csv("Train.csv")
train.sample(10)

Unnamed: 0,VehicleID,Location,Maker,Model,Year,Colour,Amount (Million Naira),Type,Distance
3328,VHL14620,Abuja,Lexus,RX 350,2009,Black,5.63,Nigerian Used,80188.0
2669,VHL18876,Abuja,Ford,Edge,2015,Blue,9.9,Foreign Used,31594.0
1109,VHL14966,Lagos,Mercedes-Benz,M Class ML 350 4Matic,2011,White,6.75,Foreign Used,
1915,VHL15126,Abuja,Toyota,Camry XSE V6 (3.5L V6 8A),2019,White,16.0,Foreign Used,
3896,VHL12909,Abuja,Honda,Civic Sedan EX,2013,Gray,3.3,Nigerian Used,82547.0
4540,VHL13106,Abuja,Mercedes-Benz,GLK-Class,2009,Red,6.7,Foreign Used,
4284,VHL10677,Lagos,Mercedes-Benz,M Class ML 350 4Matic,2012,Gray,11.65,Foreign Used,
6040,VHL15223,Ibadan,Toyota,Venza AWD,2011,Gray,5.85,Nigerian Used,66525.0
2704,VHL12860,Lagos,Toyota,Camry 2.4 LE,2008,Black,4.1,Foreign Used,
4026,VHL14563,Lagos,Land Rover,Range Rover,2018,Black,45.0,Foreign Used,30092.0


### Exploratory Data Analysis

In [3]:
# this code describes the dataset available
train.shape
rows, columns = train.shape
print(f'We have {rows} rows and {columns} columns')

We have 7205 rows and 9 columns


In [4]:
# this code returns instances with null value
train.isnull().sum()

VehicleID                    0
Location                     0
Maker                        0
Model                        0
Year                        21
Colour                       0
Amount (Million Naira)      17
Type                       197
Distance                  2360
dtype: int64

In [5]:
# returns data on the datatypes and non-null count in the dataset
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7205 entries, 0 to 7204
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   VehicleID               7205 non-null   object 
 1   Location                7205 non-null   object 
 2   Maker                   7205 non-null   object 
 3   Model                   7205 non-null   object 
 4   Year                    7184 non-null   object 
 5   Colour                  7205 non-null   object 
 6   Amount (Million Naira)  7188 non-null   float64
 7   Type                    7008 non-null   object 
 8   Distance                4845 non-null   object 
dtypes: float64(1), object(8)
memory usage: 506.7+ KB


## Feature Engineering

In [6]:
# eliminates irrelevant features from the dataset
train_red = train[['Location','Maker','Model','Year','Type','Amount (Million Naira)','Colour']]
train_red.head()

Unnamed: 0,Location,Maker,Model,Year,Type,Amount (Million Naira),Colour
0,Abuja,Honda,Accord Coupe EX V-6,2011,Nigerian Used,2.2,Silver
1,Ibadan,Hyundai,Sonata,2012,Nigerian Used,3.5,Silver
2,Lagos,Lexus,RX 350,2010,Foreign Used,9.2,Red
3,Abuja,Mercedes-Benz,GLE-Class,2017,Foreign Used,22.8,Blue
4,Ibadan,Toyota,Highlander,2002,Nigerian Used,2.6,Red


In [7]:
# This code drops the null values in a dataset
train_red.dropna(inplace=True)
# tells you if any value is null in the dataset. 'train_red.isna().sum()' returns the number of instances with null values
train_red.isna().any()

Location                  False
Maker                     False
Model                     False
Year                      False
Type                      False
Amount (Million Naira)    False
Colour                    False
dtype: bool

In [8]:
# this returns the number of unique values for each feature
for column in train_red.columns:
    print(column, 'has', train_red[column].nunique(),' unique features')

Location has 3  unique features
Maker has 54  unique features
Model has 1180  unique features
Year has 33  unique features
Type has 3  unique features
Amount (Million Naira) has 679  unique features
Colour has 19  unique features


## Data Processing

In [9]:
# sklearn contains ML algorithms necessary for ML operations so we are importing various tools from the library
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import LabelEncoder,PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

In [10]:
train_red.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6972 entries, 0 to 7204
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Location                6972 non-null   object 
 1   Maker                   6972 non-null   object 
 2   Model                   6972 non-null   object 
 3   Year                    6972 non-null   object 
 4   Type                    6972 non-null   object 
 5   Amount (Million Naira)  6972 non-null   float64
 6   Colour                  6972 non-null   object 
dtypes: float64(1), object(6)
memory usage: 435.8+ KB


In [11]:
cat_col = ['Maker', 'Model', 'Type', 'Colour']

for col in cat_col:
    lab = LabelEncoder()#CountEncoder(normalize=True)
    lab.fit(train_red[col])
    train_red[col] = lab.transform(train_red[col])

In [12]:
# replace the comma in the Year colum with blank
train_red['Year'] = train_red['Year'].str.replace(',', '')

In [13]:
train_red['Year'] = pd.to_numeric(train_red['Year'])

In [14]:
train_red.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6972 entries, 0 to 7204
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Location                6972 non-null   object 
 1   Maker                   6972 non-null   int32  
 2   Model                   6972 non-null   int32  
 3   Year                    6972 non-null   int64  
 4   Type                    6972 non-null   int32  
 5   Amount (Million Naira)  6972 non-null   float64
 6   Colour                  6972 non-null   int32  
dtypes: float64(1), int32(4), int64(1), object(1)
memory usage: 326.8+ KB


In [15]:
# dummy encoding
Locat = pd.get_dummies(train_red['Location'], drop_first=True)
Locat.head()

Unnamed: 0,Ibadan,Lagos
0,0,0
1,1,0
2,0,1
3,0,0
4,1,0


In [16]:
train_red.head()

Unnamed: 0,Location,Maker,Model,Year,Type,Amount (Million Naira),Colour
0,Abuja,16,116,2011,2,2.2,15
1,Ibadan,18,1019,2012,2,3.5,15
2,Lagos,28,881,2010,1,9.2,14
3,Abuja,33,490,2017,1,22.8,2
4,Ibadan,51,548,2002,2,2.6,14


In [17]:
all_data = pd.concat([train_red.drop('Location',axis=1), Locat], axis=1).reset_index(drop = True)
all_data.sample(10)

Unnamed: 0,Maker,Model,Year,Type,Amount (Million Naira),Colour,Ibadan,Lagos
6764,33,90,2019,1,150.0,17,0,0
641,33,479,2016,1,35.0,17,0,1
3928,33,210,2010,1,4.1,1,0,0
245,28,881,2010,1,8.8,7,0,1
1687,28,881,2010,1,8.1,15,0,0
6594,0,868,2016,1,9.2,17,0,1
4337,23,704,2005,2,1.45,7,0,0
6281,28,881,2007,2,3.6,7,0,0
2297,51,1098,2014,1,11.5,1,0,1
4686,51,342,2006,1,3.0,17,0,0


In [18]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6972 entries, 0 to 6971
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Maker                   6972 non-null   int32  
 1   Model                   6972 non-null   int32  
 2   Year                    6972 non-null   int64  
 3   Type                    6972 non-null   int32  
 4   Amount (Million Naira)  6972 non-null   float64
 5   Colour                  6972 non-null   int32  
 6   Ibadan                  6972 non-null   uint8  
 7   Lagos                   6972 non-null   uint8  
dtypes: float64(1), int32(4), int64(1), uint8(2)
memory usage: 231.6 KB


In [19]:
all_data.isnull().sum()
#to check if there is any null value in the data set

Maker                     0
Model                     0
Year                      0
Type                      0
Amount (Million Naira)    0
Colour                    0
Ibadan                    0
Lagos                     0
dtype: int64

In [20]:
Features = all_data.drop('Amount (Million Naira)', axis=1)
Target = all_data['Amount (Million Naira)']
X = Features
Y = Target

In [21]:
print(X.head())

   Maker  Model  Year  Type  Colour  Ibadan  Lagos
0     16    116  2011     2      15       0      0
1     18   1019  2012     2      15       1      0
2     28    881  2010     1      14       0      1
3     33    490  2017     1       2       0      0
4     51    548  2002     2      14       1      0


In [22]:
print(Y.head())

0     2.2
1     3.5
2     9.2
3    22.8
4     2.6
Name: Amount (Million Naira), dtype: float64


In [23]:
random_state = 42
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=random_state)

## Modelling

In [24]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [25]:
model = LinearRegression()
model.fit(X_train, Y_train)
Accuracy=model.score(X_test, Y_test)
print(f"The accuracy of the model is {Accuracy}")

The accuracy of the model is 0.26890330797940276


In [26]:
model_2 = DecisionTreeRegressor()
model_2.fit(X_train, Y_train)
Accuracy_2=model_2.score(X_test, Y_test)
print(f"The accuracy of the model is {Accuracy_2}")

The accuracy of the model is 0.7635441361154418


In [27]:
model_3 = RandomForestRegressor()
model_3.fit(X_train, Y_train)
Accuracy_3=model_3.score(X_test, Y_test)
print(f"The accuracy of the model is {Accuracy_3}")

The accuracy of the model is 0.7925638674656927


## Marketing Prediction

In [28]:
X_test.head()

Unnamed: 0,Maker,Model,Year,Type,Colour,Ibadan,Lagos
132,28,881,2010,1,7,1,0
5571,16,115,2003,2,8,0,1
2168,16,97,2010,2,2,0,0
4086,32,252,2014,2,1,0,0
4787,51,547,2010,2,17,0,1


In [29]:
Y_train

3909     5.50
5957     3.26
652     11.50
33       2.40
3078     3.35
        ...  
3772     6.00
5191     1.24
5226     2.25
5390     7.50
860     17.50
Name: Amount (Million Naira), Length: 5577, dtype: float64

In [30]:
train_red.head()

Unnamed: 0,Location,Maker,Model,Year,Type,Amount (Million Naira),Colour
0,Abuja,16,116,2011,2,2.2,15
1,Ibadan,18,1019,2012,2,3.5,15
2,Lagos,28,881,2010,1,9.2,14
3,Abuja,33,490,2017,1,22.8,2
4,Ibadan,51,548,2002,2,2.6,14


In [31]:
pred = model_2.predict(np.array([[16, 116, 2011, 2, 15, 0, 0]]))
print(pred)

[2.2]


In [32]:
pred = model.predict(np.array([[16, 116, 2011, 2, 15, 0, 0]]))
print(pred)

[3.96421511]


In [33]:
pred = model_3.predict(np.array([[16, 116, 2011, 2, 15, 0, 0]]))
print(pred)

[2.4176]


In [34]:
pred = model_2.predict(np.array([[28, 881, 2010, 1, 9, 1, 0]]))
print(f'The price of the car is {pred} million naira')

The price of the car is [9.] million naira


## Saving the Model

In [35]:
# pickle is for small models
# Joblib is for larger models
import pickle 
import joblib

In [36]:
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [37]:
# for pickle
with open('model_2.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [38]:
# for joblib
dump(model_2, 'model_2.joblib')
loaded = load('model_2.joblib')

NameError: name 'dump' is not defined

## Conclusion