# ----------- DAY 3 OF AI CLASS AT ALPHA DEVELOPER HUB

# SUPERVISED LEARNING INTRODUCTION TO REGRESSION

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn
import sklearn

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

### Step#2 Loading the dataset

In [2]:
# TODO: Get the datset dataset/AI_Invasion_In-Class_Dataset.xlsx form your AI Invasion
# Study Pack
# Note: You can use pandas read_excel to read file with xlsx format

df = pd.read_excel("dataset/AI_Invasion_In-Class_Dataset.xlsx") 

df.head()

Unnamed: 0,Location,Maker,Model,Year,Colour,Amount (Million ₦),Type,Distance_Km
0,Abuja,Mercedes-Benz,GLA 250,2015.0,Brown,14.5,Foreign Used,50000.0
1,Abuja,Hyundai,Accent,2013.0,Red,1.55,Nigerian Used,
2,Lagos,Lexus,GX 460 Premium,2011.0,White,14.0,Foreign Used,85000.0
3,Lagos,Lexus,ES 350,2011.0,Gray,4.95,Foreign Used,
4,Ibadan,Toyota,Verso 1.6,2009.0,Silver,1.69,Nigerian Used,118906.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4487 entries, 0 to 4486
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Location            4487 non-null   object 
 1   Maker               4487 non-null   object 
 2   Model               4487 non-null   object 
 3   Year                4487 non-null   float64
 4   Colour              4487 non-null   object 
 5   Amount (Million ₦)  4487 non-null   float64
 6   Type                4487 non-null   object 
 7   Distance_Km         2932 non-null   float64
dtypes: float64(3), object(5)
memory usage: 280.6+ KB


In [4]:
df.describe()

Unnamed: 0,Year,Amount (Million ₦),Distance_Km
count,4487.0,4487.0,2932.0
mean,2011.09561,11.309795,101038.3
std,4.823362,20.585915,115091.4
min,1982.0,0.42,1.0
25%,2008.0,3.6,52378.5
50%,2011.0,5.7,79000.0
75%,2014.0,12.0,109939.2
max,2022.0,454.0,1785448.0


### Step#3 Clean the dataset

In [5]:
df.columns

Index(['Location', 'Maker', 'Model', 'Year', 'Colour', 'Amount (Million ₦)',
       'Type', 'Distance_Km'],
      dtype='object')

In [6]:
# Check for missing value
df.isnull().sum()

Location                 0
Maker                    0
Model                    0
Year                     0
Colour                   0
Amount (Million ₦)       0
Type                     0
Distance_Km           1555
dtype: int64

In [7]:
# fill up missing values in Distance_Km will the mean
mean_value = df["Distance_Km"].mean()
print(mean_value)

df["Distance_Km"].fillna(mean_value, inplace=True)

101038.32128240108


In [8]:
# Check and make sure all missing valuen have been filled
df.isnull().sum()

Location              0
Maker                 0
Model                 0
Year                  0
Colour                0
Amount (Million ₦)    0
Type                  0
Distance_Km           0
dtype: int64

In [10]:
# The main of this section is to rename the different 
# class in our categorigal feature that were not properly named.
# or chanage the data type of a column

cat_features = {
    "Location",
    "Model",
    "Maker",
    "Year",
    "Colour",
    "Type",
}

for cat_feature in cat_features:
  print(cat_feature, df[cat_feature].unique(), sep=":")
  print("#"*50)

Colour:['Brown' 'Red' 'White' 'Gray' 'Silver' 'Black' 'Blue' 'Gold' 'Green'
 'Beige' 'Purple' 'Orange' 'Burgandy' 'Ivory' 'Pink' 'Pearl' 'Yellow'
 'Luury' 'Teal']
##################################################
Location:['Abuja' 'Lagos' 'Ibadan']
##################################################
Model:['GLA 250' 'Accent' 'GX 460 Premium' 'ES 350' 'Verso 1.6' 'Corolla 1.8 LE'
 'E350' 'GL-Class' 'RX 350 AWD' 'Land Cruiser 3.5 V6' 'Matrix'
 'Land Cruiser' 'C350' 'Corolla' 'IS 250 4WD' 'Venza V6' 'CX-7' 'RX 350'
 'Highlander Limited 4x4' 'RX' 'RX 350 F Sport AWD' 'Camry'
 'Land Cruiser 5.7 V8 VX-S' 'GLK-Class' 'Avalon' 'GS 300' 'Accord'
 '4-Runner' 'Civic' 'ES 330 Sedan' 'Corolla LE (1.8L 4cyl 2A)' 'Santa Fe'
 'Highlander' 'Elantra' '4-Runner Limited V6' 'Venza Limited FWD V6'
 'M Class ML 350 4Matic' 'M Class' 'Hyundai Kona' 'C300'
 'Camry XLE V6 FWD' 'Range Rover Velar' 'IS 250' 'Highlander Limited'
 'RAV4 Limited FWD' 'Cayenne' 'RX 330' 'RDX' 'Corolla XSE (1.8L 4cyl 2A)'
 'Micra' 'V

In [11]:
# Drop the Model feature
df.drop("Model", axis=1, inplace=True)
df.head()

Unnamed: 0,Location,Maker,Year,Colour,Amount (Million ₦),Type,Distance_Km
0,Abuja,Mercedes-Benz,2015.0,Brown,14.5,Foreign Used,50000.0
1,Abuja,Hyundai,2013.0,Red,1.55,Nigerian Used,101038.321282
2,Lagos,Lexus,2011.0,White,14.0,Foreign Used,85000.0
3,Lagos,Lexus,2011.0,Gray,4.95,Foreign Used,101038.321282
4,Ibadan,Toyota,2009.0,Silver,1.69,Nigerian Used,118906.0


In [12]:
# Label Encoding
cat_features = ["Location","Maker","Year","Colour","Type"]

for cat_feature in cat_features:
  df[f"{cat_feature}_cat"] = df[cat_feature].astype('category')
  df[f"{cat_feature}_cat"] = df[f"{cat_feature}_cat"].cat.codes


# Read more on Pandas get_dummies

df.head()

Unnamed: 0,Location,Maker,Year,Colour,Amount (Million ₦),Type,Distance_Km,Location_cat,Maker_cat,Year_cat,Colour_cat,Type_cat
0,Abuja,Mercedes-Benz,2015.0,Brown,14.5,Foreign Used,50000.0,0,26,22,3,1
1,Abuja,Hyundai,2013.0,Red,1.55,Nigerian Used,101038.321282,0,14,20,14,2
2,Lagos,Lexus,2011.0,White,14.0,Foreign Used,85000.0,2,23,18,17,1
3,Lagos,Lexus,2011.0,Gray,4.95,Foreign Used,101038.321282,2,23,18,6,1
4,Ibadan,Toyota,2009.0,Silver,1.69,Nigerian Used,118906.0,1,44,16,15,2


In [13]:
# Drop the reductant features since Label encoding have been done
df.drop(["Location","Maker","Year","Colour", "Type"], axis=1, inplace=True)
df.head()

Unnamed: 0,Amount (Million ₦),Distance_Km,Location_cat,Maker_cat,Year_cat,Colour_cat,Type_cat
0,14.5,50000.0,0,26,22,3,1
1,1.55,101038.321282,0,14,20,14,2
2,14.0,85000.0,2,23,18,17,1
3,4.95,101038.321282,2,23,18,6,1
4,1.69,118906.0,1,44,16,15,2


### Step#4 Perform data segmentation

In [14]:
y = df["Amount (Million ₦)"] # Target
X = df.drop("Amount (Million ₦)", axis=1)

In [15]:
# trainging and test varible for machine learning
# text_size as in 0.1 1 but in % of 100
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Step#5 Load your data into the Linear Regression model i.e Train your model

In [16]:
# ------- from sklearn we only need linear_model with come handy for LinaerRegression
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
# ------ here we used our training variable that is split before at step 4
reg.fit(X_train, y_train)

LinearRegression()

## Step#6 Make predictions

In [17]:
reg.predict(X_test)

array([19.25764585,  8.31560181,  3.96015243, ...,  7.04487268,
        2.86909433,  6.32876847])

## Step#7 Evaluate your model

In [18]:
# -----here also we import metrics from  sklearn library 
from sklearn.metrics import mean_absolute_error

# we are using mean_absolute_error because 
# this is a regression model
# --- we reasign t_test to y_pred and call linearRegression to predict()
y_pred = reg.predict(X_test)

print("MAE",mean_absolute_error(y_test,y_pred))

MAE 7.998570333392069


# Other Machine Learning Algorithms

## Decision Tree

In [19]:
from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)
y_pred = dt_reg.predict(X_test)

print("MAE",mean_absolute_error(y_test,y_pred))

MAE 5.1622202464736455


## SVM

In [20]:
from sklearn.svm import SVR

sv_reg = SVR()
sv_reg.fit(X_train, y_train)
y_pred = sv_reg.predict(X_test)
print("MAE",mean_absolute_error(y_test,y_pred))

MAE 6.828764109560064


## Random Forest (Activity)