# Supervised learning Regression


In [1]:
#  import library
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error,r2_score
from sklearn import linear_model
from sklearn.metrics import confusion_matrix

In [2]:
# Read the Auto MPG dataset (CSV expected in the working directory)
mpg_df = pd.read_csv(filepath_or_buffer='auto-mpg.csv')

In [3]:
# Preview the first five rows of the dataset
mpg_df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
# Print column dtypes and non-null counts
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [5]:
# Count rows that exactly repeat an earlier row
mpg_df.duplicated(keep='first').sum()

0

In [6]:
# Number of distinct values in each column
mpg_df.nunique(axis=0)

mpg             129
cylinders         5
displacement     82
horsepower       94
weight          351
acceleration     95
model year       13
origin            3
car name        305
dtype: int64

In [7]:
# List the distinct raw horsepower values
mpg_df['horsepower'].unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [8]:
# Count how many horsepower entries are the '?' placeholder
mpg_df['horsepower'].eq('?').sum()

6

It seems we have 6 values in the `horsepower` column containing `?`, which is why the column has the `object` data type instead of `int`.

# Exploring Summary
Our dataset has a total of 398 records and 9 columns.
We have no NaNs in our dataset nor duplicated rows.
The horsepower column has an inconsistent data type that needs to be handled and cast to int.
origin would need to be parsed and cast into a categorical datatype.
No columns would need to be dropped.

Data Cleaning
Here we perform cleaning operations (dropping rows, mapping columns, converting data types), all of which help us reach more accurate results when creating meaningful and informative visualizations.

Cleaning horsepower column
We drop each row that contains a `?` in the horsepower column.

In [9]:
# Drop rows whose horsepower is the '?' placeholder. The explicit .copy()
# makes the result an independent frame, so assigning to its columns later
# does not raise pandas' SettingWithCopyWarning.
mpg_df = mpg_df[mpg_df.horsepower != '?'].copy()


In [10]:
# Confirm no '?' placeholders remain in horsepower
mpg_df['horsepower'].eq('?').sum()

0

In [11]:
# Cast horsepower to integer. Building a new frame with .assign() avoids
# writing into the filtered frame (which triggers SettingWithCopyWarning) and
# avoids attribute-style column assignment. The column keeps its position.
mpg_df = mpg_df.assign(horsepower=mpg_df['horsepower'].astype(int))


In [12]:
# Confirm the changes: re-print the row count and column dtypes
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    int32  
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car name      392 non-null    object 
dtypes: float64(3), int32(1), int64(4), object(1)
memory usage: 29.1+ KB


In [13]:
# Select features and target by column name rather than by position, so the
# selection does not silently change if the CSV's column order does.
# 'origin' and 'car name' are left out, matching the original 1:7 slice.
feature_columns = ['cylinders', 'displacement', 'horsepower',
                   'weight', 'acceleration', 'model year']
predictors = mpg_df[feature_columns]
target = mpg_df['mpg']

In [14]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
predictors_train, predictors_test, target_train, target_test = train_test_split(
    predictors,
    target,
    test_size=0.3,
    random_state=123,
)

In [15]:
# Create the linear regression estimator (fitted in a later cell)
lm = linear_model.LinearRegression()

In [16]:
# Fit the estimator on the training split; the bare name on the last line
# displays the fitted estimator
model = lm.fit(X=predictors_train, y=target_train)
model

LinearRegression()

In [19]:
# Predict the target for the held-out test predictors
prediction = model.predict(predictors_test)
# Bare name as the last expression displays the whole prediction array
prediction


array([17.6100512 , 31.74230171, 14.10959163, 25.32438169, 31.21577179,
       16.06486159, 29.8691717 , 20.93044248, 17.65070348, 33.47698479,
       15.08065052, 24.49618189, 12.81518279, 30.56639904, 15.73971701,
       22.29360779, 29.21699722,  7.04278921, 12.37178125, 13.52470419,
       22.21031335, 28.81738259, 29.57341674, 35.35992213, 34.90670517,
       15.76013146, 26.87093144, 31.93421609, 22.78815378, 27.35848547,
       23.42021495, 32.01903269, 16.91343699, 20.94876604, 27.54896062,
       28.45210066, 27.10657051, 27.96542931, 26.30141677, 11.68676823,
       17.57477229, 23.67183826, 25.13235343, 22.35402203, 20.31547638,
       19.65281562, 29.01082047, 29.74158774, 24.20415222, 20.34557319,
       21.10896258, 33.65728062, 16.12910145, 22.44036083, 23.48289417,
       14.78013255,  7.40135072, 24.08054663, 19.33028782, 30.23324156,
       16.19413757,  6.24512791, 24.41369359, 27.35443334, 20.47522761,
       33.74746854, 30.86233692, 24.63251989, 26.63175577, 23.00

In [20]:
# Mean squared error between the actual and predicted test targets
mean_squared_error(y_true=target_test, y_pred=prediction)


13.628322750797638

In [21]:
# R^2 (coefficient of determination) of the predictions on the test split
r2_score(y_true=target_test, y_pred=prediction)

0.7611620501999117