In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the MPG dataset directly from its GitHub location into a DataFrame.
DATA_URL = 'https://github.com/YBI-Foundation/Dataset/raw/main/MPG.csv'
df = pd.read_csv(DATA_URL)

In [3]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [4]:
# Column dtypes and non-null counts; a count below the row total marks
# a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [5]:
# Remove the attributes that are not required for modelling.
# The columns are dropped by name rather than by position, and names that are
# already gone are ignored, so re-running this cell is a no-op instead of
# silently stripping three more columns from the frame each time.
df = df.drop(columns=['model_year', 'origin', 'name'], errors='ignore')

In [6]:
# Horsepower has 6 missing values; fill them with the column median.
# SimpleImputer hands back a plain NumPy array, so the result is wrapped in a
# DataFrame again using the original column names.
# NOTE(review): the imputer is fitted on the whole frame, i.e. before the
# train/test split performed further down.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(df)
df = pd.DataFrame(data=X, columns=df.columns)

In [7]:
# How many cars there are for each cylinder count.
df['cylinders'].value_counts()

cylinders
4.0    204
8.0    103
6.0     84
3.0      4
5.0      3
Name: count, dtype: int64

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration
count,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.30402,2970.424623,15.56809
std,7.815984,1.701004,104.269838,38.222625,846.841774,2.757689
min,9.0,3.0,68.0,46.0,1613.0,8.0
25%,17.5,4.0,104.25,76.0,2223.75,13.825
50%,23.0,4.0,148.5,93.5,2803.5,15.5
75%,29.0,8.0,262.0,125.0,3608.0,17.175
max,46.6,8.0,455.0,230.0,5140.0,24.8


## Feature Engineering

In [9]:
# Derived feature: the product of cylinder count and displacement.
df['engine_size'] = df['cylinders'].mul(df['displacement'])
# Confirm the new column is present in the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    float64
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   engine_size   398 non-null    float64
dtypes: float64(7)
memory usage: 21.9 KB


In [10]:
%matplotlib inline
# Histogram overview of every column (currently disabled). The plotting API
# lives in the `matplotlib.pyplot` submodule, so that is what `plt` must alias.
# import matplotlib.pyplot as plt
# df.hist(bins=50, figsize=(20,15))

## Train-Test Split

In [11]:
# Hold out 20% of the rows as a test set, stratified on `cylinders`.
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields *positional* row indices, so rows are selected with .iloc.
# Label-based .loc only gives the same rows while the frame happens to carry
# a default RangeIndex.
for train_index, test_index in split.split(df, df['cylinders']):
    train_set = df.iloc[train_index]
    test_set = df.iloc[test_index]

In [12]:
# Sanity-check the split: sizes and dtypes of both subsets ...
train_set.info()
test_set.info()
# ... and the cylinder mix that ended up in the test set.
test_set['cylinders'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Index: 318 entries, 145 to 362
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           318 non-null    float64
 1   cylinders     318 non-null    float64
 2   displacement  318 non-null    float64
 3   horsepower    318 non-null    float64
 4   weight        318 non-null    float64
 5   acceleration  318 non-null    float64
 6   engine_size   318 non-null    float64
dtypes: float64(7)
memory usage: 19.9 KB
<class 'pandas.core.frame.DataFrame'>
Index: 80 entries, 128 to 240
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           80 non-null     float64
 1   cylinders     80 non-null     float64
 2   displacement  80 non-null     float64
 3   horsepower    80 non-null     float64
 4   weight        80 non-null     float64
 5   acceleration  80 non-null     float64
 6   engine_size   80 non-n

cylinders
4.0    41
8.0    21
6.0    17
3.0     1
Name: count, dtype: int64

In [13]:
# Correlation of every column with the target, strongest positive first.
# NOTE(review): this is computed on the full frame `df`, not on `train_set`.
corr_matrix = df.corr()
corr_matrix.loc[:, 'mpg'].sort_values(ascending=False)

mpg             1.000000
acceleration    0.420289
horsepower     -0.773453
cylinders      -0.775396
engine_size    -0.781124
displacement   -0.804203
weight         -0.831741
Name: mpg, dtype: float64

In [14]:
from pandas.plotting import scatter_matrix
# attributes = ['mpg','acceleration','horsepower','engine_size','displacement','weight']
# scatter_matrix(df[attributes], figsize = (12,8))

In [15]:
## Splitting Features and Labels
# Everything except `mpg` is an input feature; `mpg` itself is the target.
train_set_labels = train_set['mpg'].copy()
train_set_features = train_set.drop(columns='mpg')

## Creating a Pipeline

In [16]:
# The missing values were already fixed above, but the same preprocessing
# step can also live inside the pipeline itself.

from sklearn.pipeline import Pipeline  # chains the preprocessing steps
from sklearn.preprocessing import StandardScaler  # feature scaling

# Median imputation followed by standardisation.
my_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [17]:
# Fit the preprocessing pipeline on the training features and transform them
# in one step; the fitted pipeline is reused later for the test set.
train_set_features_transformed = my_pipeline.fit_transform(train_set_features)

In [18]:
# (rows, features) of the transformed training matrix.
train_set_features_transformed.shape

(318, 6)

## Selecting a Model

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate estimators; exactly one should be active at a time.
model = LinearRegression()
# model = DecisionTreeRegressor()
# model = RandomForestRegressor()

# Train on the preprocessed training features against the mpg labels.
model.fit(X=train_set_features_transformed, y=train_set_labels)

## Evaluating a Model

In [20]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the model on the very data it was fitted on (training error only).
mpg_predictions = model.predict(train_set_features_transformed)
mse = mean_squared_error(y_true=train_set_labels, y_pred=mpg_predictions)
r2score = r2_score(y_true=train_set_labels, y_pred=mpg_predictions)

In [21]:
# Mean squared error on the training set.
mse

15.890271405618043

In [22]:
# R^2 score on the training set.
r2score

0.7297227366974428

## Cross Validation

In [23]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training data. The scorer reports the
# *negated* MSE, so the sign is flipped before taking the root to get RMSE.
scores = cross_val_score(
    estimator=model,
    X=train_set_features_transformed,
    y=train_set_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_scores = np.sqrt(np.negative(scores))

In [24]:
# Per-fold RMSE values from the cross-validation run.
rmse_scores

array([3.69097462, 3.30699851, 5.02836357, 3.75649824, 3.91177455,
       3.71442233, 4.65593255, 2.92768319, 5.08268339, 4.1573812 ])

In [25]:
def print_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Args:
        scores: object exposing ``mean()`` and ``std()`` methods (a NumPy
            array in this notebook).
    """
    print("Scores:", scores)
    # Each summary line is "<label> <statistic>", printed in a fixed order.
    for label, statistic in (("Mean:", scores.mean), ("Standard", scores.std)):
        print(label, statistic())

In [26]:
# Summarise the cross-validated RMSE: raw values, mean and spread.
print_scores(rmse_scores)

Scores: [3.69097462 3.30699851 5.02836357 3.75649824 3.91177455 3.71442233
 4.65593255 2.92768319 5.08268339 4.1573812 ]
Mean: 4.023271215237653
Standard 0.674802069938568


## Saving the Model

In [27]:
from joblib import dump, load

# The estimator was trained on imputed and standardised features, so it is
# only usable together with the fitted preprocessing pipeline: persist both.
dump(my_pipeline, 'Mileage_Pipeline.joblib')
# Kept as the last expression so the cell still echoes the model's file name.
dump(model, 'Mileage_Model.joblib')

['Mileage_Model.joblib']

## Testing 

In [28]:
# Separate the held-out features from the target.
y_test = test_set["mpg"].copy()
x_test = test_set.drop(columns="mpg")

# Reuse the pipeline fitted on the training data: transform only, no refit.
x_test_prepared = my_pipeline.transform(x_test)
final_predictions = model.predict(x_test_prepared)

# Test-set error, reported as MSE and as its square root (RMSE).
final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

In [29]:
# RMSE on the held-out test set.
final_rmse

4.099812014751975

In [30]:
# R^2 score on the held-out test set.
r2_score(y_test,final_predictions)

0.7558360205348842

## Using the Model

In [31]:
# First test row after preprocessing (imputed and standardised values).
x_test_prepared[0]

array([ 0.32260746,  0.56746699, -0.1001386 ,  0.43191963,  0.492422  ,
        0.29530153])

In [32]:
from joblib import dump, load

# Reload the estimator saved earlier in this notebook.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model = load("Mileage_Model.joblib")

# Build the input from a raw test row and push it through the fitted
# preprocessing pipeline, rather than pasting rounded, already-scaled numbers
# by hand. The double brackets keep the row 2-D, as predict() expects.
features = my_pipeline.transform(x_test.iloc[[0]])
model.predict(features)

array([19.17832857])