# Prepare the environment

## For Manipulation

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
pd.__version__

'1.3.5'

In [13]:
#!pip install ipywidgets==7.7.0

In [12]:
# other libraries for interactive plotting (widgets)
from ipywidgets import interact, interactive, fixed, interact_manual

## For Model

In [42]:
# to randomly split our data into training and testing sets
from sklearn.model_selection import train_test_split

# for using linear models
from sklearn.linear_model import LinearRegression

# to perform cross-validation
from sklearn.model_selection import cross_val_score
# You can also use the function 'cross_val_predict' to predict the output
from sklearn.model_selection import cross_val_predict

## Read our dataframe

In [5]:
# path of the CSV file to load, relative to the notebook's working directory
path_file = 'cars_to_model_evaluation_and_refine.csv'
path_file

'cars_to_model_evaluation_and_refine.csv'

In [6]:
# read the CSV file into a data frame
# NOTE(review): the displayed frame contains 'Unnamed: 0*' columns, which look like
# row indexes written out by earlier to_csv calls — TODO confirm and consider dropping them
df_cars = pd.read_csv(path_file)
df_cars

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,symboling,normalized-losses,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,...,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,horsepower-binned,diesel,gas
0,0,0,3,122,alfa-romero,std,two,convertible,rwd,front,...,9.0,111.0,5000.0,21,27,13495.0,11.190476,Medium,0,1
1,1,1,3,122,alfa-romero,std,two,convertible,rwd,front,...,9.0,111.0,5000.0,21,27,16500.0,11.190476,Medium,0,1
2,2,2,1,122,alfa-romero,std,two,hatchback,rwd,front,...,9.0,154.0,5000.0,19,26,16500.0,12.368421,Medium,0,1
3,3,3,2,164,audi,std,four,sedan,fwd,front,...,10.0,102.0,5500.0,24,30,13950.0,9.791667,Medium,0,1
4,4,4,2,164,audi,std,four,sedan,4wd,front,...,8.0,115.0,5500.0,18,22,17450.0,13.055556,Medium,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,196,196,-1,95,volvo,std,four,sedan,rwd,front,...,9.5,114.0,5400.0,23,28,16845.0,10.217391,Medium,0,1
197,197,197,-1,95,volvo,turbo,four,sedan,rwd,front,...,8.7,160.0,5300.0,19,25,19045.0,12.368421,High,0,1
198,198,198,-1,95,volvo,std,four,sedan,rwd,front,...,8.8,134.0,5500.0,18,23,21485.0,13.055556,Medium,0,1
199,199,199,-1,95,volvo,turbo,four,sedan,rwd,front,...,23.0,106.0,4800.0,26,27,22470.0,9.038462,Medium,1,0


In [8]:
# keep only the numerical columns
# (public select_dtypes API instead of the private DataFrame._get_numeric_data helper)
df_cars = df_cars.select_dtypes(include='number')
df_cars.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,...,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,diesel,gas
0,0,0,3,122,88.6,0.811148,0.890278,48.8,2548,130,...,2.68,9.0,111.0,5000.0,21,27,13495.0,11.190476,0,1
1,1,1,3,122,88.6,0.811148,0.890278,48.8,2548,130,...,2.68,9.0,111.0,5000.0,21,27,16500.0,11.190476,0,1
2,2,2,1,122,94.5,0.822681,0.909722,52.4,2823,152,...,3.47,9.0,154.0,5000.0,19,26,16500.0,12.368421,0,1
3,3,3,2,164,99.8,0.84863,0.919444,54.3,2337,109,...,3.4,10.0,102.0,5500.0,24,30,13950.0,9.791667,0,1
4,4,4,2,164,99.4,0.84863,0.922222,54.3,2824,136,...,3.4,8.0,115.0,5500.0,18,22,17450.0,13.055556,0,1


In [9]:
# summary of the columns: dtype and non-null count of each one
df_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         201 non-null    int64  
 1   Unnamed: 0.1       201 non-null    int64  
 2   symboling          201 non-null    int64  
 3   normalized-losses  201 non-null    int64  
 4   wheel-base         201 non-null    float64
 5   length             201 non-null    float64
 6   width              201 non-null    float64
 7   height             201 non-null    float64
 8   curb-weight        201 non-null    int64  
 9   engine-size        201 non-null    int64  
 10  bore               201 non-null    float64
 11  stroke             197 non-null    float64
 12  compression-ratio  201 non-null    float64
 13  horsepower         201 non-null    float64
 14  peak-rpm           201 non-null    float64
 15  city-mpg           201 non-null    int64  
 16  highway-mpg        201 non

## Functions for plotting

In [14]:
def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
    """Overlay the kernel density estimates of two samples on one figure.

    Parameters
    ----------
    RedFunction, BlueFunction : array-like
        Samples whose density curves are drawn in red and blue respectively.
    RedName, BlueName : str
        Legend labels for the red and blue curves.
    Title : str
        Title of the figure.

    Returns
    -------
    None
        The figure is shown and then closed.
    """
    width = 12
    height = 10
    fig, ax = plt.subplots(figsize=(width, height))

    curves = ((RedFunction, "r", RedName), (BlueFunction, "b", BlueName))
    for sample, color, label in curves:
        # sns.distplot is deprecated; kdeplot draws the same density curve.
        # Flatten to 1-D floats and drop NaNs first (as distplot did) so that a
        # column-shaped input is still drawn as a single curve.
        values = np.asarray(sample, dtype=float).ravel()
        values = values[~np.isnan(values)]
        sns.kdeplot(values, color=color, label=label, ax=ax)

    ax.set_title(Title)
    ax.set_xlabel('Price (in dollars)')
    ax.set_ylabel('Proportion of Cars')
    # without an explicit legend the curve labels are never displayed
    ax.legend()

    plt.show()
    plt.close(fig)

In [17]:
def PollyPlot(xtrain, xtest, y_train, y_test, lr, poly_transform):
    """Plot training points, test points and the model's predicted curve.

    Parameters
    ----------
    xtrain, y_train : training data (xtrain must expose ``.values``)
    xtest, y_test : testing data (xtest must expose ``.values``)
    lr : regression object; its ``predict`` method is called on the
        transformed grid
    poly_transform : transformation object; its ``fit_transform`` method is
        called on the evaluation grid (note: ``fit_transform``, not
        ``transform``)
    """
    plt.figure(figsize=(12, 10))

    # evaluation grid spanning the combined range of both feature sets, step 0.1
    lower = min(xtrain.values.min(), xtest.values.min())
    upper = max(xtrain.values.max(), xtest.values.max())
    grid = np.arange(lower, upper, 0.1)

    plt.plot(xtrain, y_train, 'ro', label='Training Data')
    plt.plot(xtest, y_test, 'go', label='Test Data')

    # the grid is reshaped to a single column before being transformed
    grid_features = poly_transform.fit_transform(grid.reshape(-1, 1))
    plt.plot(grid, lr.predict(grid_features), label='Predicted Function')

    plt.ylim(-10000, 60000)
    plt.ylabel('Price')
    plt.legend()

# Training and Testing

An important step in testing your model is to split your data into training and testing sets.

In [29]:
# first we separate the target (price) from the remaining columns, which serve as features

y_data = df_cars['price']

x_data = df_cars.drop(['price'], axis = 1)

In [30]:
# randomly split our data into training and testing sets

# test_size = proportion of the data held out for testing, for example 0.10 = 10%
# random_state = fixed seed so that the same split is produced on every run
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.1, random_state=1)

# compare the sizes of the training and testing sets
print('Number of training data: ', x_train.shape[0])
print('Number of test data: ', x_test.shape[0])

Number of training data:  180
Number of test data:  21


In [31]:
# repeat the split, this time holding out 40% of the data for testing with random_state = 0

x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(x_data, y_data, test_size=0.40, random_state=0)

print('Number of training data: {}'.format(x_train_1.shape[0]))
print('Number of test data: {}'.format(x_test_1.shape[0]))

Number of training data: 120
Number of test data: 81


## Linear Regression Model

In [32]:
# create the linear regression object
lm = LinearRegression()

# fit the model on the training split using only the feature 'horsepower'
lm.fit(x_train[['horsepower']], y_train)

LinearRegression()

In [34]:
# calculate the R^2 on the held-out test data (10% split)
lm.score(x_test[['horsepower']], y_test)

0.36358755750788263

In [35]:
# calculate the R^2 on the training data the model was fitted on
lm.score(x_train[['horsepower']], y_train)

0.6619724197515104

> We can see that the R^2 score on the test data is much smaller than the score on the training data.

In [40]:
# Another comparison: find the R^2 on the test data when 40% of the dataset is held out for testing

# create the linear regression object
lm1 = LinearRegression()

# fit our model using the training part of the 40% split
lm1.fit(x_train_1[['horsepower']], y_train_1)

# calculate R^2 on the test part of the 40% split
lm1.score(x_test_1[['horsepower']], y_test_1)

0.7139364665406973

> Sometimes you do not have sufficient testing data; as a result, you may want to perform cross-validation.

## Cross-Validation Score

To evaluate a score by cross-validation

In [46]:
# compute the cross-validation scores

# lm = the estimator object used to fit the data
# x_data[['horsepower']] = the data to fit
# y_data = the target variable to try to predict
# cv = splitting strategy; an integer (4 here) sets the number of folds
# the result holds one score per fold (the estimator's default score, R^2 for LinearRegression)
Rcross = cross_val_score(lm, x_data[['horsepower']], y_data, cv = 4)
Rcross

array([0.7746232 , 0.51716687, 0.74785353, 0.04839605])

In [48]:
# Summarize the per-fold scores with their mean and standard deviation
# NOTE(review): the printed labels contain typos ('fods', 'standar desviation');
# they are left unchanged here so the recorded output still matches the code
print('The mean of the fods are: ', Rcross.mean())
print('The standar desviation of the fods are: ', Rcross.std())

The mean of the fods are:  0.522009915042119
The standar desviation of the fods are:  0.291183944475603


### You can also predict

Use the function 'cross_val_predict' to predict the output

In [49]:
# out-of-fold predictions: each sample is predicted by a model fitted on the other folds
yhat = cross_val_predict(lm, x_data[['horsepower']], y_data, cv = 4)
# show the first four predictions
yhat[:4]

array([14141.63807508, 14141.63807508, 20814.29423473, 12745.03562306])

## Overfitting, Underfitting and Model Selection

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=fabb4e41-fcd2-4421-bbfb-772986975631' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>