In [2]:
import pandas as pd
# Read the 'data' worksheet of the Toyota Corolla workbook into a DataFrame.
df = pd.read_excel(io='ToyotaCorolla.xlsx', sheet_name='data')

In [3]:
# Quick sanity check: (number of rows, number of columns) of the loaded data.
df.shape

(1436, 39)

In [4]:
# List the distinct fuel categories present in the data.
df['Fuel_Type'].unique()

array(['Diesel', 'Petrol', 'CNG'], dtype=object)

In [None]:
# One-hot encode the fuel type into Fuel_Type_* indicator columns.
# The guard makes this cell safe to re-run: once encoded, the original
# 'Fuel_Type' column no longer exists and get_dummies would raise a KeyError.
if 'Fuel_Type' in df.columns:
    df = pd.get_dummies(df, columns=['Fuel_Type'])
df.head()

In [None]:
# Summarize column names, non-null counts and dtypes of the current frame.
df.info()

In [None]:
# Target variable: the car's price.
y = df.loc[:, 'Price']
# Predictors: every column except the ones listed here. 'Price' is the target
# itself; 'Fuel_Type_Petrol' is one of the fuel dummies, so leaving it out
# presumably makes Petrol the baseline category for the remaining dummies.
X = df.drop(
    ['Price', 'Color', 'Id', 'Model', 'Mfg_Month', 'Mfg_Year', 'Fuel_Type_Petrol'],
    axis='columns',
)

In [None]:
from sklearn.model_selection import train_test_split
# Reserve 40% of the rows as a test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=20
)

<h3> The process of supervised machine learning</h3>
<img src='train_test_process.jpg' >

In [None]:
import math
from statistics import mean
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Fit ordinary least squares on the training split and score it on the test split.
regr = LinearRegression()
regr.fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Residuals are computed once and reused by the metrics below.
residuals = y_test - y_pred
abs_residuals = residuals.abs()

print('Average error: %.2f' % residuals.mean())
print('Mean absolute error: %.2f' % mean_absolute_error(y_test, y_pred))
# Same metric computed by hand, as a cross-check of the library call above.
print('Mean absolute error: %.2f' % abs_residuals.mean())
print("Root mean squared error: %.2f"
      % math.sqrt(mean_squared_error(y_test, y_pred)))
# The two relative errors are different metrics, so they get distinct labels,
# and both are scaled by 100 so the printed number really is a percentage.
print('Mean absolute percentage error: %.2f%%'
      % (100 * (abs_residuals / y_test.abs()).mean()))
print('Mean absolute error as a percentage of the mean price: %.2f%%'
      % (100 * abs_residuals.mean() / y_test.mean()))
print('R-squared: %.2f' % r2_score(y_test, y_pred))

In [None]:
import numpy as np
# Build a two-column table: predictor name next to its fitted coefficient.
coefficients = np.column_stack([X.columns, regr.coef_])
print('Coefficients: \n', coefficients)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Residual plot: predicted value on the x-axis, residual (actual - predicted)
# on the y-axis, for both splits.
train_pred = regr.predict(x_train)
# blue: residuals of the training data
plt.scatter(train_pred, y_train - train_pred, c='b', label='train')
# green: residuals of the test data -- plotted against the prediction as well,
# so both point clouds share the same x-axis quantity
plt.scatter(y_pred, y_test - y_pred, c='g', label='test')
plt.ylabel('residuals')
plt.xlabel('predicted value')
plt.legend()
plt.show()

Below we use recursive feature elimination (RFE) to select the best model with 5 features.<br>
The goal of RFE is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features, and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. Then, the least important features are pruned from the current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.
<a href='http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE'>more information</a>

In [None]:
#need to normalize data before using RFE

from sklearn import preprocessing
# Fit the scaler on the training split only, then apply that same
# transformation to both splits.
scaler = preprocessing.StandardScaler().fit(x_train)
# scaler.transform returns a bare array; rebuild DataFrames with the original
# column names AND row index so they stay label-aligned with y_train / y_test.
x_train_normalized = pd.DataFrame(scaler.transform(x_train),
                                  columns=x_train.columns,
                                  index=x_train.index)
x_test_normalized = pd.DataFrame(scaler.transform(x_test),
                                 columns=x_test.columns,
                                 index=x_test.index)

In [None]:
from sklearn.feature_selection import RFE
regr = LinearRegression()
# RFE(estimator, n_features_to_select=None, step=1): everything after the
# estimator must be passed by keyword in current scikit-learn releases;
# passing 5 positionally raises a TypeError there.
selector = RFE(regr, n_features_to_select=5, step=1)
selector.fit(x_train_normalized, y_train)
# Boolean mask over the columns: True marks a feature that was kept.
selector.support_

In [None]:
# RFE rank of every feature; per the scikit-learn docs, selected features get rank 1.
selector.ranking_

In [None]:
# Coefficients of the estimator that RFE fitted on the selected features only.
selector.estimator_.coef_

In [None]:
def show_best_model(support_array, columns, model, test_data, test_labels):
    """Report the adjusted R-squared and coefficients of a feature-selected model.

    Parameters
    ----------
    support_array : sequence of bool
        Mask over ``columns``; a true entry marks a feature the model uses
        (for example ``RFE.support_``).
    columns : sequence
        Names of all candidate features, in the same order as the mask.
    model : fitted estimator
        Must expose ``predict`` and ``coef_``; ``coef_`` is expected to hold
        one entry per selected feature, in column order.
    test_data : pandas.DataFrame
        Hold-out predictors containing every candidate column.
    test_labels : array-like
        True target values for the rows of ``test_data``.

    Returns
    -------
    float
        Adjusted R-squared on the hold-out data, so that models with
        different numbers of features can be compared by the caller.
    """
    y_pred = model.predict(test_data.iloc[:, support_array])
    r2 = r2_score(test_labels, y_pred)
    n = len(y_pred)       # size of the test set
    p = len(model.coef_)  # number of features in the fitted model
    # Adjust R-squared for the number of predictors used.
    adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    print('Adjusted R-squared: %.2f' % adjusted_r2)
    # Selected names and coefficients line up one-to-one, so pair them
    # directly instead of maintaining a second hand-written index.
    selected_names = [name for name, keep in zip(columns, support_array) if keep]
    for name, coef in zip(selected_names, model.coef_):
        print(name, coef)
    return adjusted_r2

In [None]:
# Evaluate the RFE-selected model on the standardized hold-out data.
show_best_model(
    support_array=selector.support_,
    columns=x_train.columns,
    model=selector.estimator_,
    test_data=x_test_normalized,
    test_labels=y_test,
)

<b>Assignment: Write code that examines feature sets of size 1 to 10 and compares their adjusted R-squared values using a for loop. Select the best regression model based on the improvement made to the R-squared. You can ignore any improvement <= 1%. After finding the best model, run it again using the original (not normalized) data. Submit your notebook containing your code and the best model with unnormalized coefficients.</b>