In [2]:
import pandas as pd
import numpy as np

cars = pd.read_csv('imports-85.data')

cars.head()

# Renaming the column names of the data sets according to the documentation
cars.columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 
        'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 
        'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

cars.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250


In [3]:
# Selecting the numeric values for predictions
numeric_features = ['normalized-losses', 'wheel-base', 'length','width', 'height', 'curb-weight', 'engine-size',
                   'bore', 'stroke','compression-rate', 'horsepower', 'peak-rpm', 'city-mpg','highway-mpg', 'price']

cars_numeric = cars[numeric_features]

cars_numeric.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,?,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500
1,?,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500
2,164,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950
3,164,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450
4,?,99.8,177.3,66.3,53.1,2507,136,3.19,3.4,8.5,110,5500,19,25,15250


# Data Cleaning

In [4]:
# Replacing all the '?' values of the normalized-losses columns with NaN
cars_numeric = cars_numeric.replace('?', np.nan)
cars_numeric.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500
1,,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500
2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950
3,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450
4,,99.8,177.3,66.3,53.1,2507,136,3.19,3.4,8.5,110,5500,19,25,15250


In [5]:
# cast all columns to float
cars_numeric = cars_numeric.astype('float')

In [6]:
# Number of rows having missing values for the normalized-losses columns
cars_numeric['normalized-losses'].isnull().sum()

40

In [7]:
# looking at other columns if there are missing values there
cars_numeric.isnull().sum()

normalized-losses    40
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [8]:
# Removing the rows that have missing values in their price columns, since, we
# are trying to predic the price
cars_numeric = cars_numeric.dropna(subset=['price'])
cars_numeric.isnull().sum()

normalized-losses    36
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [9]:
# Then we fill out the missing values in other columns with the average of the
# individual columns
cars_numeric.replace([np.inf, -np.inf], np.nan)
cars_numeric = cars_numeric.fillna(cars_numeric.mean())

In [10]:
cars_numeric.isnull().sum()
cars_numeric.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 0 to 203
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   normalized-losses  200 non-null    float64
 1   wheel-base         200 non-null    float64
 2   length             200 non-null    float64
 3   width              200 non-null    float64
 4   height             200 non-null    float64
 5   curb-weight        200 non-null    float64
 6   engine-size        200 non-null    float64
 7   bore               200 non-null    float64
 8   stroke             200 non-null    float64
 9   compression-rate   200 non-null    float64
 10  horsepower         200 non-null    float64
 11  peak-rpm           200 non-null    float64
 12  city-mpg           200 non-null    float64
 13  highway-mpg        200 non-null    float64
 14  price              200 non-null    float64
dtypes: float64(15)
memory usage: 25.0 KB


In [11]:
# Normalizing all the columns except the price
price = cars_numeric['price']
cars_numeric_normalized = (cars_numeric - cars_numeric.mean()) / (cars_numeric.std())
# cars_numeric_normalized = (cars_numeric - cars_numeric.min())/(cars_numeric.max() - cars_numeric.min())
cars_numeric_normalized['price'] = price
cars_numeric_normalized.head(200)

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,0.000000,-1.697177,-0.439616,-0.855008,-2.055427,-0.014857,0.075389,0.521295,-1.845608,-0.291493,0.204013,-0.246603,-0.652936,-0.542680,16500.0
1,0.000000,-0.720075,-0.245239,-0.189262,-0.572999,0.515422,0.603594,-2.420296,0.668901,-0.291493,1.352043,-0.246603,-0.963857,-0.689152,16500.0
2,1.309368,0.157661,0.192109,0.143611,0.209393,-0.421726,-0.428806,-0.521295,0.446096,-0.042375,-0.036272,0.796716,-0.186553,-0.103263,13950.0
3,1.309368,0.091417,0.192109,0.238717,0.209393,0.517350,0.219445,-0.521295,0.446096,-0.540611,0.310807,0.796716,-1.119318,-1.275042,17450.0
4,0.000000,0.157661,0.248803,0.191164,-0.284750,-0.093917,0.219445,-0.521295,0.446096,-0.416052,0.177315,0.796716,-0.963857,-0.835625,15250.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,-0.841737,1.697840,1.180193,1.427549,0.703535,0.764171,0.339492,1.675590,-0.349634,-0.166934,0.284108,0.588052,-0.342014,-0.396208,16845.0
200,-0.841737,1.697840,1.180193,1.379996,0.703535,0.951215,0.339492,1.675590,-0.349634,-0.366228,1.512233,0.379388,-0.963857,-0.835625,19045.0
201,-0.841737,1.697840,1.180193,1.427549,0.703535,0.879868,1.107790,0.930883,-1.240852,-0.341316,0.818076,0.796716,-1.119318,-1.128569,21485.0
202,-0.841737,1.697840,1.180193,1.427549,0.703535,1.275168,0.435529,-1.191531,0.446096,3.196158,0.070521,-0.663930,0.124369,-0.542680,22470.0


# Univariate Model

In [12]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold

def knn_train_test(training_feature, target, df):
    np.random.seed(1)
    
    # Randomizing the dataframe first
    # Randomize order of rows in data frame.
    shuffled_index = np.random.permutation(df.index)
    shuffled_df = df.reindex(shuffled_index)
    
    # Instantiating the model
    knn = KNeighborsRegressor()
    
    # Splitting into train and test data sets
    # we choose 50-50
    splitter = int(len(df) * 0.5)
    train_df = shuffled_df.iloc[0:splitter]
    test_df = shuffled_df.iloc[splitter:]
    
    # training and predicting with the model
    
    # Fitting or training the model
    train_df = train_df.reset_index()
    knn.fit(train_df[[training_feature]], train_df[target])
    
    # Predicting with the model
    predictions = knn.predict(test_df[[training_feature]])
    
    # Finding and returning the rmse of the model
    mse = mean_squared_error(test_df[target], predictions)
    return np.sqrt(mse)
    
    

# Extracting all the feature columns except the target    
feature_columns = cars_numeric_normalized.columns
feature_columns = feature_columns.drop('price')

# For storing the rmse values in a dictionary
rmse_results = {}

# choosing the columns as features and finding the rmse of the columns
# and storing the values in a dictionary
for column in feature_columns:
    rmse_val = knn_train_test(column,'price', cars_numeric_normalized)
    rmse_results[column] = rmse_val
    

# converting the values to a pandas series for ease of use
rmse_result_series = pd.Series(rmse_results)

# Sorting the series
rmse_result_series.sort_values(ascending=True)    

ModuleNotFoundError: No module named 'sklearn'

Using the Default k value(k=5) the column "engine-size" performed the best

In [None]:
# Modifying the function to use the k value for nearest neighbor algorithm
def knn_train_test_mod(training_feature, target, df, k_values):
    np.random.seed(1)
    
    # Randomizing the dataframe first
    # Randomize order of rows in data frame.
    shuffled_index = np.random.permutation(df.index)
    shuffled_df = df.reindex(shuffled_index)
    
    k_value_rmses = {}
    
    for k in k_values:
        # Instantiating the model
        knn = KNeighborsRegressor(n_neighbors=k)

        # Splitting into train and test data sets
        # we choose 50-50
        splitter = int(len(df) * 0.5)
        train_df = shuffled_df.iloc[0:splitter]
        test_df = shuffled_df.iloc[splitter:]

        # training and predicting with the model

        # Fitting or training the model
        train_df = train_df.reset_index()
        knn.fit(train_df[[training_feature]], train_df[target])

        # Predicting with the model
        predictions = knn.predict(test_df[[training_feature]])

        # Finding and returning the rmse of the model
        mse = mean_squared_error(test_df[target], predictions)
        
        # Storing the rmse value for the corresponding k value
        k_value_rmses[k] = np.sqrt(mse)
        
    return k_value_rmses

k_values = [1, 3, 5, 7, 9]
k_rmse_results = {}

# choosing the columns as features and finding the rmse of the columns for the k_fold values
# and storing the values in a dictionary
for column in feature_columns:
    k_rmse_val = knn_train_test_mod(column,'price', cars_numeric_normalized, k_values)
    k_rmse_results[column] = k_rmse_val
    
k_rmse_results

# Plotting

In [None]:
# Now we plot the values of the k_rmses
import matplotlib.pyplot as plt
%matplotlib inline

for key, value in k_rmse_results.items():
    x = list(value.keys())
    y = list(value.values())
    plt.scatter(x,y)
    plt.xlabel('k fold')
    plt.ylabel('RMSE')
    

# Multivariate Analysis
## We modify the function in this section to work with multivariate data

In [None]:
def knn_train_test_mod_multi(training_features, target, df, k_values):
    np.random.seed(1)
    
    # Randomizing the dataframe first
    # Randomize order of rows in data frame.
    shuffled_index = np.random.permutation(df.index)
    shuffled_df = df.reindex(shuffled_index)
    
    k_value_rmses = {}
    
    for k in k_values:
        # Instantiating the model
        knn = KNeighborsRegressor(n_neighbors=k)

        # Splitting into train and test data sets
        # we choose 50-50
        splitter = int(len(df) * 0.5)
        train_df = shuffled_df.iloc[0:splitter]
        test_df = shuffled_df.iloc[splitter:]

        # training and predicting with the model

        # Fitting or training the model
        train_df = train_df.reset_index()
        knn.fit(train_df[training_features], train_df[target])

        # Predicting with the model
        predictions = knn.predict(test_df[training_features])

        # Finding and returning the rmse of the model
        mse = mean_squared_error(test_df[target], predictions)
        
        # Storing the rmse value for the corresponding k value
        k_value_rmses[k] = np.sqrt(mse)
        
    return k_value_rmses


In [None]:
# we find the best features
# we take the average of all the RMSE of each of the features, 
# create a series and sort the series and store the series


feature_wise_rmse = {}
for feature, value in k_rmse_results.items():
    avg_rmse = np.mean(list(value.values()))
    feature_wise_rmse[feature] = avg_rmse
    
feature_wise_avg_series = pd.Series((feature_wise_rmse))
sorted_feature_avg_rmse = feature_wise_avg_series.sort_values(ascending=True)
sorted_feature_names = sorted_feature_avg_rmse.index

In [None]:
k_rmse_results = {}
k_values = [5]

# we find the rmse value by using 2,3,4,5 best features we have trained the model
# on
for i in range(2,6):
    rmse = knn_train_test_mod_multi(list(sorted_feature_names[:i]),
                                    'price',
                                    cars_numeric_normalized, 
                                    k_values)
    k_rmse_results[i] = rmse
    
k_rmse_results

# Hyperparameter Tuning

In [None]:
# First we find the top three models according to rmse

top_three_models = {} # key will be the number of features and value will be rmse

for key, values in k_rmse_results.items():
    top_three_models[key] = values[k_values[0]]

top_three_models_series = pd.Series(top_three_models)
top_three_models_series = top_three_models_series.sort_values(ascending=True)[:3]
top_three_models_series

We see that the top three models are the ones having 2, 5 and 4 features

In [None]:
# we change the k_values first, from 1 to 25
k_values = [x for x in range(1,26)]

k_rmse_results_final= {}
# we find the rmse value by using 2, 5, 4 best features we have trained the model
# on for a range of k values

rmse_2_feature = knn_train_test_mod_multi(list(sorted_feature_names[:2]),
                                    'price',
                                    cars_numeric_normalized, 
                                    k_values)

rmse_5_feature = knn_train_test_mod_multi(list(sorted_feature_names[:5]),
                                    'price',
                                    cars_numeric_normalized, 
                                    k_values)

rmse_4_feature = knn_train_test_mod_multi(list(sorted_feature_names[:4]),
                                    'price',
                                    cars_numeric_normalized, 
                                    k_values)

k_rmse_results_final[2] = rmse_2_feature
k_rmse_results_final[5] = rmse_5_feature
k_rmse_results_final[4] = rmse_4_feature

k_rmse_results_final

# Plotting the rmse values

So far, we figured out the rmse values for 1-25 values of k for the top three models. Now we plot the values and figure out which k value is optimal for which model.

In [None]:
for key, value in k_rmse_results_final.items():
    x = list(value.keys())
    y = list(value.values())  
    plt.plot(x,y, label="{}".format(key))
    
plt.xlabel('k value')
plt.ylabel('RMSE')
plt.legend()

So, from the plot we can see:<br />
for **2 best features**, the optimal value of **k = 4**

for **4 best features**, the optimal value of **k = 2**

for **5 best features**, the optimal value of **k = 1**

And from all the models, we can infer that with **5 best features** and **k = 1**, we can get the lowest rmse score.


# Modifying the model to use k-fold cross validation

In [None]:
def knn_train_test_mod_multi_kfold(training_features, target, df, k_values, fold_value):
    np.random.seed(1)
    
    # Randomizing the dataframe first
    # Randomize order of rows in data frame.
    shuffled_index = np.random.permutation(df.index)
    shuffled_df = df.reindex(shuffled_index)
    
    k_value_rmses = {}
    
    for k in k_values:
        # Instantiating the model
        knn = KNeighborsRegressor(n_neighbors=k)
        
        # Instantiating the Kfold Object
        # default value set to 5
        kf = KFold(fold_value, shuffle=True, random_state=1)

        mses = cross_val_score(knn, shuffled_df[training_features], shuffled_df[target],
                              scoring="neg_mean_squared_error", cv=kf)
        
        # Storing the rmse value for the corresponding k value
        k_value_rmses[k] = avg_rmse = np.sqrt(np.absolute(mses)).mean()
        
    return k_value_rmses


In [None]:
# we change the k_values first, from 1 to 25
k_values = [x for x in range(1,26)]

k_rmse_results_final= {}
# we find the rmse value by using 2, 5, 4 best features we have trained the model
# on for a range of k values

rmse_2_feature = knn_train_test_mod_multi_kfold(list(sorted_feature_names[:2]),
                                    'price',
                                    cars_numeric_normalized, 
                                    k_values, 5)

rmse_5_feature = knn_train_test_mod_multi_kfold(list(sorted_feature_names[:5]),
                                    'price',
                                    cars_numeric_normalized, 
                                    k_values, 5)

rmse_4_feature = knn_train_test_mod_multi_kfold(list(sorted_feature_names[:4]),
                                    'price',
                                    cars_numeric_normalized, 
                                    k_values, 5)

k_rmse_results_final[2] = rmse_2_feature
k_rmse_results_final[5] = rmse_5_feature
k_rmse_results_final[4] = rmse_4_feature

k_rmse_results_final

As We can see that using k(5) folds cross validation the mse score did not improve that much. We can try with more different k values to see if there is any change or improvement. We would only do this for k=5 for KNN.

In [None]:
# we change the k_values first, from 1 to 25
k_values = [x for x in range(1,26)]

k_rmse_results_final= {}
# we find the rmse value by using 2, 5, 4 best features we have trained the model
# on for a range of k values


rmse_5_feature = knn_train_test_mod_multi_kfold(list(sorted_feature_names[:5]),
                                    'price',
                                    cars_numeric_normalized, 
                                    k_values, 4)

k_rmse_results_final[5] = rmse_5_feature


k_rmse_results_final

So there is no significant improvement in the mse.