In [1]:
import pandas as pd
import numpy as np

pd.options.display.max_columns = 99

# For info about the data 
# https://archive.ics.uci.edu/ml/datasets/automobile

cols = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 
        'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 
        'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

cars = pd.read_csv('imports-85.csv',names=cols)

# select cols with numeric values
numeric_cols=['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
numeric_cars=cars[numeric_cols]
numeric_cars.head(5)

#lets take care of missing values, replace any ? by nan
numeric_cars=numeric_cars.replace('?', np.nan)

#convert to float
numeric_cars=numeric_cars.astype('float')

# missing values
numeric_cars.isnull().sum()

#lets remove rows that have price missing
numeric_cars=numeric_cars.dropna(subset=['price'])

#fill the rest with mean
numeric_cars=numeric_cars.fillna(numeric_cars.mean())

#check again to see no missing values are lurking in the df
numeric_cars.isnull().sum()

# lets normalize data
price_col=numeric_cars['price']
numeric_cars=(numeric_cars-numeric_cars.min())/(numeric_cars.max()-numeric_cars.min())
numeric_cars['price']=price_col


In [2]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

def knn_train_test(train_col, target_col, df):
    knn=KNeighborsRegressor()
    np.random.seed(1)
    
    shuffled_index=np.random.permutation(df.index)
    rand_df= df.reindex(shuffled_index)
    
    mid_df= int(len(rand_df)/2)
    train_df=rand_df.iloc[0: mid_df]
    test_df=rand_df.iloc[mid_df:]
    
    #fit Knn using default k values
    knn.fit(train_df[[train_col]], train_df[target_col])
    
    #predict
    predicted_labels= knn.predict(test_df[[train_col]])
    
    #RMSE
    mse = mean_squared_error(test_df[target_col],predicted_labels)
    rmse=np.sqrt(mse)
    return rmse

rmse_result={}
train_cols=numeric_cars.columns.drop('price')

# check for error

#for col in train_cols:
   # rmse_val = knn_train_test(col,'price', numeric_cars)
   # rmse_results[col] = rmse_val

# Create a Series object from the dictionary so 
# we can easily view the results, sort, etc
rmse_results_series = pd.Series(rmse_results)
rmse_results_series.sort_values()



NameError: name 'rmse_results' is not defined

In [None]:
#result 

#horsepower           4007.472352
#curb-weight          4437.934395
#highway-mpg          4560.327728
#width                4644.988277
#city-mpg             4772.387141
#length               5380.923534
#wheel-base           5527.700337
#compression-rate     6736.676353
#bore                 6816.853712
#height               7487.652519
#peak-rpm             7573.370090
#normalized-losses    7697.134523
#stroke               8078.491289


In [3]:
def knn_train_test(train_col, target_col, df):
    np.random.seed(1)
    
    shuffled_index=np.random.permutation(df.index)
    rand_df= df.reindex(shuffled_index)
    
    mid_df= int(len(rand_df)/2)
    train_df=rand_df.iloc[0: mid_df]
    test_df=rand_df.iloc[mid_df:]
    
    k_values=[1,3,5,7,9]
    k_rmses={}
    for k in k_values:
        knn=KNeighborsRegressor(n_neighbors=k)
        knn.fit(train_df[[train_col]], train_df[target_col])
        #predict
        predicted_labels= knn.predict(test_df[[train_col]])
        #RMSE
        mse = mean_squared_error(test_df[target_col],predicted_labels)
        rmse=np.sqrt(mse)
        k_rmses[k] = rmse
    return k_rmses

k_rmse_results={}

train_cols=numeric_cars.columns.drop('price')

for col in train_cols:
    rmse_val = knn_train_test(col,'price', numeric_cars)
    k_rmse_results[col] = rmse_val

k_rmse_results


ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

for k,v in k_rmse_results.items():
    x = list(v.keys())
    y = list(v.values())
    
    plt.plot(x,y)
    plt.xlabel('k value')
    plt.ylabel('RMSE')
    plt.show()

In [5]:
feature_avg_rmse={}

for k,v in k_rmse_results.items():
    avg_rmse=np.mean(list(v.values()))
    feature_avg_rmse[k]=avg_rmse
series_avg_rmse = pd.Series(feature_avg_rmse)
series_avg_rmse.sort_values()

    
    

Series([], dtype: float64)

In [6]:
def knn_train_test(train_cols, target_col, df):
    np.random.seed(1)
    
    shuffled_index=np.random.permutation(df.index)
    rand_df= df.reindex(shuffled_index)
    
    mid_df= int(len(rand_df)/2)
    
    train_df=rand_df.iloc[0: mid_df]
    test_df=rand_df.iloc[mid_df:]
    
    k_values=[5]
    k_rmses={}
    
    for k in k_values:
        knn=KNeighborsRegressor(n_neighbors=k)
        knn.fit(train_df[[train_cols]], train_df[target_col])
        #predict
        predicted_labels= knn.predict(test_df[train_cols])
        #RMSE
        mse = mean_squared_error(test_df[target_col],predicted_labels)
        rmse=np.sqrt(mse)
        k_rmses[k] = rmse
    return k_rmses

k_rmse_results={}

two_best_features=['horsepower','width']
rmse_val= knn_train_test(two_best_features,'price',numeric_cars)
k_rmse_results["two best features"]= rmse_val

three_best_features = ['horsepower', 'width', 'curb-weight']
rmse_val = knn_train_test(three_best_features, 'price', numeric_cars)
k_rmse_results["three best features"] = rmse_val

four_best_features = ['horsepower', 'width', 'curb-weight', 'city-mpg']
rmse_val = knn_train_test(four_best_features, 'price', numeric_cars)
k_rmse_results["four best features"] = rmse_val

five_best_features = ['horsepower', 'width', 'curb-weight' , 'city-mpg' , 'highway-mpg']
rmse_val = knn_train_test(five_best_features, 'price', numeric_cars)
k_rmse_results["five best features"] = rmse_val

six_best_features = ['horsepower', 'width', 'curb-weight' , 'city-mpg' , 'highway-mpg', 'length']
rmse_val = knn_train_test(six_best_features, 'price', numeric_cars)
k_rmse_results["six best features"] = rmse_val

k_rmse_results

#{'five best features': {5: 3346.6737097607775},
 #'four best features': {5: 3232.1036292326721},
 #'six best features': {5: 3398.1290113563641},
 #'three best features': {5: 3212.5596306057919},
 #'two best features': {5: 3681.3980922556266}}


TypeError: unhashable type: 'list'

In [7]:
#For the top 3 models in the last step, vary the hyperparameter 
#value from 1 to 25 and plot the resulting RMSE values


def knn_train_test(train_cols, target_col, df):
    np.random.seed(1)
    
    shuffled_index=np.random.permutation(df.index)
    rand_df= df.reindex(shuffled_index)
    
    mid_df= int(len(rand_df)/2)
    
    train_df=rand_df.iloc[0: mid_df]
    test_df=rand_df.iloc[mid_df:]
    
    k_values=[x for x in range(1,25)]
    k_rmses={}
    
    for k in k_values:
        knn=KNeighborsRegressor(n_neighbors=k)
        knn.fit(train_df[[train_cols]], train_df[target_col])
        #predict
        predicted_labels= knn.predict(test_df[train_cols])
        #RMSE
        mse = mean_squared_error(test_df[target_col],predicted_labels)
        rmse=np.sqrt(mse)
        k_rmses[k] = rmse
    return k_rmses

k_rmse_results={}

three_best_features = ['horsepower', 'width', 'curb-weight']
rmse_val = knn_train_test(three_best_features, 'price', numeric_cars)
k_rmse_results["three best features"] = rmse_val

four_best_features = ['horsepower', 'width', 'curb-weight', 'city-mpg']
rmse_val = knn_train_test(four_best_features, 'price', numeric_cars)
k_rmse_results["four best features"] = rmse_val

five_best_features = ['horsepower', 'width', 'curb-weight' , 'city-mpg' , 'highway-mpg']
rmse_val = knn_train_test(five_best_features, 'price', numeric_cars)
k_rmse_results["five best features"] = rmse_val

k_rmse_results

TypeError: unhashable type: 'list'

In [8]:
for k,v in k_rmse_results.items():
    x = list(v.keys())
    y = list(v.values())
    
    plt.plot(x,y)
    plt.xlabel('k value')
    plt.ylabel('RMSE')

In [9]:
# lets repeat using K-fold cross-validation
#len in 201, so split into 5 folds

import pandas as pd
import numpy as np

pd.options.display.max_columns = 99

# For info about the data 
# https://archive.ics.uci.edu/ml/datasets/automobile

cols = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 
        'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 
        'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

cars = pd.read_csv('imports-85.csv',names=cols)

# select cols with numeric values
numeric_cols=['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
numeric_cars=cars[numeric_cols]
numeric_cars.head(5)

#lets take care of missing values, replace any ? by nan
numeric_cars=numeric_cars.replace('?', np.nan)

#convert to float
numeric_cars=numeric_cars.astype('float')

# missing values
numeric_cars.isnull().sum()

#lets remove rows that have price missing
numeric_cars=numeric_cars.dropna(subset=['price'])

#fill the rest with mean
numeric_cars=numeric_cars.fillna(numeric_cars.mean())

#check again to see no missing values are lurking in the df
numeric_cars.isnull().sum()

# lets normalize data
price_col=numeric_cars['price']
numeric_cars=(numeric_cars-numeric_cars.mean())/(numeric_cars-numeric_cars.std())
numeric_cars['price']=price_col

# lets repeat using K-fold cross-validation


      

In [10]:
print(len(price_col))  

# something got fucked up


0


In [11]:
# lets repeat using K-fold cross-validation on 4 best features

four_best_features = ['horsepower', 'width', 'curb-weight', 'city-mpg']

from sklearn.model_selection import cross_val_score, KFold
kf = KFold(5, shuffle=True, random_state=1)
model = KNeighborsRegressor()

mses=cross_val_score(model,numeric_cars[[four_best_features]],
numeric_cars['price'],scoring="neg_mean_squared_error", cv=kf)

rmses = np.sqrt(np.absolute(mses))
avg_rmse = np.mean(rmses)
std_rmse

print(rmses)
print(avg_rmse)

        

TypeError: unhashable type: 'list'