---
__Project: Predicting Car Prices with kNN__

---

In this guided project we'll be using the k-nearest neighbors algorithm. 

We'll demonstrate the machine learning workflow required to predict a car's market price using its attributes. 

The data set we will be working with contains information on various cars. For each car we have information about the technical aspects of the vehicle such as the motor's displacement, the weight of the car, the miles per gallon, how fast the car accelerates etc. 

You can download the data set directly from [here](https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data).

In [1]:
# Imports
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Allow up to 88 columns to be rendered when a DataFrame is displayed
pd.set_option('display.max_columns', 88)

In [2]:
# Raw data: the file has no header row, so column names are attached after reading
column_names = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style','drive-wheels', 'engine-location', 'wheel-base',
    'length', 'width', 'height', 'curb-weight', 'engine-type','num-of-cylinders',
    'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-rate',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

with open('imports-85.data', 'r') as f:
    df_raw = pd.read_csv(f, header = None)
df_raw.columns = column_names

# '?' is the file's missing-value marker; turn it into NaN
df_raw = df_raw.replace('?', np.nan)

# Mean-impute every column whose mean can be computed; columns for which
# .mean() raises TypeError are skipped and left untouched.
# NOTE(review): the df_raw.info() output in the next cell still lists
# normalized-losses, bore, stroke, ... as object dtype with missing values,
# i.e. the columns that contained '?' are NOT imputed by this loop.
for col in df_raw.columns:
    try:
        imputed = df_raw[col].fillna(df_raw[col].mean())
    except TypeError:
        continue
    df_raw[col] = imputed

df_raw.sample(10)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
123,-1,74.0,plymouth,gas,std,four,wagon,fwd,front,103.3,174.6,64.6,59.8,2535,ohc,four,122,2bbl,3.35,3.46,8.5,88,5000,24,30,8921
155,0,91.0,toyota,gas,std,four,wagon,4wd,front,95.7,169.7,63.6,59.1,3110,ohc,four,92,2bbl,3.05,3.03,9.0,62,4800,27,32,8778
191,0,,volkswagen,gas,std,four,sedan,fwd,front,100.4,180.2,66.9,55.1,2661,ohc,five,136,mpfi,3.19,3.4,8.5,110,5500,19,24,13295
98,2,168.0,nissan,gas,std,two,hardtop,fwd,front,95.1,162.4,63.8,53.3,2008,ohc,four,97,2bbl,3.15,3.29,9.4,69,5200,31,37,8249
135,2,104.0,saab,gas,std,four,sedan,fwd,front,99.1,186.6,66.5,56.1,2758,ohc,four,121,mpfi,3.54,3.07,9.3,110,5250,21,28,15510
63,0,,mazda,diesel,std,,sedan,fwd,front,98.8,177.8,66.5,55.5,2443,ohc,four,122,idi,3.39,3.39,22.7,64,4650,36,42,10795
13,0,188.0,bmw,gas,std,four,sedan,rwd,front,101.2,176.8,64.8,54.3,2765,ohc,six,164,mpfi,3.31,3.19,9.0,121,4250,21,28,21105
6,1,158.0,audi,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,ohc,five,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710
18,2,121.0,chevrolet,gas,std,two,hatchback,fwd,front,88.4,141.1,60.3,53.2,1488,l,three,61,2bbl,2.91,3.03,9.5,48,5100,47,53,5151
133,2,104.0,saab,gas,std,four,sedan,fwd,front,99.1,186.6,66.5,56.1,2695,ohc,four,121,mpfi,3.54,3.07,9.3,110,5250,21,28,12170


In [3]:
# Column dtypes and non-null counts of the raw frame
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    164 non-null object
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         203 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 201 non-null object
stroke               201 non-null object
compression-rate     205 non-null float64
horsepower           203 non-nul

In [4]:
# Extract Target & Features
# dropna() without a subset removes every row that has a missing value in
# ANY column, including columns that are not used as features below.
df_raw = df_raw.dropna()

feature_cols = ['normalized-losses', 'wheel-base', 'length', 'width',
                'height', 'curb-weight', 'engine-size', 'bore', 'stroke',
                'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg',
                'highway-mpg']
target_col = ['price']

df_target = df_raw[target_col].astype(float)
features_float = df_raw[feature_cols].astype(float)

# Min-max normalise each feature column to the [0, 1] range
col_min = features_float.min()
col_range = features_float.max() - col_min
df_features = (features_float - col_min) / col_range

# Combine: the un-normalised target goes back in as the last column
df_features['price'] = df_target['price']

df_features.describe()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
count,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0
mean,0.293885,0.402212,0.509168,0.465574,0.432632,0.377478,0.295566,0.542947,0.555406,0.197571,0.314714,0.393403,0.338883,0.39116,11445.72956
std,0.186656,0.178187,0.187369,0.170867,0.21815,0.186944,0.154623,0.190954,0.140423,0.243092,0.202096,0.190104,0.179328,0.179422,5877.856195
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5118.0
25%,0.151832,0.272414,0.399187,0.324561,0.274038,0.224011,0.182741,0.364286,0.492857,0.10625,0.138158,0.265306,0.235294,0.277778,7372.0
50%,0.251309,0.355172,0.508943,0.447368,0.451923,0.330489,0.248731,0.521429,0.571429,0.125,0.263158,0.428571,0.323529,0.388889,9233.0
75%,0.434555,0.489655,0.596748,0.54386,0.586538,0.512607,0.375635,0.728571,0.638095,0.15,0.434211,0.55102,0.470588,0.527778,14719.5
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,35056.0


In [5]:
# Develop kNN model function
def knn_train_test(train_col, target_col, df, test_pct):
    """Fit a default-k univariate kNN regressor and return its hold-out RMSE.

    Parameters
    ----------
    train_col : str
        Name of the single feature column to train on.
    target_col : str
        Name of the column to predict.
    df : pandas.DataFrame
        Frame containing both columns; it is not modified.
    test_pct : float
        Fraction of the rows held out for testing (0 < test_pct < 1).

    Returns
    -------
    float
        Root mean squared error of the predictions on the held-out rows.

    Raises
    ------
    ValueError
        If the split would leave the training or the test set empty.
    """
    # A private RandomState seeded with 1 produces the same shuffle as
    # np.random.seed(1) followed by np.random.permutation, but leaves the
    # global NumPy RNG untouched.
    rng = np.random.RandomState(1)
    shuffled = df.reindex(rng.permutation(df.index))

    # test_pct is the share of rows that is held out. The original code used
    # it as the *training* share; at 0.5 both readings give the same split.
    n_rows = len(shuffled.index)
    n_test = int(np.ceil(test_pct * n_rows))
    split_point = n_rows - n_test
    if split_point <= 0 or n_test <= 0:
        raise ValueError(
            "test_pct=%r leaves an empty train or test set for %d rows"
            % (test_pct, n_rows))

    df_train = shuffled.iloc[:split_point]
    df_test = shuffled.iloc[split_point:]

    # build kNN model (scikit-learn's default number of neighbours)
    knn = KNeighborsRegressor()
    knn.fit(df_train[[train_col]], df_train[target_col])
    pred = knn.predict(df_test[[train_col]])

    return mean_squared_error(df_test[target_col], pred) ** (1/2)

In [6]:
# Baseline: one default-k model per feature column (every column except the
# trailing 'price' target), half of the rows held out
univar_rmse = {
    feature: knn_train_test(feature, 'price', df_features, 0.5)
    for feature in df_features.columns[:-1]
}

pd.Series(univar_rmse).round().sort_values(ascending = False)

peak-rpm             6130.0
compression-rate     5561.0
stroke               5446.0
normalized-losses    5268.0
height               4874.0
bore                 4177.0
wheel-base           3408.0
engine-size          3398.0
city-mpg             3300.0
length               3262.0
horsepower           2956.0
highway-mpg          2745.0
width                2549.0
curb-weight          2356.0
dtype: float64

In [7]:
# Modified for variable k value
def knn_train_test(train_col, target_col, df, test_pct, k_val):
    """Fit a univariate kNN regressor with k neighbours and return its RMSE.

    Parameters
    ----------
    train_col : str
        Name of the single feature column to train on.
    target_col : str
        Name of the column to predict.
    df : pandas.DataFrame
        Frame containing both columns; it is not modified.
    test_pct : float
        Fraction of the rows held out for testing (0 < test_pct < 1).
    k_val : int
        Number of neighbours passed to KNeighborsRegressor.

    Returns
    -------
    float
        Root mean squared error of the predictions on the held-out rows.

    Raises
    ------
    ValueError
        If the split would leave the training or the test set empty.
    """
    # A private RandomState seeded with 1 produces the same shuffle as
    # np.random.seed(1) followed by np.random.permutation, but leaves the
    # global NumPy RNG untouched.
    rng = np.random.RandomState(1)
    shuffled = df.reindex(rng.permutation(df.index))

    # test_pct is the share of rows that is held out. The original code used
    # it as the *training* share; at 0.5 both readings give the same split.
    n_rows = len(shuffled.index)
    n_test = int(np.ceil(test_pct * n_rows))
    split_point = n_rows - n_test
    if split_point <= 0 or n_test <= 0:
        raise ValueError(
            "test_pct=%r leaves an empty train or test set for %d rows"
            % (test_pct, n_rows))

    df_train = shuffled.iloc[:split_point]
    df_test = shuffled.iloc[split_point:]

    # build kNN model
    knn = KNeighborsRegressor(n_neighbors = k_val)
    knn.fit(df_train[[train_col]], df_train[target_col])
    pred = knn.predict(df_test[[train_col]])

    return mean_squared_error(df_test[target_col], pred) ** (1/2)

In [8]:
# Univariate models again, this time sweeping k over several values for
# each feature column (every column except the trailing 'price' target)
k_values = [1,3,5,7,9]
univar_rmse = {
    feature: [knn_train_test(feature, 'price', df_features, 0.5, k)
              for k in k_values]
    for feature in df_features.columns[:-1]
}

# One column per feature, one row per k value (in k_values order)
df_rmse = pd.DataFrame(univar_rmse)
df_rmse.round()

Unnamed: 0,bore,city-mpg,compression-rate,curb-weight,engine-size,height,highway-mpg,horsepower,length,normalized-losses,peak-rpm,stroke,wheel-base,width
0,5110.0,4764.0,5493.0,3033.0,2691.0,6470.0,3615.0,3310.0,3668.0,4776.0,6896.0,5317.0,3449.0,3305.0
1,4277.0,3390.0,5287.0,2333.0,2491.0,5145.0,3065.0,3244.0,3106.0,4335.0,6598.0,5248.0,3305.0,2586.0
2,4177.0,3300.0,5561.0,2356.0,3398.0,4874.0,2745.0,2956.0,3262.0,5268.0,6130.0,5446.0,3408.0,2549.0
3,4296.0,3241.0,5839.0,2384.0,3440.0,5172.0,2979.0,3470.0,3326.0,5439.0,6251.0,5346.0,3496.0,2628.0
4,4525.0,3439.0,5712.0,2466.0,3425.0,5344.0,2931.0,3708.0,3257.0,5353.0,5797.0,5441.0,3444.0,2679.0


In [9]:
# Summary statistics of each feature's RMSE over the tested k values
df_rmse.describe()

Unnamed: 0,bore,city-mpg,compression-rate,curb-weight,engine-size,height,highway-mpg,horsepower,length,normalized-losses,peak-rpm,stroke,wheel-base,width
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,4476.980962,3626.729867,5578.468276,2514.266702,3088.976672,5401.07221,3067.057715,3337.541422,3323.904303,5034.303683,6334.266639,5359.637602,3420.404258,2749.476129
std,375.895282,640.286664,211.260767,294.226435,460.341331,620.683274,328.05142,278.51082,208.722921,467.748753,424.840791,84.752666,71.681393,314.306922
min,4177.182075,3241.085297,5286.684712,2332.545795,2491.266724,4874.469252,2744.892541,2955.574146,3106.059053,4335.49881,5797.135894,5247.635031,3304.979078,2549.357688
25%,4277.143919,3299.852015,5493.319976,2356.138456,2690.554915,5145.324175,2931.285073,3244.422588,3256.806478,4775.973842,6129.764575,5317.024261,3407.728881,2586.236395
50%,4295.614379,3389.731475,5561.272951,2384.100077,3398.270946,5171.931235,2978.705963,3309.602112,3262.288268,5267.764538,6250.766788,5345.904443,3444.354601,2627.836871
75%,4525.421619,3439.170844,5711.607043,2465.682466,3424.530634,5343.697608,3065.225999,3469.879679,3326.224482,5352.913731,6598.114978,5441.334198,3449.327708,2678.882686
max,5109.542817,4763.809704,5839.456696,3032.866713,3440.260143,6469.93878,3615.178996,3708.228583,3668.143233,5439.367495,6895.55096,5446.290077,3495.631023,3305.067004


In [10]:
# Multivariate with variable k value 
def knn_train_test(train_cols, target_col, df, test_pct, k_val):
    np.random.seed(1)
    # split data
    train_test_split = np.random.permutation(df.index)
    df = df.reindex(train_test_split)
    test_fraction = test_pct
    split_point = int(test_fraction * len(df.index))
    
    df_train = df.iloc[:split_point]
    df_test = df.iloc[split_point:]
    
    # build kNN model
    knn = KNeighborsRegressor(n_neighbors = k_val)
    model = knn.fit(df_train[[train_cols]], df_train[target_col])
    pred = model.predict(df_test[[train_cols]])
    rmse = mean_squared_error(df_test[target_col], pred)**(1/2)
    
    return rmse

In [11]:
# Use the best n features to train a model using default k
best_features = [i for i in df_rmse.mean().round().sort_values().keys()]
univar_rmse = {}

for n, f in enumerate(best_features):
    for i in df_features[best_features[:n]].columns:
        k_list = []
        for k in [1,3,5,7,9]:
            df = pd.concat([df_features[best_features[:n]], df_features['price']], axis = 1)
            err = knn_train_test(i, 'price', df, 0.5, k)
            k_list.append(round(err))
        univar_rmse[n] = [k_list]

df_errs = pd.DataFrame(data = [i for i in univar_rmse.values()], columns = ['RMSE'])
df_errs

Unnamed: 0,RMSE
0,"[3033.0, 2333.0, 2356.0, 2384.0, 2466.0]"
1,"[3305.0, 2586.0, 2549.0, 2628.0, 2679.0]"
2,"[3615.0, 3065.0, 2745.0, 2979.0, 2931.0]"
3,"[2691.0, 2491.0, 3398.0, 3440.0, 3425.0]"
4,"[3668.0, 3106.0, 3262.0, 3326.0, 3257.0]"
5,"[3310.0, 3244.0, 2956.0, 3470.0, 3708.0]"
6,"[3449.0, 3305.0, 3408.0, 3496.0, 3444.0]"
7,"[4764.0, 3390.0, 3300.0, 3241.0, 3439.0]"
8,"[5110.0, 4277.0, 4177.0, 4296.0, 4525.0]"
9,"[4776.0, 4335.0, 5268.0, 5439.0, 5353.0]"


In [12]:
# Multivariate with variable k value 
def knn_train_test(train_cols, target_col, df, test_pct, k_val):
    np.random.seed(1)
    # split data
    train_test_split = np.random.permutation(df.index)
    df = df.reindex(train_test_split)
    test_fraction = test_pct
    split_point = int(test_fraction * len(df.index))
    
    df_train = df.iloc[:split_point]
    df_test = df.iloc[split_point:]
    
    # build kNN model
    knn = KNeighborsRegressor(n_neighbors = k_val)
    model = knn.fit(df_train[[train_cols]], df_train[target_col])
    pred = model.predict(df_test[[train_cols]])
    rmse = mean_squared_error(df_test[target_col], pred)**(1/2)
    
    return rmse

In [21]:
# Use the best n features to train a model using default k
best_features = [i for i in df_rmse.mean().round().sort_values().keys()]
univar_rmse = {}

for n, f in enumerate(best_features):
    for i in df_features[best_features[:n]].columns:
        k_list = []
        for k in range(1, 26):
            df = pd.concat([df_features[best_features[:n]], df_features['price']], axis = 1)
            err = knn_train_test(i, 'price', df, 0.5, k)
            k_list.append(round(err))
        univar_rmse[n] = [k_list]

df_errs = pd.DataFrame(data = [i for i in univar_rmse.values()], columns = ['RMSE'])
df_errs

Unnamed: 0,RMSE
0,"[3033.0, 2498.0, 2333.0, 2356.0, 2356.0, 2416...."
1,"[3305.0, 2745.0, 2586.0, 2668.0, 2549.0, 2573...."
2,"[3615.0, 2890.0, 3065.0, 2652.0, 2745.0, 2812...."
3,"[2691.0, 2063.0, 2491.0, 3009.0, 3398.0, 3554...."
4,"[3668.0, 3325.0, 3106.0, 3046.0, 3262.0, 3405...."
5,"[3310.0, 4129.0, 3244.0, 3053.0, 2956.0, 3162...."
6,"[3449.0, 3933.0, 3305.0, 3704.0, 3408.0, 3451...."
7,"[4764.0, 3693.0, 3390.0, 3443.0, 3300.0, 3191...."
8,"[5110.0, 4580.0, 4277.0, 4176.0, 4177.0, 4252...."
9,"[4776.0, 4069.0, 4335.0, 4915.0, 5268.0, 5485...."
