# Machine Learning Fundamentals: Predicting Car Prices

In [115]:
import pandas as pd
import numpy as np
# First look at the raw file: read it with pandas' default header handling
# and inspect which column labels were picked up.
cars = pd.read_csv(filepath_or_buffer="imports-85.data")
print(cars.columns)

Index(['3', '?', 'alfa-romero', 'gas', 'std', 'two', 'convertible', 'rwd',
       'front', '88.60', '168.80', '64.10', '48.80', '2548', 'dohc', 'four',
       '130', 'mpfi', '3.47', '2.68', '9.00', '111', '5000', '21', '27',
       '13495'],
      dtype='object')


It looks like the data file does not contain a header row (the first record was read as the column names), so we have to add the column names manually.

In [116]:
# Column labels supplied by hand, because the file itself has no header row.
col_names = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
    'price',
]
# header=None is what pandas infers when `names` is given; spelled out here
# so it is obvious that the first line of the file is data, not labels.
cars = pd.read_csv("imports-85.data", header=None, names=col_names)
print(cars.head(2))

   symboling normalized-losses         make fuel-type aspiration num-of-doors  \
0          3                 ?  alfa-romero       gas        std          two   
1          3                 ?  alfa-romero       gas        std          two   

    body-style drive-wheels engine-location  wheel-base  ...    engine-size  \
0  convertible          rwd           front        88.6  ...            130   
1  convertible          rwd           front        88.6  ...            130   

   fuel-system  bore  stroke compression-rate horsepower  peak-rpm city-mpg  \
0         mpfi  3.47    2.68              9.0        111      5000       21   
1         mpfi  3.47    2.68              9.0        111      5000       21   

  highway-mpg  price  
0          27  13495  
1          27  16500  

[2 rows x 26 columns]


## Cleaning Data

In [117]:
# Turn every "?" cell into NaN so pandas treats it as a missing value.
cars = cars.replace({"?": np.nan})
print(cars["normalized-losses"])

0      NaN
1      NaN
2      NaN
3      164
4      164
5      NaN
6      158
7      NaN
8      158
9      NaN
10     192
11     192
12     188
13     188
14     NaN
15     NaN
16     NaN
17     NaN
18     121
19      98
20      81
21     118
22     118
23     118
24     148
25     148
26     148
27     148
28     110
29     145
      ... 
175     65
176     65
177     65
178    197
179    197
180     90
181    NaN
182    122
183    122
184     94
185     94
186     94
187     94
188     94
189    NaN
190    256
191    NaN
192    NaN
193    NaN
194    103
195     74
196    103
197     74
198    103
199     74
200     95
201     95
202     95
203     95
204     95
Name: normalized-losses, dtype: object


Because we will be calculating Euclidean distance for K-nearest neighbor (KNN), we will have to use only continuous variables, omitting any categorical variables.

In [118]:
# Keep only the continuous columns (plus the target, `price`).
cars_new = cars.loc[:, [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'bore', 'stroke', 'compression-rate', 'horsepower',
    'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]]
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() also emits a trailing "None" line.
print(cars_new.info())
# Number of missing values per column.
print(cars_new.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 14 columns):
normalized-losses    164 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
bore                 201 non-null object
stroke               201 non-null object
compression-rate     205 non-null float64
horsepower           203 non-null object
peak-rpm             203 non-null object
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                201 non-null object
dtypes: float64(5), int64(3), object(6)
memory usage: 22.5+ KB
None
normalized-losses    41
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm   

KNN cannot handle missing values and will throw an error if any remain in the data, so we need to either remove the rows with missing data or fill them in with an educated guess, e.g. the mean of the column.

While we can simply omit the rows with missing data for bore, stroke, horsepower, peak-rpm, and price, we will have to fill in the missing values for normalized-losses, because the number of affected rows is significant.

In [120]:
# Cast every selected column to float64 so that the columns read in as
# strings (object dtype) become numeric.
cars_new = cars_new.astype("float64")
# info() prints its own report and returns None, hence the extra "None" line.
print(cars_new.info())
# Missing-value count per column after the cast.
print(cars_new.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 14 columns):
normalized-losses    164 non-null float64
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null float64
bore                 201 non-null float64
stroke               201 non-null float64
compression-rate     205 non-null float64
horsepower           203 non-null float64
peak-rpm             203 non-null float64
city-mpg             205 non-null float64
highway-mpg          205 non-null float64
price                201 non-null float64
dtypes: float64(14)
memory usage: 22.5 KB
None
normalized-losses    41
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm            

In [121]:
# Make sure every column is numeric before computing statistics on it.
cars_new = cars_new.astype(dtype="float64")

# Series.mean() skips NaN by default, so this is the mean of the known values.
norm_loss_mean = cars_new["normalized-losses"].mean()

# Fill the gaps with the column mean in one vectorised step.
# Writing into the `row` objects yielded by DataFrame.iterrows() is not a
# reliable way to do this: pandas may hand back copies, in which case the
# assignment never reaches the frame.
cars_new["normalized-losses"] = cars_new["normalized-losses"].fillna(norm_loss_mean)

print(cars_new["normalized-losses"])

0      122.0
1      122.0
2      122.0
3      164.0
4      164.0
5      122.0
6      158.0
7      122.0
8      158.0
9      122.0
10     192.0
11     192.0
12     188.0
13     188.0
14     122.0
15     122.0
16     122.0
17     122.0
18     121.0
19      98.0
20      81.0
21     118.0
22     118.0
23     118.0
24     148.0
25     148.0
26     148.0
27     148.0
28     110.0
29     145.0
       ...  
175     65.0
176     65.0
177     65.0
178    197.0
179    197.0
180     90.0
181    122.0
182    122.0
183    122.0
184     94.0
185     94.0
186     94.0
187     94.0
188     94.0
189    122.0
190    256.0
191    122.0
192    122.0
193    122.0
194    103.0
195     74.0
196    103.0
197     74.0
198    103.0
199     74.0
200     95.0
201     95.0
202     95.0
203     95.0
204     95.0
Name: normalized-losses, dtype: float64


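To rescale all our numeric feature values to lie between 0 and 1, we apply min-max normalisation to each column, $x' = (x - x_{\min}) / (x_{\max} - x_{\min})$. (This is not a z-score, which would instead subtract the mean and divide by the standard deviation.) The target column, price, is kept in its original units.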

In [122]:
# Min-max scale every feature column to [0, 1]; the target is left untouched.
feature_frame = cars_new.drop("price", axis=1)
col_min = feature_frame.min()
col_span = feature_frame.max() - col_min
cars_normalised = (feature_frame - col_min) / col_span

# Re-attach the target in its original units as the last column.
cars_normalised["price"] = cars_new["price"]

# Discard any row that still has a missing value in a feature or in price.
cars_normalised = cars_normalised.dropna()
print(cars_normalised.head(2))

   normalized-losses  wheel-base    length     width    height  curb-weight  \
0           0.298429    0.058309  0.413433  0.316667  0.083333     0.411171   
1           0.298429    0.058309  0.413433  0.316667  0.083333     0.411171   

       bore    stroke  compression-rate  horsepower  peak-rpm  city-mpg  \
0  0.664286  0.290476             0.125      0.2625  0.346939  0.222222   
1  0.664286  0.290476             0.125      0.2625  0.346939  0.222222   

   highway-mpg    price  
0     0.289474  13495.0  
1     0.289474  16500.0  


## K-Nearest Neighbor Function

In [127]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

def knn_train_test(training_col,target_col,df,k):
    """Fit a one-feature k-NN regressor on half of ``df`` and return its test RMSE.

    The rows of ``df`` are shuffled with NumPy's global random generator
    seeded to 1 (so the split is the same on every call), the first half of
    the shuffled rows (rounded down) is used for fitting and the remaining
    rows for evaluation.

    Parameters
    ----------
    training_col : label of the single column used as the feature.
    target_col : label of the column to predict.
    df : DataFrame containing both columns.
    k : number of neighbours passed to ``KNeighborsRegressor``.

    Returns
    -------
    The root mean squared error of the predictions on the held-out rows.
    """
    # Note: this reseeds NumPy's global random state on every call.
    np.random.seed(1)
    shuffled = df.reindex(np.random.permutation(df.index))

    split_at = int(len(shuffled) / 2)
    train_rows, test_rows = shuffled[0:split_at], shuffled[split_at:]

    model = KNeighborsRegressor(n_neighbors=k)
    # Double brackets keep the feature as a 2-D frame, as scikit-learn expects.
    model.fit(train_rows[[training_col]], train_rows[target_col])
    predicted = model.predict(test_rows[[training_col]])

    return np.sqrt(mean_squared_error(test_rows[target_col], predicted))

# Feature columns to evaluate one at a time (everything except the target).
numeric_cols = [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'bore', 'stroke', 'compression-rate', 'horsepower',
    'peak-rpm', 'city-mpg', 'highway-mpg',
]

# Test RMSE of a single-feature model with k=5, keyed by feature name.
rmses = {
    feature: knn_train_test(feature, "price", cars_normalised, 5)
    for feature in numeric_cols
}

# Show the features ranked from lowest to highest error.
rmses_series = pd.Series(rmses)
rmses_series.sort_values()
    

highway-mpg          4409.808046
city-mpg             4437.412782
horsepower           4568.145411
curb-weight          4594.668401
width                4941.109798
length               5210.375156
wheel-base           5692.794469
compression-rate     7020.882648
bore                 7076.775614
stroke               7217.395544
peak-rpm             7313.971818
height               7574.683028
normalized-losses    7726.404961
dtype: float64

In [134]:
import matplotlib.pyplot as plt
import collections
%matplotlib inline

# Odd neighbour counts from 1 to 9 to try for each feature.
k_vals = list(range(1, 10, 2))
# Results accumulator: {feature name: {k: RMSE}}.
rmses_first = dict()


def train_col(col,k_vals):
    """Evaluate ``col`` as the only feature for every k in ``k_vals``.

    Each RMSE returned by ``knn_train_test`` (predicting "price" from
    ``cars_normalised``) is stored in the module-level ``rmses_first`` dict
    under ``rmses_first[col][k]``; the inner dict is created the first time
    a result for ``col`` is recorded.

    Returns the (shared, mutated) ``rmses_first`` dict.
    """
    for k in k_vals:
        # The right-hand side is evaluated first, so no entry is created for
        # `col` if the evaluation itself fails.
        rmses_first.setdefault(col, {})[k] = knn_train_test(col, "price", cars_normalised, k)
    return rmses_first

# Populate rmses_first with one {k: RMSE} entry per feature column.
for feature in numeric_cols:
    train_col(feature, k_vals)

print(rmses_first)
    

{'normalized-losses': {1: 7676.0580511146236, 3: 8218.2274928792613, 9: 7382.2205706372624, 5: 7726.4049614134974, 7: 7611.9648842053712}, 'width': {1: 6247.1396099432941, 3: 5418.8405320712418, 9: 4465.4104928172692, 5: 4941.1097982918282, 7: 4518.999877075079}, 'curb-weight': {1: 6739.4325375784738, 3: 5187.9040393973619, 9: 4175.9670956619611, 5: 4594.6684014391803, 7: 4125.2882939403898}, 'length': {1: 5294.1220746046574, 3: 5152.0756584800802, 9: 5345.1776009468467, 5: 5210.3751558299327, 7: 5380.4060898012467}, 'wheel-base': {1: 5718.710483302747, 3: 5718.484995916845, 9: 5840.7985926626843, 5: 5692.7944690581253, 7: 5725.9061251817602}, 'highway-mpg': {1: 5591.2869160463424, 3: 4674.1029962686844, 9: 4234.824050221162, 5: 4409.8080459404146, 7: 4269.119331662173}, 'bore': {1: 9782.2410556798568, 3: 6987.8411884541774, 9: 7654.9526899871207, 5: 7076.7756136620901, 7: 7604.1323167957144}, 'height': {1: 9144.6891295172081, 3: 8163.5394698462114, 9: 7609.841226601191, 5: 7574.683028