# Predicting Car Prices Using K-nearest Neighbors

In this project, I will use the K-nearest neighbors model to predict a car's market price using its attributes. The data set I will be working with contains information on various cars. For each car I have information about the technical aspects of the vehicle such as the motor's displacement, the weight of the car, the miles per gallon, how fast the car accelerates, and more. The data set can be downloaded [here](https://archive.ics.uci.edu/ml/datasets/automobile).

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score,KFold
%matplotlib inline

In [None]:
# Column labels handed to read_csv through `names`, listed in file order
cols = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight',
    'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
    'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm',
    'city-mpg', 'highway-mpg', 'price',
]

# Load the raw data into a DataFrame labelled with the names above
cars = pd.read_csv('imports-85.data', names=cols)

In [None]:
# Explore the data: first rows, then column dtypes and non-null counts
print(cars.head())
print('\n')
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
cars.info()

## Data Cleaning

In [None]:
# Treat '?' entries as missing by converting them to NaN (modifies `cars` in place)
cars.replace(to_replace='?', value=np.nan, inplace=True)

In [None]:
# Columns that will be treated as numeric (the target 'price' is included)
numerical_cols = [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]

# Select those columns and cast every one of them to float
cars_numerical = cars.loc[:, numerical_cols].astype(float)

In [None]:
# Drop every row whose price is missing
cars_numerical = cars_numerical.dropna(subset=['price'])

# Fill the remaining gaps with the mean of the respective column
cars_numerical = cars_numerical.fillna(cars_numerical.mean())

In [None]:
# Rescale every feature with min-max normalization; the 'price' column is
# excluded and therefore stays unscaled in `cars_numerical`.
scaler = MinMaxScaler()
cars_numerical_scaled = scaler.fit_transform(cars_numerical.drop('price', axis=1))

# fit_transform returns a plain array, so rebuild the DataFrame. Reuse the row
# labels of `cars_numerical`: rows without a price were dropped from it, so its
# index has gaps, and a default 0..n-1 index here would no longer match it for
# any label-based operation between the two frames.
cars_numerical_scaled = pd.DataFrame(
    cars_numerical_scaled,
    columns=cars_numerical.columns.drop('price'),
    index=cars_numerical.index,
)


## Univariate Model

In [None]:
def knn_train_test(train_col, target_col):
    """Return the hold-out RMSE of a KNN regressor trained on one feature.

    ``train_col`` names the single predictor column, read from
    ``cars_numerical_scaled``; ``target_col`` names the target column, read
    from ``cars_numerical``. The regressor is created with its default
    settings.
    """
    features = cars_numerical_scaled[[train_col]]
    target = cars_numerical[target_col]

    # Split the data into two halves (train/test) with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.5, random_state=1
    )

    # Fit on the training half and score on the held-out half
    model = KNeighborsRegressor()
    model.fit(X_train, y_train)
    mse = mean_squared_error(y_test, model.predict(X_test))
    return mse ** 0.5

# Every column except the target is a candidate feature
feature_cols = cars_numerical.columns.drop('price')

# Root mean square error of a single-feature model, one entry per feature
one_fea_rmses = pd.Series(
    {col: knn_train_test(col, 'price') for col in feature_cols}
)
one_fea_rmses

In [None]:
def knn_train_test_k(train_col, target_col, k_values=(1, 3, 5, 7, 9)):
    """Return hold-out RMSEs of single-feature KNN models for several k.

    Parameters:
        train_col: name of the predictor column in ``cars_numerical_scaled``.
        target_col: name of the target column in ``cars_numerical``.
        k_values: iterable of neighbor counts to evaluate. Defaults to
            ``(1, 3, 5, 7, 9)``, so existing two-argument calls behave as
            before.

    Returns:
        dict mapping each k to the RMSE measured on the test half.
    """
    X = cars_numerical_scaled[[train_col]]
    y = cars_numerical[target_col]

    # One fixed 50/50 split is shared by all k so the scores are comparable
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=1
    )

    k_rmses = {}
    for k in k_values:
        # Fit model using k nearest neighbors
        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(X_train, y_train)
        predictions = knn.predict(X_test)
        k_rmses[k] = mean_squared_error(y_test, predictions) ** 0.5
    return k_rmses

# One column per feature, one row per k value
k_one_fea_rmses = pd.DataFrame(
    {col: knn_train_test_k(col, 'price') for col in feature_cols}
)
k_one_fea_rmses

In [None]:
# Draw RMSE against k, one line per feature
k_one_fea_rmses.plot()
plt.xlabel('k value')
# Anchor the legend just outside the right edge of the axes
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.xticks([1, 3, 5, 7, 9])
plt.ylabel('RMSE')

# Engine-size is the best feature to predict price

## Multivariate Model

In [None]:
def knn_train_test_multi(train_cols, target_col):
    """Return the hold-out RMSE of a KNN regressor trained on several features.

    ``train_cols`` selects the predictor columns from
    ``cars_numerical_scaled``; ``target_col`` names the target column in
    ``cars_numerical``. The regressor is created with its default settings.
    """
    features = cars_numerical_scaled[train_cols]
    target = cars_numerical[target_col]

    # Fixed-seed 50/50 split into training and test halves
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.5, random_state=1
    )

    model = KNeighborsRegressor()
    model.fit(X_train, y_train)
    mse = mean_squared_error(y_test, model.predict(X_test))
    return mse ** 0.5

# Rank the features by single-feature RMSE, lowest (best) first
best_features = one_fea_rmses.sort_values()

# RMSE of the model trained on the best 2, 3, 4 and 5 features
multi_fea_rmses = {
    '{} best features'.format(n_fea): knn_train_test_multi(
        best_features.index[:n_fea], 'price'
    )
    for n_fea in range(2, 6)
}
multi_fea_rmses
# The four best features generate the smallest RMSE

## Hyperparameter Tuning

In [None]:
def knn_train_test_multi_k(train_cols, target_col, k_values=range(1, 26)):
    """Return hold-out RMSEs of multi-feature KNN models for several k.

    Parameters:
        train_cols: predictor columns to select from ``cars_numerical_scaled``.
        target_col: name of the target column in ``cars_numerical``.
        k_values: iterable of neighbor counts to evaluate. Defaults to
            ``range(1, 26)``, so existing two-argument calls behave as
            before.

    Returns:
        dict mapping each k to the RMSE measured on the test half.
    """
    X = cars_numerical_scaled[train_cols]
    y = cars_numerical[target_col]

    # One fixed 50/50 split is shared by all k so the scores are comparable
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=1
    )

    k_rmses = {}
    for k in k_values:
        # Fit the model with k nearest neighbors
        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(X_train, y_train)
        predictions = knn.predict(X_test)
        k_rmses[k] = mean_squared_error(y_test, predictions) ** 0.5
    return k_rmses

# One column per feature-set size (best 2 to 5 features), one row per k value
k_multi_fea_rmses = pd.DataFrame({
    '{} best features'.format(n_fea): knn_train_test_multi_k(
        best_features.index[:n_fea], 'price'
    )
    for n_fea in range(2, 6)
})

In [None]:
# Plot RMSE against k, one line per column of k_multi_fea_rmses
k_multi_fea_rmses.plot()
plt.xlabel('k value')
plt.ylabel('RMSE')


## K-fold Cross Validation 

In [None]:
def knn_cross_val(train_col, target_col):
    """Return the mean 5-fold cross-validated RMSE for a single feature.

    ``train_col`` names the predictor column in ``cars_numerical_scaled`` and
    ``target_col`` the target column in ``cars_numerical``. The folds are
    shuffled with a fixed seed and the regressor uses its default settings.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=1)
    neg_mses = cross_val_score(
        KNeighborsRegressor(),
        cars_numerical_scaled[[train_col]],
        cars_numerical[target_col],
        scoring='neg_mean_squared_error',
        cv=folds,
    )
    # The scorer yields negated MSEs; take the magnitude before the root
    fold_rmses = np.sqrt(np.absolute(neg_mses))
    return np.mean(fold_rmses)

# Mean cross-validated RMSE of a single-feature model, one entry per feature
cross_val_rmses = pd.Series(
    {col: knn_cross_val(col, 'price') for col in feature_cols}
)
cross_val_rmses

In [None]:
def knn_cross_val_k_fold(train_col, target_col):
    """Return mean cross-validated RMSEs for fold counts 2 through 20.

    ``train_col`` names the single predictor column in
    ``cars_numerical_scaled`` and ``target_col`` the target column in
    ``cars_numerical``. The result is a dict mapping each number of folds
    to the RMSE averaged over those folds.
    """
    features = cars_numerical_scaled[[train_col]]
    target = cars_numerical[target_col]

    def mean_rmse(n_folds):
        # Shuffled folds with a fixed seed; default-settings regressor
        splitter = KFold(n_splits=n_folds, shuffle=True, random_state=1)
        neg_mses = cross_val_score(
            KNeighborsRegressor(),
            features,
            target,
            scoring='neg_mean_squared_error',
            cv=splitter,
        )
        # The scorer yields negated MSEs; take the magnitude before the root
        return np.mean(np.sqrt(np.absolute(neg_mses)))

    return {n_folds: mean_rmse(n_folds) for n_folds in range(2, 21)}

# One column per feature, one row per number of folds
k_fold_cross_val_rmses = pd.DataFrame(
    {col: knn_cross_val_k_fold(col, 'price') for col in feature_cols}
)


In [None]:
# Draw mean RMSE against the number of folds, one line per feature
k_fold_cross_val_rmses.plot()
plt.xlabel('k value')
# Anchor the legend just outside the right edge of the axes
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.xticks(range(2, 21))
plt.ylabel('RMSE')

# Engine size proves to be the best feature for predicting car price