In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import pydotplus
from sklearn import tree, metrics, model_selection, preprocessing
from IPython.display import Image, display
#from utils import plot_decision

# Data Import and preparation

In [2]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [3]:
from ucimlrepo import fetch_ucirepo

# Fetch the UCI "Automobile" dataset (repository id 10).
automobile = fetch_ucirepo(id=10)

# Features / targets as pandas DataFrames. Take copies so that any later in-place
# edit of X or y never alters the frames stored inside `automobile`; re-reading
# `automobile.data` therefore always yields the pristine data.
X = automobile.data.features.copy()
y = automobile.data.targets.copy()

# metadata
print(automobile.metadata)

# variable information
print(automobile.variables)


{'uci_id': 10, 'name': 'Automobile', 'repository_url': 'https://archive.ics.uci.edu/dataset/10/automobile', 'data_url': 'https://archive.ics.uci.edu/static/public/10/data.csv', 'abstract': "From 1985 Ward's Automotive Yearbook", 'area': 'Other', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 205, 'num_features': 25, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['symboling'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1985, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5B01C', 'creators': ['Jeffrey Schlimmer'], 'intro_paper': None, 'additional_info': {'summary': 'This data set consists of three types of entities: (a) the specification of an auto in terms of various characteristics, (b) its assigned insurance risk rating, (c) its normalized losses in use as compared to other cars.  The second rating corresponds to the degree to which th

In [4]:
# Peek at the first five feature rows.
X.head(n=5)

Unnamed: 0,price,highway-mpg,city-mpg,peak-rpm,horsepower,compression-ratio,stroke,bore,fuel-system,engine-size,...,length,wheel-base,engine-location,drive-wheels,body-style,num-of-doors,aspiration,fuel-type,make,normalized-losses
0,13495.0,27,21,5000.0,111.0,9.0,2.68,3.47,mpfi,130,...,168.8,88.6,front,rwd,convertible,2.0,std,gas,alfa-romero,
1,16500.0,27,21,5000.0,111.0,9.0,2.68,3.47,mpfi,130,...,168.8,88.6,front,rwd,convertible,2.0,std,gas,alfa-romero,
2,16500.0,26,19,5000.0,154.0,9.0,3.47,2.68,mpfi,152,...,171.2,94.5,front,rwd,hatchback,2.0,std,gas,alfa-romero,
3,13950.0,30,24,5500.0,102.0,10.0,3.4,3.19,mpfi,109,...,176.6,99.8,front,fwd,sedan,4.0,std,gas,audi,164.0
4,17450.0,22,18,5500.0,115.0,8.0,3.4,3.19,mpfi,136,...,176.6,99.4,front,4wd,sedan,4.0,std,gas,audi,164.0


In [5]:
# Attach the dataset's original target (`symboling`) as an additional feature column.
# The frame is rebuilt from `automobile` rather than assigned into X in place, so this
# cell is idempotent and still works when re-run after `y` has been rebound to the
# price column further down (the in-place version would then raise KeyError).
X = automobile.data.features.assign(symboling=automobile.data.targets['symboling'])
X.head()

Unnamed: 0,price,highway-mpg,city-mpg,peak-rpm,horsepower,compression-ratio,stroke,bore,fuel-system,engine-size,...,wheel-base,engine-location,drive-wheels,body-style,num-of-doors,aspiration,fuel-type,make,normalized-losses,symboling
0,13495.0,27,21,5000.0,111.0,9.0,2.68,3.47,mpfi,130,...,88.6,front,rwd,convertible,2.0,std,gas,alfa-romero,,3
1,16500.0,27,21,5000.0,111.0,9.0,2.68,3.47,mpfi,130,...,88.6,front,rwd,convertible,2.0,std,gas,alfa-romero,,3
2,16500.0,26,19,5000.0,154.0,9.0,3.47,2.68,mpfi,152,...,94.5,front,rwd,hatchback,2.0,std,gas,alfa-romero,,1
3,13950.0,30,24,5500.0,102.0,10.0,3.4,3.19,mpfi,109,...,99.8,front,fwd,sedan,4.0,std,gas,audi,164.0,2
4,17450.0,22,18,5500.0,115.0,8.0,3.4,3.19,mpfi,136,...,99.4,front,4wd,sedan,4.0,std,gas,audi,164.0,2


In [6]:
# Number of missing values per column before any cleaning.
X.isnull().sum()

Unnamed: 0,0
price,4
highway-mpg,0
city-mpg,0
peak-rpm,2
horsepower,2
compression-ratio,0
stroke,4
bore,4
fuel-system,0
engine-size,0


In [7]:
# Rows without a price cannot serve as regression examples, so discard them.
# Reassign instead of using `inplace=True`: the result is a new frame, the cell can be
# re-run safely, and no other reference to the original frame is silently modified.
X = X.dropna(subset=['price'])
X.head()

Unnamed: 0,price,highway-mpg,city-mpg,peak-rpm,horsepower,compression-ratio,stroke,bore,fuel-system,engine-size,...,wheel-base,engine-location,drive-wheels,body-style,num-of-doors,aspiration,fuel-type,make,normalized-losses,symboling
0,13495.0,27,21,5000.0,111.0,9.0,2.68,3.47,mpfi,130,...,88.6,front,rwd,convertible,2.0,std,gas,alfa-romero,,3
1,16500.0,27,21,5000.0,111.0,9.0,2.68,3.47,mpfi,130,...,88.6,front,rwd,convertible,2.0,std,gas,alfa-romero,,3
2,16500.0,26,19,5000.0,154.0,9.0,3.47,2.68,mpfi,152,...,94.5,front,rwd,hatchback,2.0,std,gas,alfa-romero,,1
3,13950.0,30,24,5500.0,102.0,10.0,3.4,3.19,mpfi,109,...,99.8,front,fwd,sedan,4.0,std,gas,audi,164.0,2
4,17450.0,22,18,5500.0,115.0,8.0,3.4,3.19,mpfi,136,...,99.4,front,4wd,sedan,4.0,std,gas,audi,164.0,2


In [8]:
# Re-check missing values: `price` should now report zero.
X.isnull().sum()

Unnamed: 0,0
price,0
highway-mpg,0
city-mpg,0
peak-rpm,2
horsepower,2
compression-ratio,0
stroke,4
bore,4
fuel-system,0
engine-size,0


In [9]:
# Separate the regression target (price, kept as a one-column DataFrame)
# from the remaining predictor columns.
y = X[['price']]
x = X.drop(columns=['price'])

In [10]:
# Confirm the target contains no missing values.
y.isnull().sum()

Unnamed: 0,0
price,0


In [11]:
# First five target values.
y.head(n=5)

Unnamed: 0,price
0,13495.0
1,16500.0
2,16500.0
3,13950.0
4,17450.0


In [12]:
# Remove `normalized-losses` from the predictors
# (presumably because it is missing for many rows -- TODO confirm the rationale).
# NOTE: despite its name, `x_train` still holds *all* rows at this point.
x_train = x.drop(columns=['normalized-losses'])
x_train

Unnamed: 0,highway-mpg,city-mpg,peak-rpm,horsepower,compression-ratio,stroke,bore,fuel-system,engine-size,num-of-cylinders,...,length,wheel-base,engine-location,drive-wheels,body-style,num-of-doors,aspiration,fuel-type,make,symboling
0,27,21,5000.0,111.0,9.0,2.68,3.47,mpfi,130,4,...,168.8,88.6,front,rwd,convertible,2.0,std,gas,alfa-romero,3
1,27,21,5000.0,111.0,9.0,2.68,3.47,mpfi,130,4,...,168.8,88.6,front,rwd,convertible,2.0,std,gas,alfa-romero,3
2,26,19,5000.0,154.0,9.0,3.47,2.68,mpfi,152,6,...,171.2,94.5,front,rwd,hatchback,2.0,std,gas,alfa-romero,1
3,30,24,5500.0,102.0,10.0,3.40,3.19,mpfi,109,4,...,176.6,99.8,front,fwd,sedan,4.0,std,gas,audi,2
4,22,18,5500.0,115.0,8.0,3.40,3.19,mpfi,136,5,...,176.6,99.4,front,4wd,sedan,4.0,std,gas,audi,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,28,23,5400.0,114.0,9.5,3.15,3.78,mpfi,141,4,...,188.8,109.1,front,rwd,sedan,4.0,std,gas,volvo,-1
201,25,19,5300.0,160.0,8.7,3.15,3.78,mpfi,141,4,...,188.8,109.1,front,rwd,sedan,4.0,turbo,gas,volvo,-1
202,23,18,5500.0,134.0,8.8,2.87,3.58,mpfi,173,6,...,188.8,109.1,front,rwd,sedan,4.0,std,gas,volvo,-1
203,27,26,4800.0,106.0,23.0,3.40,3.01,idi,145,6,...,188.8,109.1,front,rwd,sedan,4.0,turbo,diesel,volvo,-1


In [13]:
# Impute the remaining gaps by carrying the previous row's value forward.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`, which emits a
# FutureWarning on current pandas; the result is the same.
# Caveat: a missing value in the very first row has nothing to copy from and stays NaN.
x_train = x_train.ffill()
x_train

  x_train = x_train.fillna(method='ffill')


Unnamed: 0,highway-mpg,city-mpg,peak-rpm,horsepower,compression-ratio,stroke,bore,fuel-system,engine-size,num-of-cylinders,...,length,wheel-base,engine-location,drive-wheels,body-style,num-of-doors,aspiration,fuel-type,make,symboling
0,27,21,5000.0,111.0,9.0,2.68,3.47,mpfi,130,4,...,168.8,88.6,front,rwd,convertible,2.0,std,gas,alfa-romero,3
1,27,21,5000.0,111.0,9.0,2.68,3.47,mpfi,130,4,...,168.8,88.6,front,rwd,convertible,2.0,std,gas,alfa-romero,3
2,26,19,5000.0,154.0,9.0,3.47,2.68,mpfi,152,6,...,171.2,94.5,front,rwd,hatchback,2.0,std,gas,alfa-romero,1
3,30,24,5500.0,102.0,10.0,3.40,3.19,mpfi,109,4,...,176.6,99.8,front,fwd,sedan,4.0,std,gas,audi,2
4,22,18,5500.0,115.0,8.0,3.40,3.19,mpfi,136,5,...,176.6,99.4,front,4wd,sedan,4.0,std,gas,audi,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,28,23,5400.0,114.0,9.5,3.15,3.78,mpfi,141,4,...,188.8,109.1,front,rwd,sedan,4.0,std,gas,volvo,-1
201,25,19,5300.0,160.0,8.7,3.15,3.78,mpfi,141,4,...,188.8,109.1,front,rwd,sedan,4.0,turbo,gas,volvo,-1
202,23,18,5500.0,134.0,8.8,2.87,3.58,mpfi,173,6,...,188.8,109.1,front,rwd,sedan,4.0,std,gas,volvo,-1
203,27,26,4800.0,106.0,23.0,3.40,3.01,idi,145,6,...,188.8,109.1,front,rwd,sedan,4.0,turbo,diesel,volvo,-1


In [14]:
# Verify that imputation left no missing values in the predictors.
x_train.isnull().sum()

Unnamed: 0,0
highway-mpg,0
city-mpg,0
peak-rpm,0
horsepower,0
compression-ratio,0
stroke,0
bore,0
fuel-system,0
engine-size,0
num-of-cylinders,0


In [15]:
# Column dtypes: `object` columns are categorical and need encoding before modelling.
x_train.dtypes

Unnamed: 0,0
highway-mpg,int64
city-mpg,int64
peak-rpm,float64
horsepower,float64
compression-ratio,float64
stroke,float64
bore,float64
fuel-system,object
engine-size,int64
num-of-cylinders,int64


In [16]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Partition the columns by dtype: string columns are one-hot encoded, every numeric
# column is standardized. `include='number'` catches all numeric dtypes (not only
# int64/float64), so no numeric column silently falls through to the unscaled remainder.
categorical_features = x_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = x_train.select_dtypes(include='number').columns.tolist()

# NOTE(review): the encoder and scaler are fitted on *all* rows, i.e. before the
# train/test split, so test-set statistics leak into the features. Fitting on the
# training rows only (e.g. via a Pipeline) would remove that leak.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_features),
                  ('scalar', StandardScaler(), numerical_features)],
    remainder='passthrough',  # Keep other columns unchanged
    sparse_threshold=0,       # Always return a dense array so it can be wrapped in a DataFrame
)

# Fit and transform the data
encoded_values = ct.fit_transform(x_train)

# Rebuild a DataFrame with readable column names. Reusing x_train's index keeps the
# row labels aligned with `y` (rows were dropped earlier, so a fresh 0..n-1 index
# would no longer match the target's labels).
x_train_encoded = pd.DataFrame(encoded_values,
                               columns=ct.get_feature_names_out(),
                               index=x_train.index)

# Verify the changes
x_train_encoded.head()


Unnamed: 0,encoder__fuel-system_1bbl,encoder__fuel-system_2bbl,encoder__fuel-system_4bbl,encoder__fuel-system_idi,encoder__fuel-system_mfi,encoder__fuel-system_mpfi,encoder__fuel-system_spdi,encoder__fuel-system_spfi,encoder__engine-type_dohc,encoder__engine-type_l,...,scalar__bore,scalar__engine-size,scalar__num-of-cylinders,scalar__curb-weight,scalar__height,scalar__width,scalar__length,scalar__wheel-base,scalar__num-of-doors,scalar__symboling
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.534986,0.075389,-0.34366,-0.014858,-2.034081,-0.85346,-0.439409,-1.685107,-1.156378,1.72505
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.534986,0.075389,-0.34366,-0.014858,-2.034081,-0.85346,-0.439409,-1.685107,-1.156378,1.72505
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-2.394339,0.606234,1.548823,0.51808,-0.559713,-0.185597,-0.244152,-0.710103,-1.156378,0.127193
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.503256,-0.431327,-0.34366,-0.423766,0.218425,0.148335,0.195176,0.165748,0.864769,0.926121
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.503256,0.220165,0.602582,0.520017,0.218425,0.243744,0.195176,0.099646,0.864769,0.926121


In [17]:
# Randomly hold out 20% of the rows for testing (80% remain for training);
# the fixed random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    x_train_encoded,
    y,
    test_size=0.2,
    random_state=0,
)

In [18]:
# First five training targets (row labels show the shuffled order).
y_train.head(n=5)

Unnamed: 0,price
69,28176.0
27,8558.0
116,17950.0
172,17669.0
66,18344.0


# Training the model

In [22]:
from sklearn.neighbors import KNeighborsRegressor

# Build a k-nearest-neighbours regressor (k=5, adjust as needed) and train it on the
# training split. `fit` returns the estimator itself, so construction and fitting
# are chained; the trailing expression displays the fitted model.
knn_regressor = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
knn_regressor

# Testing the model

In [23]:
# Predict prices for the held-out rows and actually score them: without these metrics
# the test split is never evaluated anywhere in the notebook.
y_pred = knn_regressor.predict(X_test)

# RMSE is in the same unit as the price; R-squared is the share of variance explained.
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
test_r2 = metrics.r2_score(y_test, y_pred)

print("Test RMSE:", test_rmse)
print("Test R-squared:", test_r2)

In [29]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, cross_val_predict
import numpy as np
from sklearn.metrics import r2_score

# Cross-validate a *separate*, unfitted estimator with the same setting (k=5).
# Rebinding `knn_regressor` here would replace the already-fitted model from the
# training step with an unfitted one and break any later `knn_regressor.predict(...)`.
knn_cv = KNeighborsRegressor(n_neighbors=5)

# 5-fold cross-validation on the training split (adjust the number of folds as needed).
# NOTE(review): X_train was scaled/encoded before splitting, so each validation fold
# has already influenced the preprocessing; scores may be slightly optimistic.
cv_scores = cross_val_score(knn_cv, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# scikit-learn reports negated MSE; flip the sign and take the root to get RMSE per fold.
rmse_scores = np.sqrt(-cv_scores)

# Out-of-fold predictions for every training row, used for a single pooled R-squared.
# NOTE: `y_pred` now refers to these training-row predictions and overwrites any
# earlier test-set predictions stored under the same name.
y_pred = cross_val_predict(knn_cv, X_train, y_train, cv=5)

# Calculate R-squared
r2 = r2_score(y_train, y_pred)

# Print the RMSE scores, average RMSE, and R-squared
print("RMSE scores for each fold:", rmse_scores)
print("Average RMSE:", np.mean(rmse_scores))
print("R-squared:", r2)

RMSE scores for each fold: [4333.65435098 2572.08381327 5294.85687767 2850.31635371 2517.67464563]
Average RMSE: 3513.7172082528655
R-squared: 0.7703138700336044
