In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
df = pd.read_csv('Assignment 2 Advertising.csv', index_col=0)
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [3]:
df.shape

(200, 4)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 7.8 KB


In [5]:
df.describe(include='all')

Unnamed: 0,TV,Radio,Newspaper,Sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,14.0225
std,85.854236,14.846809,21.778621,5.217457
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,10.375
50%,149.75,22.9,25.75,12.9
75%,218.825,36.525,45.1,17.4
max,296.4,49.6,114.0,27.0


In [6]:
df.isnull().sum()

TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64

## Data Transformation

### One - hot encoding

In [7]:
df_new = pd.get_dummies(df, drop_first=True)
df_new.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [8]:
df_new.shape

(200, 4)

## Data Splitting

In [9]:
X = df_new.drop('Sales', axis=1)
Y = df_new.Sales

In [10]:
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.2, random_state=100)

## Data Scaling

In [11]:
train_y = train_y.to_numpy().reshape(-1, 1)
test_y = test_y.to_numpy().reshape(-1, 1)

In [12]:
scale_x = MinMaxScaler().fit(train_x)
scale_y = MinMaxScaler().fit(train_y)

train_x = scale_x.transform(train_x)
train_y = scale_y.transform(train_y)

In [13]:
tran_x = StandardScaler().fit(train_x)
tran_y = StandardScaler().fit(train_y)

train_x = tran_x.transform(train_x)
train_y = tran_y.transform(train_y)

In [14]:
test_x = scale_x.transform(test_x)
test_x = tran_x.transform(test_x)

test_y = scale_y.transform(test_y)
test_y = tran_y.transform(test_y)

## Model

In [15]:
# para = {
#     'n_neighbors': [3, 5, 7, 12],
#     'weights': ['uniform', 'distance'],
#     'metric': ['minkowski', 'manhattan', 'euclidean']
# }

para =  {
    "n_neighbors": np.arange(1,21,2), #odd values of k from 1 to 20
    "weights": ["uniform", "distance"], 
    "metric": ["eucledian", "manhattan", "minkowski"]
}

In [16]:
dia_reg = GridSearchCV(KNeighborsRegressor(), para, cv=10)

In [17]:
dia_reg.fit(train_x, train_y)

200 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\pandi\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\pandi\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\pandi\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\pandi\AppData\Local\Programs\Python\Python310\l

In [18]:
dia_reg.best_score_

np.float64(0.942829208129125)

In [19]:
dia_reg.best_params_

{'metric': 'manhattan', 'n_neighbors': np.int64(5), 'weights': 'distance'}

In [20]:
reg = KNeighborsRegressor(**dia_reg.best_params_)

In [21]:
reg.fit(train_x, train_y)

In [22]:
pred = reg.predict(test_x)

## Evaluation

In [23]:
r2_score(test_y, pred)

0.9688523411980376

In [24]:
test_y = tran_y.inverse_transform(test_y)
test_y = scale_y.inverse_transform(test_y)

pred = tran_y.inverse_transform(pred)
pred = scale_y.inverse_transform(pred)

In [25]:
r2_score(pred, test_y)

0.966776604557301

In [26]:
mean_absolute_error(pred, test_y)

0.5597207857021125

In [27]:
mean_squared_error(pred, test_y)

0.6619171451446937