# Homework # 3 - K-Nearest Neighbor - Solution
Data file: social_network_ads.csv

### Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### Load data

In [2]:
! ls -l 'data/social_network_ads.csv'

-rw-rw-r--@ 1 vj  staff  10926 Apr  7  2020 data/social_network_ads.csv


In [3]:
# Read CSV into dataframe
df = pd.read_csv('data/social_network_ads.csv')
df.shape

(400, 5)

In [4]:
# Display first few rows
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


### Drop unnecessary columns 'User ID' and 'Gender'

In [5]:
df.drop(['User ID', 'Gender'], axis=1, inplace=True)
df.head()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


### Explore data

In [6]:
# Display distribution (min, max) of values for Age and EstimatedSalary variables
print('Age: min={}, max={}'.format(df['Age'].min(), df['Age'].max()))
print('EstimatedSalary: min={}, max={}'.format(df['EstimatedSalary'].min(), df['EstimatedSalary'].max()))

Age: min=18, max=60
EstimatedSalary: min=15000, max=150000


#### Note the scale for column EstimatedSalary is much higher than for column Age

### Separate independent and dependent variables
* Independent variables: All except Purchased
* Dependent variable: Purchased

In [7]:
# Prepare data for training and testing
X = df.drop("Purchased", axis = 1)
y = df["Purchased"]

### Split data into training and test sets

In [8]:
# Split into training (70%) and test data (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 52)

### Scale Age and EstimatedSalary variables

In [9]:
# Use StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [10]:
# Display distribution (min, max) of scaled values for Age and EstimatedSalary variables
print('Scaled Age (training): min={}, max={}'.format(X_train[1].min(), X_train[1].max()))
print('Scaled EstimatedSalary (training): min={}, max={}'.format(X_train[0].min(), X_train[0].max()))
print()
print('Scaled Age (test): min={}, max={}'.format(X_test[1].min(), X_test[1].max()))
print('Scaled EstimatedSalary (test): min={}, max={}'.format(X_test[0].min(), X_test[0].max()))

Scaled Age (training): min=-0.3669544783944666, max=0.19782108226835948
Scaled EstimatedSalary (training): min=-0.5427911936547929, max=-0.4660989883583254

Scaled Age (test): min=-0.42556671681457536, max=-0.08671609085736262
Scaled EstimatedSalary (test): min=-1.0351733346097696, max=-0.45487283602462975


### Train KNeighborsClassifier (with default hyperparameters)

In [11]:
# Defaults: n_neighbors=5, weights='uniform' metric='Euclidean'
model = KNeighborsClassifier()
model.fit(X_train, y_train)

KNeighborsClassifier()

### Evaulate model performance

In [12]:
# Predict using the Test set results
y_pred = model.predict(X_test)

In [13]:
# Display the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[69,  4],
       [ 8, 39]])

In [14]:
# Generate and print model performance metrics (accuracy) on test set
print('*************** Evaluation on Test Data ***************')
score_test = model.score(X_test, y_test)
print('Accuracy Score: ', score_test)

*************** Evaluation on Test Data ***************
Accuracy Score:  0.9


In [15]:
# Generate classification report to evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92        73
           1       0.91      0.83      0.87        47

    accuracy                           0.90       120
   macro avg       0.90      0.89      0.89       120
weighted avg       0.90      0.90      0.90       120



### Train KNeighborsClassifier (change hyperparameter: n_neighbors)

In [16]:
# Defaults: n_neighbors=5, weights='uniform' metric='Euclidean'
num_neighbors=3
model = KNeighborsClassifier(n_neighbors=num_neighbors)
model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

### Evaulate model performance

In [17]:
# Predict using the Test set results
y_pred = model.predict(X_test)

In [18]:
# Display the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[68,  5],
       [ 9, 38]])

In [19]:
# Generate and print model performance metrics (accuracy) on test set
print('*************** Evaluation on Test Data ***************')
score_test = model.score(X_test, y_test)
print('Accuracy Score: ', score_test)

*************** Evaluation on Test Data ***************
Accuracy Score:  0.8833333333333333


In [20]:
# Generate classification report to evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.93      0.91        73
           1       0.88      0.81      0.84        47

    accuracy                           0.88       120
   macro avg       0.88      0.87      0.88       120
weighted avg       0.88      0.88      0.88       120

