In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [0]:
# import and preprocess data
url = "abfss://training@sa8451learningdev.dfs.core.windows.net/interpretable_machine_learning/eml_data/Smarket.csv"
Smarket = spark.read.option("header", "true").csv(url).toPandas()
Smarket.set_index('SlNo', inplace=True)

str_cols = ["Direction"]
num_cols = list(set(Smarket.columns) - set(str_cols))
Smarket[str_cols] = Smarket[str_cols].astype(str)
Smarket[num_cols] = Smarket[num_cols].astype(float)

In [0]:
Smarket.head()

In [0]:
Smarket.info()

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X = np.array(Smarket[['Lag1', 'Lag2']])
y = np.array(Smarket['Direction'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2016, random_state=101)

**K-Nearest Neighbours (KNN) without standardisation (K = 1)**

In [0]:
from sklearn.neighbors import KNeighborsClassifier

In [0]:
knn_1 = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

In [0]:
knn_1_pred = knn_1.predict(X_test)

In [0]:
from sklearn.metrics import classification_report, confusion_matrix

In [0]:
print(confusion_matrix(y_test, knn_1_pred))

In [0]:
print(classification_report(y_test, knn_1_pred))

**K-Nearest Neighbours (KNN) without standardisation (K = 3)**

In [0]:
from sklearn.neighbors import KNeighborsClassifier

In [0]:
knn_3 = KNeighborsClassifier().fit(X_train, y_train)

In [0]:
knn_3_pred = knn_3.predict(X_test)

In [0]:
from sklearn.metrics import classification_report, confusion_matrix

In [0]:
print(confusion_matrix(y_test, knn_3_pred))

In [0]:
print(classification_report(y_test, knn_3_pred))

*As we can see, increasing the value of K marginally improves the precision of the model.*

**K-Nearest Neighbours (KNN) with standardisation (K = 1)**
<br><br>
**Why standardise?** *KNN classifies an observation by its distance to neighbouring observations, so predictors measured 
on a larger absolute scale dominate that distance. For example, if we classify on house price (where differences could 
be in the '000s of £) and age (where differences could be a few years), price would swamp age. Standardisation rescales 
every predictor to mean 0 and standard deviation 1, so that each one contributes to the distance on a comparable scale.*

In [0]:
from sklearn.preprocessing import StandardScaler

In [0]:
scaler_1 = StandardScaler()

In [0]:
scaler_1.fit(Smarket.drop(columns = 'Direction', axis = 1).astype(float))

In [0]:
scaled_features_1 = scaler_1.transform(Smarket.drop(columns = 'Direction', axis = 1).astype(float))

In [0]:
df_1 = pd.DataFrame(scaled_features_1, columns = Smarket.columns[:-1] )

In [0]:
df_1.head()

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features_1,Smarket['Direction'],
                                                    test_size=0.30)

In [0]:
from sklearn.neighbors import KNeighborsClassifier

In [0]:
knn_s_1 = KNeighborsClassifier(n_neighbors=1)

In [0]:
knn_s_1.fit(X_train, y_train)

In [0]:
knn_s_1_pred = knn_s_1.predict(X_test)

In [0]:
from sklearn.metrics import classification_report, confusion_matrix

In [0]:
print(confusion_matrix(y_test, knn_s_1_pred))

In [0]:
print(classification_report(y_test, knn_s_1_pred))

**K-Nearest Neighbours (KNN) with standardisation (K = 3)**

In [0]:
from sklearn.preprocessing import StandardScaler

In [0]:
scaler_3 = StandardScaler()

In [0]:
scaler_3.fit(Smarket.drop(columns='Direction', axis = 1).astype(float))

In [0]:
scaled_features_3 = scaler_3.transform(Smarket.drop(columns='Direction', axis = 1).astype(float))

In [0]:
df_3 = pd.DataFrame(scaled_features_3, columns = Smarket.columns[:-1] )

In [0]:
df_3.head()

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features_3,Smarket['Direction'],
                                                    test_size=0.30)

In [0]:
from sklearn.neighbors import KNeighborsClassifier

In [0]:
knn_s_3 = KNeighborsClassifier(n_neighbors=3)

In [0]:
knn_s_3.fit(X_train, y_train)

In [0]:
knn_s_3_pred = knn_s_3.predict(X_test)

In [0]:
from sklearn.metrics import classification_report, confusion_matrix

In [0]:
print(confusion_matrix(y_test, knn_s_3_pred))

In [0]:
print(classification_report(y_test, knn_s_3_pred))

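**Recorded result: a precision of about 85% for the models with standardisation, as opposed to 47%-48% for the models
without standardisation.** *Caveat: a gap of this size should not be attributed to standardisation alone. The comparison
is only meaningful when both sets of models use the same predictors and the same train/test split; adding `Today` (whose
sign defines `Direction`) to the predictors is enough to produce such a jump by itself. Re-check these figures whenever
the cells above are re-run.*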