In [1]:
import pandas as pd 
import matplotlib.pyplot as  plt
import numpy as np
from sklearn import preprocessing

# Load the dataset from disk; features and labels are separated in a later cell.
# NOTE(review): this cell was originally described as "20 descriptors and label
# is the last column" -- confirm that this still holds for this file.
data_file = "games_clean.csv"
df = pd.read_csv(data_file)

In [2]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Presence,Memory,ReleaseDate,OriginalCost,DiscountedCost,Achievements,Storage,RatingsBreakdown-Recommended,RatingsBreakdown-Meh,RatingsBreakdown-Exceptional,...,Web Publishing,Well-Written,Werewolves,Western,Word Game,World War I,World War II,Wrestling,Zombies,eSports
0,333.0,512.0,2019.0,1.99,0.0,0.0,200.0,2.0,2.0,2.0,...,False,False,False,False,False,False,False,False,False,False
1,363.0,1000.0,2016.0,2.99,0.0,0.0,350.0,1.0,1.0,2.0,...,False,False,False,False,False,False,False,False,False,False
2,102.0,1024.0,2019.0,0.99,0.0,0.0,122.0,1.0,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False
3,269.0,4000.0,2019.0,0.0,0.0,0.0,300.0,2.0,2.0,1.0,...,False,False,False,False,False,False,False,False,False,False
4,217.0,4000.0,2016.0,11.99,0.0,18.0,4000.0,1.0,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False


In [3]:
# Let's separate Data from labels.
#
# NOTE(review): the recorded run of this cell failed with
# KeyError "['21'] not found in axis", i.e. the loaded file has no column named
# '21'. Set LABEL_COLUMN to the real target column of the dataset.
LABEL_COLUMN = '21'

# Fail early with an actionable message instead of the generic drop() error.
if LABEL_COLUMN not in df.columns:
    raise KeyError(
        "Label column {!r} not found in the dataframe; "
        "set LABEL_COLUMN to the name of the target column.".format(LABEL_COLUMN)
    )

# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
X = df.drop(columns=[LABEL_COLUMN]).to_numpy()      # Data
y = df[LABEL_COLUMN].to_numpy().astype(int)         # Labels

print(X.shape)
print(y.shape)

KeyError: "['21'] not found in axis"

## 1- Impact of normalization in KNN

In [None]:
# Notice that only two last features are good to separate data.
# Let's find the 10-fold cross-validation with all columns
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import sklearn.neighbors as nb

# random_state only has an effect when shuffle=True; recent scikit-learn releases
# raise ValueError when random_state is set while shuffle is left at False.
# Shuffling with a fixed seed keeps the folds reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# Default KNN (no tuning) evaluated on the raw, unscaled columns.
cv_scores = cross_val_score(nb.KNeighborsClassifier(), X=X, y=y, cv=cv, scoring='accuracy')
np.mean(cv_scores)

In [None]:
# Poor score. What could be happening?
# Remember that KNN needs all columns to be in the same range!
# Could it be that each column lives in a different range?

# Print spread and extremes of the first 20 columns of the raw data.
for col in range(20):
    feature = X[:, col]
    print('std:', feature.std(), 'min', feature.min(), 'max', feature.max())

In [None]:
# Let's solve that
from sklearn import preprocessing

# Standardize every column to mean 0 and standard deviation 1.
scaler = preprocessing.StandardScaler()
scaler.fit(X)
X2 = scaler.transform(X)

# Same per-column summary as before, now on the standardized data.
for col in range(20):
    scaled = X2[:, col]
    print('std:', scaled.std(), 'min', scaled.min(), 'max', scaled.max())

In [None]:
# Repeat the 10-fold cross-validation, this time on the standardized data X2.
knn = nb.KNeighborsClassifier()
cv_scores = cross_val_score(knn, X=X2, y=y, cv=cv, scoring='accuracy')
cv_scores.mean()


## 2- Effect of irrelevant columns

In [None]:
# Wrap the standardized data in a pandas dataframe so columns can be addressed by name.
# Features are named '1'..'20'; the labels go in column '21'.

columns = list(map(str, range(1, 21)))
df = pd.DataFrame(X2, columns=columns)
df['21'] = y
df.head()

In [None]:
# Histogram of column '1', drawn separately for the rows of each label (0, then 1).

for label in (0, 1):
    df.loc[df['21'] == label, '1'].plot.hist(bins=10)
plt.show()

In [None]:
# It seems that feature 1 does not help to separate data according to labels.
# Let's see the same for all the features

# Create the 5x4 grid in one call and draw on explicit axes. The previous
# plt.subplots(figsize=...) call also created a full-figure Axes that the
# plt.subplot(5, 4, ...) calls then overlapped; Matplotlib versions that no
# longer auto-remove overlapping axes leave it visible behind the grid.
fig, axes = plt.subplots(5, 4, figsize=(10, 10))
fig.subplots_adjust(hspace=0.27, wspace=0.5)
for i, ax in enumerate(axes.flat):
    feature = str(i + 1)   # feature columns are named '1'..'20'
    # One histogram per label value, overlaid on the same axes.
    df[df['21'] == 0][feature].plot.hist(bins=10, ax=ax)
    df[df['21'] == 1][feature].plot.hist(bins=10, ax=ax)
plt.show()

In [None]:
# Cross-validated accuracy of the default KNN on all standardized columns (X2),
# shown again as the reference for the column-subset experiment that follows.
cv_scores = cross_val_score(
    nb.KNeighborsClassifier(),
    X=X2,
    y=y,
    cv=cv,
    scoring='accuracy',
)
np.mean(cv_scores)


In [None]:
# Let's find the 10-fold cross-validation with only 2 last columns.
# Slice from the end (-2:) so that exactly the last two columns are taken whatever
# the total number of columns is; 18: only does that when there are exactly 20.

cv_scores = cross_val_score(nb.KNeighborsClassifier(), X=X2[:, -2:], y=y, cv=cv, scoring='accuracy')
np.mean(cv_scores)

In [None]:
# Improvement in accuracy
# Unfortunately, we don't know beforehand which features are relevant.

# Sklearn to the rescue!

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Select the k best features according to a given measure. Fit on the whole data set
# and keep only the selected columns. The standardized data X2 is used (not the raw X)
# so that KNN receives scaled features, consistent with the k-sweep in the next cell.
X_new = SelectKBest(mutual_info_classif, k=2).fit_transform(X2, y)

# Let's do now the 10-fold cross-validation again
cv_scores = cross_val_score(nb.KNeighborsClassifier(), X=X_new, y=y, cv=cv, scoring='accuracy')
np.mean(cv_scores)

In [None]:
# Cross-validated accuracy as a function of the number of selected features k (1..20).
original = np.zeros(20)
for i in range(20):
    # Keep the (i+1) highest-scoring features of the standardized data.
    X_new = SelectKBest(mutual_info_classif, k=i+1).fit_transform(X2, y)
    cv_scores = cross_val_score(nb.KNeighborsClassifier(), X=X_new, y=y, cv=cv, scoring='accuracy')
    original[i] = np.mean(cv_scores)

# The curve is plotted at k = 1..20, so the ticks must cover 1..20 as well
# (np.arange(0, 20) labelled 0..19 and left the last point without a tick).
plt.xticks(np.arange(1, 21, step=1))
plt.plot(range(1, 21), original)
plt.xlabel('Number of selected features (k)')
plt.ylabel('Accuracy')
plt.show()

## 3- Let's find best parameters

In [None]:
# Select the two best features from the standardized data X2 (not the raw X):
# KNN is distance based, and the feature-count sweep above also selects from X2.
X_new = SelectKBest(mutual_info_classif, k=2).fit_transform(X2, y)

# Odd neighbour counts 1, 3, ..., 29.
k_values = range(1, 30, 2)

# One accuracy-vs-k curve per weighting scheme ('uniform' is the KNN default,
# i.e. what KNeighborsClassifier uses when no weights argument is given).
for weights, line_style, label in (('uniform', 'b', 'No weighting'),
                                   ('distance', 'r', 'Weighting')):
    lr = []
    for ki in k_values:
        knn = nb.KNeighborsClassifier(n_neighbors=ki, weights=weights)
        cv_scores = cross_val_score(knn, X=X_new, y=y, cv=10)
        lr.append(np.mean(cv_scores))
    plt.plot(k_values, lr, line_style, label=label)

plt.xlabel('k')
plt.ylabel('Accuracy')
plt.legend(loc='upper right')
plt.grid()
plt.tight_layout()

plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over odd k in 1..29 and both weighting schemes.
params = {
    'n_neighbors': list(range(1, 30, 2)),
    'weights': ('distance', 'uniform'),
}
knc = nb.KNeighborsClassifier()
# The `cv` splitter object is passed here; per the original note, an integer cv
# would also default to stratified folds.
clf = GridSearchCV(knc, param_grid=params, cv=cv, n_jobs=-1).fit(X_new, y)
print("Best Params=",clf.best_params_, "Accuracy=", clf.best_score_)

## 4- What about PCA for dimensionality reduction?

In [None]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the standardized data, then project that same data.
pca = PCA(n_components=2).fit(X2)
X_pca = pca.transform(X2)

In [None]:
# KNN grid search (odd k in 1..29, both weighting schemes) on the 2-D PCA projection.
params = dict(n_neighbors=list(range(1, 30, 2)), weights=('distance', 'uniform'))
knc = nb.KNeighborsClassifier()
clf = GridSearchCV(estimator=knc, param_grid=params, cv=cv, n_jobs=-1)
clf.fit(X_pca, y)
print("Best Params=",clf.best_params_, "Accuracy=", clf.best_score_)

In [None]:
# Not very good because PCA does not consider labels in the reduction. Let's try LDA,
# which is similar to PCA but takes the labels into account.

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA can produce at most min(n_features, n_classes - 1) components; asking for 2 with
# a two-class label raises ValueError in current scikit-learn. Cap the request instead.
n_classes = np.unique(y).size
n_lda_components = min(2, n_classes - 1, X2.shape[1])

lda = LinearDiscriminantAnalysis(n_components=n_lda_components)
X_lda = lda.fit(X2, y).transform(X2)

# KNN grid search (odd k in 1..29, both weighting schemes) on the LDA projection.
params = {'n_neighbors': list(range(1, 30, 2)), 'weights': ('distance', 'uniform')}
knc = nb.KNeighborsClassifier()
clf = GridSearchCV(knc, param_grid=params, cv=cv, n_jobs=-1)  # If cv is an integer, folds are stratified by default
clf.fit(X_lda, y)
print("Best Params=",clf.best_params_, "Accuracy=", clf.best_score_)