In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import time

In [2]:
# Read the letter-recognition dataset and preview its first two rows.
df = pd.read_csv("letter-recognition.csv")
df.head(2)

Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10


In [7]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(20000, 17)

In [None]:
# Bar chart of how many samples each letter has, in alphabetical order.
df["letter"].value_counts().sort_index().plot.bar();

In [None]:
# Min-max scale every feature column to [0, 1]; the label column is set aside
# first and re-attached afterwards so it is not part of the arithmetic.
# NOTE(review): the min/max are taken over the whole frame, i.e. before any
# train/test split — confirm this is intended.
letter = df.letter
features = df.drop("letter", axis=1)

col_min = features.min()
col_range = features.max() - col_min
df = (features - col_min) / col_range

df["letter"] = letter

In [None]:
# Per-column summary statistics of df.
df.describe()

In [None]:
# Bar plot of x2ybar per letter, bars laid out in alphabetical order.
letter_order = sorted(df["letter"].unique())

plt.figure(figsize=(16, 8))
sns.barplot(data=df, x="letter", y="x2ybar", order=letter_order);

In [None]:
# Bar plot of xy2bar per letter, bars laid out in alphabetical order.
letter_order = sorted(df["letter"].unique())

plt.figure(figsize=(16, 8))
sns.barplot(data=df, x="letter", y="xy2bar", order=letter_order);

In [None]:
# Seed NumPy's global RNG (kept so any later code that relies on it is unaffected).
np.random.seed(100)

# Separate the feature columns from the target label.
X = df.drop("letter", axis=1)
y = df["letter"]

# Hold out 20% of the rows for testing. Passing random_state pins the shuffle
# to this call itself, so the split no longer depends on the global RNG state
# at the moment the cell happens to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)

In [None]:
# Baseline classifiers keyed by the display name used in later comparisons.
# 5 neighbours is the default value; it is written out for clarity in later steps.
knn_baseline = KNeighborsClassifier(n_neighbors=5)
mlp_baseline = MLPClassifier()

models = {
    "KNN": knn_baseline,
    "Sieć neuronowa": mlp_baseline,
}

def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit each model on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an object exposing ``fit(X, y)`` and
        ``score(X, y)``.
    X_train, y_train
        Passed to each model's ``fit``.
    X_test, y_test
        Passed to each model's ``score``.

    Returns
    -------
    dict
        Display name -> value returned by that model's ``score`` call,
        in the same order as ``models``.
    """
    # Reset NumPy's global RNG so every call starts from the same state.
    np.random.seed(100)

    scores = {}
    for label in models:
        estimator = models[label]
        estimator.fit(X_train, y_train)
        scores[label] = estimator.score(X_test, y_test)
    return scores

In [None]:
# Fit both baseline models and collect their test-split scores by name.
model_scores = fit_and_score(
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
model_scores

In [None]:
# One-row frame of scores (one column per model), transposed for plotting so
# that each model gets its own bar.
model_compare = pd.DataFrame(data=model_scores, index=['accuracy'])
model_compare.transpose().plot(kind="bar");

In [None]:
# Candidate values of k: 2 through 20 inclusive.
neighbors = range(2, 21)

knn = KNeighborsClassifier()

# Refit the same estimator for each k and record its score on the test split.
# NOTE(review): k is being compared on X_test / y_test, so the test split is
# taking part in model selection — consider cross-validating on the training
# split instead.
test_scores = [
    knn.set_params(n_neighbors=k).fit(X_train, y_train).score(X_test, y_test)
    for k in neighbors
]

In [None]:
# Best score seen across the k sweep.
best_score = max(test_scores)

# Score as a function of k, with one x tick per integer from 1 to 20.
plt.plot(neighbors, test_scores)
plt.xticks(np.arange(1, 21))
plt.xlabel("Liczba sąsiadów")
plt.ylabel("Wynik Modelu")

print(f"Maksymalny wynik KNN: {best_score*100:.2f}%")

In [None]:
np.random.seed(100)

# Candidate KNeighborsClassifier settings, with k fixed at 3.
# leaf_size is only passed on to the tree-based algorithms (BallTree / KDTree),
# so it is varied for those alone; crossing it with "brute" would only refit
# candidates that differ in an unused parameter.
knn_grid = [
    {"n_neighbors": [3],
     "weights": ["uniform", "distance"],
     "algorithm": ["ball_tree", "kd_tree"],
     "leaf_size": [15, 20, 30, 50, 100]},
    {"n_neighbors": [3],
     "weights": ["uniform", "distance"],
     "algorithm": ["brute"]},
]

# n_jobs=-1 means "use all processors". It is a runtime setting rather than a
# hyperparameter to tune, so it is set on the estimator instead of in the grid.
gs_knn = GridSearchCV(KNeighborsClassifier(n_jobs=-1),
                      param_grid=knn_grid,
                      verbose=True)

# Train with every parameter combination.
gs_knn.fit(X_train, y_train);

In [None]:
# Parameter combination the KNN grid search selected as best.
gs_knn.best_params_

In [None]:
# Score of the fitted KNN grid search on the test split.
gs_knn.score(X_test, y_test)

In [None]:
np.random.seed(100)

# Candidate MLPClassifier settings; only max_iter takes more than one value.
# NOTE(review): scikit-learn documents learning_rate as used only when
# solver="sgd" — confirm whether it has any effect alongside "adam".
net_grid = dict(
    solver=["adam"],
    learning_rate=["adaptive"],
    max_iter=[5000, 10000],
)

gs_net = GridSearchCV(estimator=MLPClassifier(),
                      param_grid=net_grid,
                      verbose=True)

# Train with every parameter combination.
gs_net.fit(X_train, y_train);

In [None]:
# Parameter combination the MLP grid search selected as best.
gs_net.best_params_

In [None]:
# Score of the fitted MLP grid search on the test split.
gs_net.score(X_test, y_test)

In [None]:
# Predict the test split with the tuned KNN search and print per-class metrics.
y_preds = gs_knn.predict(X_test)
report = classification_report(y_test, y_preds)
print(report)

In [None]:
# Bar chart of four headline metrics for the predictions in y_preds.
# Precision, recall and F1 use the "weighted" average across classes.
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
metrics = ['Accuracy', 'Precision', 'Recall', 'F1Score']
values = [accuracy_score(y_test, y_preds),
          precision_score(y_test, y_preds, average="weighted"),
          recall_score(y_test, y_preds, average="weighted"),
          f1_score(y_test, y_preds, average="weighted")]
ax.bar(metrics, values)

# Zoom the y-axis onto the observed scores so small differences stay visible.
# The limits are derived from the values themselves rather than hard-coded,
# so the bars remain in view whatever scores a given run produces.
lowest, highest = min(values), max(values)
spread = highest - lowest
margin = spread * 0.25 if spread > 0 else 0.005
plt.ylim(max(0.0, lowest - margin), min(1.0, highest + margin))
plt.show()