In [None]:
from copy import deepcopy

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, normalize

# Import danych i preprocessing

In [None]:
# header=None keeps the first record as data instead of consuming it as column names.
# NOTE(review): assumes data/car.data is the unmodified UCI Car Evaluation file,
# which ships without a header row — confirm against the local copy.
cars = pd.read_csv("data/car.data", header=None).to_numpy()
# One encoder instance, re-fitted with each column's vocabulary in the following cells.
le = preprocessing.LabelEncoder()

In [None]:
# Columns 0 (buying) and 1 (maintenance) share one vocabulary, so a single fit
# serves both. Note: LabelEncoder assigns codes by sorted label, not by the
# order listed here.
le.fit(["low", "med", "high", "vhigh"])
for col in (0, 1):
    cars[:, col] = le.transform(cars[:, col])

In [None]:
# Column 2: doors. fit() returns the encoder itself, so the calls can be chained.
cars[:, 2] = le.fit(["2", "3", "4", "5more"]).transform(cars[:, 2])

In [None]:
# Column 3: persons.
cars[:, 3] = le.fit(["2", "4", "more"]).transform(cars[:, 3])

In [None]:
# Column 4: lug_boot.
cars[:, 4] = le.fit(["small", "med", "big"]).transform(cars[:, 4])

In [None]:
# Column 5: safety.
cars[:, 5] = le.fit(["low", "med", "high"]).transform(cars[:, 5])

# Klasyfikacja bez optymalizacji hiperparametrów

In [None]:
# Every column except the last is a feature; the last column is the class label.
cars_X = cars[:, :-1]
cars_y = cars[:, -1]

# Hold out 20% for testing. The fixed seed makes the split — and every score
# derived from it — reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    cars_X, cars_y, test_size=0.20, random_state=0
)

In [None]:
# Baseline: random forest with library-default hyperparameters and a fixed seed.
# fit() returns the estimator, so construction and training are chained.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the baseline on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Overall accuracy of the baseline; shown as the cell's output value.
accuracy_score(y_true=y_test, y_pred=y_pred)

# Klasyfikacja z optymalizacją hiperparametrów i walidacją krzyżową (wbudowaną w GridSearchCV)

## Na oryginalnych danych

In [None]:
# Hyperparameter grid searched exhaustively: 5 * 3 * 4 = 60 candidates.
grid_params = {
    'max_depth': [2, 5, 10, 20, 100],
    'max_leaf_nodes': [2, 3, None],
    'min_samples_leaf': [1, 2, 4, 6],
}

# cv=5 -> 5-fold cross-validation for every candidate.
# The forest is seeded (same seed as the baseline classifier) so that repeated
# searches select the same parameters and report the same scores.
gcv = GridSearchCV(
    RandomForestClassifier(random_state=0),
    grid_params,
    cv=5,
    scoring='accuracy',
)

In [None]:
# Grid search on the raw (label-encoded) features, evaluated on the held-out split.
gcv.fit(X_train, y_train)
y_pred = gcv.predict(X_test)

# classification_report / accuracy_score are the names imported in the
# notebook's first cell, so this cell runs on a fresh kernel.
print(classification_report(y_test, y_pred))
print("Najlepsze parametry:", gcv.best_params_)
print("ACC = ", accuracy_score(y_test, y_pred))

In [None]:
# plot_confusion_matrix was removed from scikit-learn in 1.2;
# ConfusionMatrixDisplay.from_estimator (available since 1.0) is its replacement.
# `metrics` comes from the import cell at the top of the notebook.
# The trailing ';' suppresses the display object's repr in the cell output.
metrics.ConfusionMatrixDisplay.from_estimator(gcv.best_estimator_, X_test, y_test);

## Na danych zestandaryzowanych 

In [None]:
def standarize_columns(columns, data):
    """Standardize the selected columns of a 2-D array in place.

    Each listed column is replaced by the output of a fresh ``StandardScaler``
    fitted on that column of ``data`` alone.

    Parameters
    ----------
    columns : iterable of int
        Indices of the columns to standardize.
    data : array
        Array indexed as ``data[:, i]``; it is modified in place.

    Returns
    -------
    array
        The same ``data`` object that was passed in.
    """
    for col in columns:
        # A width-1 slice (rather than a plain index) keeps the column two-dimensional.
        column = slice(col, col + 1)
        scaler = StandardScaler()
        data[:, column] = scaler.fit_transform(data[:, column])
    return data

In [None]:
STD_COLUMNS = [2, 3]  # doors, persons

X_train_std = standarize_columns(STD_COLUMNS, deepcopy(X_train))

# Scale the test split with the mean/variance learned on the training split.
# Fitting a separate scaler on the test rows would put the two splits on
# different scales and let test-set statistics influence the evaluation.
X_test_std = deepcopy(X_test)
for index in STD_COLUMNS:
    train_scaler = StandardScaler().fit(X_train[:, index:index+1])
    X_test_std[:, index:index+1] = train_scaler.transform(X_test[:, index:index+1])

# Labels are unaffected by feature scaling; aliases kept for symmetry.
y_train_std = y_train
y_test_std = y_test

In [None]:
# Grid search on the standardized features, evaluated on the standardized test split.
gcv.fit(X_train_std, y_train_std)
y_pred_std = gcv.predict(X_test_std)

print(metrics.classification_report(y_test_std, y_pred_std))
print("Najlepsze parametry:", gcv.best_params_)
print("ACC = ", metrics.accuracy_score(y_test_std, y_pred_std))
# plot_confusion_matrix was removed from scikit-learn in 1.2;
# ConfusionMatrixDisplay.from_estimator (available since 1.0) replaces it.
metrics.ConfusionMatrixDisplay.from_estimator(gcv.best_estimator_, X_test_std, y_test_std);

## Po normalizacji danych

In [None]:
def normalize_columns(columns, data):
    """Normalize the selected columns of a 2-D array in place.

    Each listed column is replaced by the result of
    ``sklearn.preprocessing.normalize(column, axis=0)`` applied to that column
    of ``data`` alone.

    Parameters
    ----------
    columns : iterable of int
        Indices of the columns to normalize.
    data : array
        Array indexed as ``data[:, i]``; it is modified in place.

    Returns
    -------
    array
        The same ``data`` object that was passed in.
    """
    for col in columns:
        # A width-1 slice (rather than a plain index) keeps the column two-dimensional.
        column = slice(col, col + 1)
        data[:, column] = normalize(data[:, column], axis=0)
    return data

In [None]:
NORM_COLUMNS = [2, 3]  # doors, persons

# NOTE: the variable names are the ones this cell has always assigned; they
# replace the standardized arrays created in the previous section.
X_train_std = normalize_columns(NORM_COLUMNS, deepcopy(X_train))

# Divide the test columns by the *training* column norms. A column's L2 norm
# grows with the number of rows, so normalizing the (smaller) test split by its
# own norm would put it on a different scale than the data the model was fit on.
X_test_std = deepcopy(X_test)
for index in NORM_COLUMNS:
    train_norm = np.linalg.norm(X_train[:, index].astype(float))
    if train_norm == 0:
        train_norm = 1.0  # all-zero training column: leave the values unchanged
    X_test_std[:, index] = X_test[:, index].astype(float) / train_norm

# Labels are unaffected by feature scaling; aliases kept for symmetry.
y_train_std = y_train
y_test_std = y_test

In [None]:
# Fit and evaluate on the normalized arrays from the previous cell (stored under
# the *_std names). Using the raw X_train / X_test here would only repeat the
# first grid-search experiment.
gcv.fit(X_train_std, y_train_std)
y_pred_norm = gcv.predict(X_test_std)

print(metrics.classification_report(y_test_std, y_pred_norm))
print("Najlepsze parametry:", gcv.best_params_)
print("ACC = ", metrics.accuracy_score(y_test_std, y_pred_norm))
# plot_confusion_matrix was removed from scikit-learn in 1.2;
# ConfusionMatrixDisplay.from_estimator (available since 1.0) replaces it.
metrics.ConfusionMatrixDisplay.from_estimator(gcv.best_estimator_, X_test_std, y_test_std);

## Na danych po redukcji wymiarowości (n_components = 5)

In [None]:
# Reduce the feature matrix to 5 principal components.
pca = PCA(n_components=5)

# The projection is fitted on the training split only and then applied
# unchanged to the test split.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [None]:
# Grid search on the PCA-projected features; fit() returns the search object,
# so prediction is chained directly onto it.
y_pred_pca = gcv.fit(X_train_pca, y_train).predict(X_test_pca)

print("Najlepsze parametry:", gcv.best_params_)
print("ACC = ", metrics.accuracy_score(y_true=y_test, y_pred=y_pred_pca))
print(classification_report(y_true=y_test, y_pred=y_pred_pca))

In [None]:
# The reference labels must be y_test: comparing the estimator against its own
# predictions would always produce a purely diagonal matrix.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed from scikit-learn in 1.2.
metrics.ConfusionMatrixDisplay.from_estimator(gcv.best_estimator_, X_test_pca, y_test);

## Na danych po redukcji wymiarowości (n_components = 4)

In [None]:
# Reduce the feature matrix to 4 principal components.
pca = PCA(n_components=4)

# The projection is fitted on the training split only and then applied
# unchanged to the test split.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [None]:
# Grid search on the 4-component PCA projection; fit() returns the search
# object, so prediction is chained directly onto it.
y_pred_pca = gcv.fit(X_train_pca, y_train).predict(X_test_pca)

print("Najlepsze parametry:", gcv.best_params_)
print("ACC = ", metrics.accuracy_score(y_true=y_test, y_pred=y_pred_pca))
print(classification_report(y_true=y_test, y_pred=y_pred_pca))

In [None]:
# The reference labels must be y_test: comparing the estimator against its own
# predictions would always produce a purely diagonal matrix.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed from scikit-learn in 1.2.
metrics.ConfusionMatrixDisplay.from_estimator(gcv.best_estimator_, X_test_pca, y_test);