In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.utils import to_categorical
import keras_tuner as kt
%matplotlib inline

In [2]:
# Load the prepared match-statistics table and preview ten randomly chosen rows.
df = pd.read_csv(filepath_or_buffer='data_to_ml.csv')
df.sample(n=10)

Unnamed: 0,Faction,Class,KB,D,HK,DD,HD,Honor,Win,Lose,Rol,BE,Class Type,Pets,Armor Type
1941,Alliance,Death Knight,7,7,35,125000,36936,580,1,0,dps,0,Melee,1,Plate
2334,Horde,Shaman,1,4,34,14851,92428,351,0,1,heal,1,Both,0,Mail
2004,Horde,Mage,2,4,42,102000,16791,271,0,1,dps,0,Ranged,0,Cloth
2884,Horde,Death Knight,3,5,32,84194,34559,555,1,0,dps,0,Melee,1,Plate
3820,Alliance,Demon Hunter,1,0,31,89847,6296,781,1,0,dps,0,Melee,0,Leather
870,Horde,Mage,0,5,34,25932,2574,500,1,0,dps,0,Ranged,0,Cloth
1808,Horde,Death Knight,4,5,26,90874,34388,513,1,0,dps,0,Melee,1,Plate
4878,Alliance,Demon Hunter,4,2,25,85229,17286,294,0,1,dps,0,Melee,0,Leather
2194,Horde,Rogue,2,1,11,14925,946,413,1,0,dps,1,Melee,0,Leather
4158,Horde,Death Knight,1,3,33,32887,24753,520,1,0,dps,0,Melee,1,Plate


In [3]:
# Separate the target ('Faction') from the features.
# 'Unnamed: N' is the name pandas assigns to a header-less CSV column; here it is
# the saved row index (see the preview of df). It is a row identifier, not a match
# statistic, so it is removed instead of being fed to the models as a feature.
index_artifact_columns = [col for col in df.columns if str(col).startswith('Unnamed:')]
X = df.drop(columns=['Faction', *index_artifact_columns])
y = df['Faction']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the categorical features. The encoder is fitted on the training
# split only; with handle_unknown='ignore' a category unseen during fit is encoded
# as all zeros at transform time instead of raising.
categorical_columns = ['Class', 'Rol', 'Class Type', 'Armor Type']
encoder = OneHotEncoder(handle_unknown='ignore')

X_train_encoded = encoder.fit_transform(X_train[categorical_columns])
X_test_encoded = encoder.transform(X_test[categorical_columns])

encoded_feature_names = encoder.get_feature_names_out(categorical_columns)
X_train_encoded_df = pd.DataFrame(X_train_encoded.toarray(), columns=encoded_feature_names)
X_test_encoded_df = pd.DataFrame(X_test_encoded.toarray(), columns=encoded_feature_names)

# Replace the raw categorical columns with their encoded counterparts.
# drop() returns a new frame (no in-place mutation of the split), and the index is
# reset so that concat lines the rows up with the freshly built encoded frames.
X_train = pd.concat(
    [X_train.drop(columns=categorical_columns).reset_index(drop=True), X_train_encoded_df],
    axis=1,
)
X_test = pd.concat(
    [X_test.drop(columns=categorical_columns).reset_index(drop=True), X_test_encoded_df],
    axis=1,
)

# A misaligned concat would silently add rows; fail loudly instead.
assert len(X_train) == len(y_train), "train features/labels row count mismatch"
assert len(X_test) == len(y_test), "test features/labels row count mismatch"

# Integer-encode the faction labels for the scikit-learn models ...
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# ... and one-hot encode them for the Keras model trained later.
y_train_categorical = to_categorical(y_train_encoded)
y_test_categorical = to_categorical(y_test_encoded)

In [4]:
# Logistic regression on min-max scaled features, tuned with a 10-fold grid search.
# The scaler lives inside the pipeline, so it is re-fitted on the training part of
# every CV fold. random_state pins the data shuffling of the stochastic 'sag' and
# 'saga' solvers (same seed as train_test_split) so the selected hyper-parameters
# are reproducible after a kernel restart.
pipeline_minmax = make_pipeline(MinMaxScaler(), LogisticRegression(random_state=42))
param_grid_minmax = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__penalty': ['l2'],
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    # max_iter is only an upper bound on the number of solver iterations.
    'logisticregression__max_iter': [1000, 2000, 3000],
    'logisticregression__class_weight': [None, 'balanced'],
}
grid_search_minmax = GridSearchCV(pipeline_minmax, param_grid_minmax, cv=10, n_jobs=-1, verbose=1)

# Fit on the training split; predict() uses the refitted best estimator.
grid_search_minmax.fit(X_train, y_train_encoded)
y_pred_minmax = grid_search_minmax.predict(X_test)
y_train_pred_minmax = grid_search_minmax.predict(X_train)
accuracy_minmax = accuracy_score(y_test_encoded, y_pred_minmax)

# Map the integer classes back to faction names so the reports are readable.
y_test_labels = label_encoder.inverse_transform(y_test_encoded)
y_pred_labels = label_encoder.inverse_transform(y_pred_minmax)
y_train_labels = label_encoder.inverse_transform(y_train_encoded)
y_train_pred_labels = label_encoder.inverse_transform(y_train_pred_minmax)

# "accuracy" is measured on the held-out test split, "best score" is the mean
# cross-validated score of the winning parameter set.
print("MinMaxScaler best params:", grid_search_minmax.best_params_)
print("MinMaxScaler accuracy:", accuracy_minmax)
print("MinMaxScaler best score:", grid_search_minmax.best_score_)
print("Classification Report for Test Data")
print(classification_report(y_test_labels, y_pred_labels))
print("Classification Report for Training Data")
print(classification_report(y_train_labels, y_train_pred_labels))

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
MinMaxScaler best params: {'logisticregression__C': 100, 'logisticregression__class_weight': None, 'logisticregression__max_iter': 100, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'sag'}
MinMaxScaler accuracy: 0.8022284122562674
MinMaxScaler best score: 0.8044714833000594
Classification Report for Test Data
              precision    recall  f1-score   support

    Alliance       0.80      0.79      0.80       526
       Horde       0.81      0.81      0.81       551

    accuracy                           0.80      1077
   macro avg       0.80      0.80      0.80      1077
weighted avg       0.80      0.80      0.80      1077

Classification Report for Training Data
              precision    recall  f1-score   support

    Alliance       0.81      0.81      0.81      2153
       Horde       0.81      0.81      0.81      2153

    accuracy                           0.81      4306
   macro avg      

In [5]:
# Logistic regression on standardised features, tuned with a 10-fold grid search.
# The scaler lives inside the pipeline, so it is re-fitted on the training part of
# every CV fold. random_state pins the data shuffling of the stochastic 'sag' and
# 'saga' solvers (same seed as train_test_split) so the selected hyper-parameters
# are reproducible after a kernel restart.
pipeline_standard = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
param_grid_standard = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__penalty': ['l2'],
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    # max_iter is only an upper bound on the number of solver iterations.
    'logisticregression__max_iter': [1000, 2000, 3000],
    'logisticregression__class_weight': [None, 'balanced'],
}
grid_search_standard = GridSearchCV(pipeline_standard, param_grid_standard, cv=10, n_jobs=-1, verbose=1)

# Fit on the training split; predict() uses the refitted best estimator.
grid_search_standard.fit(X_train, y_train_encoded)
y_pred_standard = grid_search_standard.predict(X_test)
y_train_pred_standard = grid_search_standard.predict(X_train)
accuracy_standard = accuracy_score(y_test_encoded, y_pred_standard)

# Map the integer classes back to faction names so the reports are readable.
y_test_labels = label_encoder.inverse_transform(y_test_encoded)
y_pred_labels = label_encoder.inverse_transform(y_pred_standard)
y_train_labels = label_encoder.inverse_transform(y_train_encoded)
y_train_pred_labels = label_encoder.inverse_transform(y_train_pred_standard)

# "accuracy" is measured on the held-out test split, "best score" is the mean
# cross-validated score of the winning parameter set.
print("StandardScaler best params:", grid_search_standard.best_params_)
print("StandardScaler accuracy:", accuracy_standard)
print("StandardScaler best score:", grid_search_standard.best_score_)
print("Classification Report for Test Data")
print(classification_report(y_test_labels, y_pred_labels))
print("Classification Report for Training Data")
print(classification_report(y_train_labels, y_train_pred_labels))

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
StandardScaler best params: {'logisticregression__C': 10, 'logisticregression__class_weight': None, 'logisticregression__max_iter': 100, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'newton-cg'}
StandardScaler accuracy: 0.8031569173630455
StandardScaler best score: 0.8044714833000594
Classification Report for Test Data
              precision    recall  f1-score   support

    Alliance       0.80      0.79      0.80       526
       Horde       0.81      0.81      0.81       551

    accuracy                           0.80      1077
   macro avg       0.80      0.80      0.80      1077
weighted avg       0.80      0.80      0.80      1077

Classification Report for Training Data
              precision    recall  f1-score   support

    Alliance       0.81      0.81      0.81      2153
       Horde       0.81      0.81      0.81      2153

    accuracy                           0.81      4306
   macr

In [6]:
# Decision tree tuned with a 10-fold grid search over depth, split/leaf sizes and
# the number of features considered per split.
param_grid_decision_tree = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 8, 64],
    'max_features': [None, 'sqrt', 'log2'],
}

# The tree permutes features at every split (and samples them for 'sqrt'/'log2'),
# so it is seeded - with the same seed as train_test_split - to make the search
# result reproducible after a kernel restart.
decision_tree = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(decision_tree, param_grid_decision_tree, cv=10, n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train_encoded)

# predict() uses the best estimator refitted on the whole training split.
y_pred_test = grid_search.predict(X_test)
y_pred_train = grid_search.predict(X_train)

# Map the integer classes back to faction names so the reports are readable.
y_test_labels = label_encoder.inverse_transform(y_test_encoded)
y_pred_test_labels = label_encoder.inverse_transform(y_pred_test)
y_train_labels = label_encoder.inverse_transform(y_train_encoded)
y_pred_train_labels = label_encoder.inverse_transform(y_pred_train)

accuracy = accuracy_score(y_test_labels, y_pred_test_labels)

# "accuracy" is measured on the held-out test split, "best score" is the mean
# cross-validated score of the winning parameter set.
print("DecisionTreeClassifier najlepsze parametry:", grid_search.best_params_)
print("DecisionTreeClassifier accuracy:", accuracy)
print("DecisionTreeClassifier best score:", grid_search.best_score_)
print("Classification Report for Test Data")
print(classification_report(y_test_labels, y_pred_test_labels))
print("Classification Report for Training Data")
print(classification_report(y_train_labels, y_pred_train_labels))

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
DecisionTreeClassifier najlepsze parametry: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
DecisionTreeClassifier accuracy: 0.8458681522748375
DecisionTreeClassifier best score: 0.8462639615820429
Classification Report for Test Data
              precision    recall  f1-score   support

    Alliance       0.84      0.85      0.84       526
       Horde       0.86      0.84      0.85       551

    accuracy                           0.85      1077
   macro avg       0.85      0.85      0.85      1077
weighted avg       0.85      0.85      0.85      1077

Classification Report for Training Data
              precision    recall  f1-score   support

    Alliance       0.92      0.93      0.93      2153
       Horde       0.93      0.92      0.92      2153

    accuracy                           0.93      4306
   macro avg       0.93      0.93      0.93      4306
weighted avg       0.9

In [7]:
# Gradient boosting tuned with a 10-fold grid search over ensemble size and the
# shape of the individual trees.
param_grid_gradient_boosting = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 8, 64],
    'max_features': [None, 'sqrt', 'log2'],
}

# The individual trees permute / sample features at every split, so the estimator
# is seeded - with the same seed as train_test_split - to make the search result
# reproducible after a kernel restart.
gradient_boosting = GradientBoostingClassifier(random_state=42)

grid_search = GridSearchCV(gradient_boosting, param_grid_gradient_boosting, cv=10, n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train_encoded)

# predict() uses the best estimator refitted on the whole training split.
y_pred_test = grid_search.predict(X_test)
y_pred_train = grid_search.predict(X_train)

# Map the integer classes back to faction names so the reports are readable.
y_test_labels = label_encoder.inverse_transform(y_test_encoded)
y_pred_test_labels = label_encoder.inverse_transform(y_pred_test)
y_train_labels = label_encoder.inverse_transform(y_train_encoded)
y_pred_train_labels = label_encoder.inverse_transform(y_pred_train)

accuracy = accuracy_score(y_test_labels, y_pred_test_labels)

# "accuracy" is measured on the held-out test split, "best score" is the mean
# cross-validated score of the winning parameter set.
print("GradientBoostingClassifier best params:", grid_search.best_params_)
print("GradientBoostingClassifier accuracy:", accuracy)
print("GradientBoostingClassifier best score:", grid_search.best_score_)
print("Classification Report for Test Data")
print(classification_report(y_test_labels, y_pred_test_labels))
print("Classification Report for Training Data")
print(classification_report(y_train_labels, y_pred_train_labels))

Fitting 10 folds for each of 324 candidates, totalling 3240 fits
GradientBoostingClassifier best params: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 8, 'min_samples_split': 5, 'n_estimators': 100}
GradientBoostingClassifier accuracy: 0.8746518105849582
GradientBoostingClassifier best score: 0.8794803863378838
Classification Report for Test Data
              precision    recall  f1-score   support

    Alliance       0.89      0.85      0.87       526
       Horde       0.86      0.90      0.88       551

    accuracy                           0.87      1077
   macro avg       0.88      0.87      0.87      1077
weighted avg       0.88      0.87      0.87      1077

Classification Report for Training Data
              precision    recall  f1-score   support

    Alliance       1.00      1.00      1.00      2153
       Horde       1.00      1.00      1.00      2153

    accuracy                           1.00      4306
   macro avg       1.00      1.00      1.00      430

In [8]:
# Random forest tuned with a 10-fold grid search over ensemble size and the shape
# of the individual trees.
param_grid_random_forest = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 8, 64],
    'max_features': [None, 'sqrt', 'log2'],
}

# Bootstrapping and per-split feature sampling are random, so the forest is
# seeded - with the same seed as train_test_split - to make the search result
# reproducible after a kernel restart.
random_forest = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(random_forest, param_grid_random_forest, cv=10, n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train_encoded)

# predict() uses the best estimator refitted on the whole training split.
y_pred_test = grid_search.predict(X_test)
y_pred_train = grid_search.predict(X_train)

# Map the integer classes back to faction names so the reports are readable.
y_test_labels = label_encoder.inverse_transform(y_test_encoded)
y_pred_test_labels = label_encoder.inverse_transform(y_pred_test)
y_train_labels = label_encoder.inverse_transform(y_train_encoded)
y_pred_train_labels = label_encoder.inverse_transform(y_pred_train)

accuracy = accuracy_score(y_test_labels, y_pred_test_labels)

# "accuracy" is measured on the held-out test split, "best score" is the mean
# cross-validated score of the winning parameter set.
print("RandomForestClassifier best params:", grid_search.best_params_)
print("RandomForestClassifier accuracy:", accuracy)
print("RandomForestClassifier best score:", grid_search.best_score_)
print("Classification Report for Test Data")
print(classification_report(y_test_labels, y_pred_test_labels))
print("Classification Report for Training Data")
print(classification_report(y_train_labels, y_pred_train_labels))

Fitting 10 folds for each of 324 candidates, totalling 3240 fits
RandomForestClassifier best params: {'max_depth': 30, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
RandomForestClassifier accuracy: 0.8792943361188487
RandomForestClassifier best score: 0.8755371499487401
Classification Report for Test Data
              precision    recall  f1-score   support

    Alliance       0.90      0.84      0.87       526
       Horde       0.86      0.91      0.89       551

    accuracy                           0.88      1077
   macro avg       0.88      0.88      0.88      1077
weighted avg       0.88      0.88      0.88      1077

Classification Report for Training Data
              precision    recall  f1-score   support

    Alliance       1.00      1.00      1.00      2153
       Horde       1.00      1.00      1.00      2153

    accuracy                           1.00      4306
   macro avg       1.00      1.00      1.00      4306
weighted a

In [9]:
# Standardise the features for the neural network: the statistics are learned
# from the training split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def build_model(hp):
    """Build and compile the one-hidden-layer classifier for a single tuner trial.

    Args:
        hp: keras-tuner hyper-parameter container. Four entries are requested
            from it: 'neurons' (hidden layer width, 1-10), 'init' (hidden layer
            kernel initializer), 'dropout_rate' (0.0-0.2 in steps of 0.1) and
            'optimizer' ('adam' or 'SGD').

    Returns:
        A compiled Sequential model whose input width is the number of columns
        of the module-level X_train and whose softmax output has one unit per
        class known to the module-level label_encoder.
    """
    # Request the layer hyper-parameters up front, in a fixed order.
    hidden_units = hp.Int('neurons', min_value=1, max_value=10, step=1)
    hidden_init = hp.Choice('init', values=['glorot_uniform', 'he_normal'])
    dropout_rate = hp.Float('dropout_rate', min_value=0.0, max_value=0.2, step=0.1)

    n_features = X_train.shape[1]
    n_classes = len(label_encoder.classes_)

    network = Sequential([
        Input(shape=(n_features,)),
        Dense(units=hidden_units, kernel_initializer=hidden_init, activation='relu'),
        Dropout(rate=dropout_rate),
        Dense(n_classes, kernel_initializer='glorot_uniform', activation='softmax'),
    ])
    # Targets are one-hot encoded, hence categorical cross-entropy.
    network.compile(loss='categorical_crossentropy',
                    optimizer=hp.Choice('optimizer', values=['adam', 'SGD']),
                    metrics=['accuracy'])
    return network

# Training budget shared by the hyper-parameter search and the final fit.
# build_model only registers 'neurons', 'init', 'dropout_rate' and 'optimizer',
# so epochs and batch size are not tuned; they are fixed here and passed to BOTH
# tuner.search and model.fit so the winning configuration is retrained under the
# same conditions it was selected under.
epochs = 500
batch_size = 10

# NOTE: keras-tuner resumes from directory/project_name when it already exists
# ("Reloading Tuner from ..."). Delete that directory, or pass overwrite=True,
# after changing build_model, the data or the settings above - otherwise the
# trials of the previous run are reused.
tuner = kt.RandomSearch(build_model,
                        objective='val_accuracy',
                        max_trials=10,
                        executions_per_trial=3,
                        seed=42,  # reproducible sampling of the search space
                        directory='keras_tuner_dir',
                        project_name='my_project')

tuner.search(X_train_scaled, y_train_categorical,
             epochs=epochs, batch_size=batch_size, validation_split=0.2)

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Retrain a fresh model with the best hyper-parameters found.
model = build_model(best_hps)

history = model.fit(X_train_scaled, y_train_categorical, epochs=epochs, batch_size=batch_size, validation_split=0.2)

# Softmax outputs: one probability per class for every row.
y_pred_test_encoded = model.predict(X_test_scaled)
y_pred_train_encoded = model.predict(X_train_scaled)

# Most probable class index per row.
y_pred_test_num = np.argmax(y_pred_test_encoded, axis=1)
y_pred_train_num = np.argmax(y_pred_train_encoded, axis=1)

# Map the integer classes back to faction names so the reports are readable.
y_test_labels = label_encoder.inverse_transform(y_test_encoded)
y_pred_test_labels = label_encoder.inverse_transform(y_pred_test_num)
y_train_labels = label_encoder.inverse_transform(y_train_encoded)
y_pred_train_labels = label_encoder.inverse_transform(y_pred_train_num)

accuracy = accuracy_score(y_test_labels, y_pred_test_labels)

print("Best parameters found: ", best_hps.values)
print("Test accuracy: ", accuracy)
print("Classification Report for Test Data")
print(classification_report(y_test_labels, y_pred_test_labels))
print("Classification Report for Training Data")
print(classification_report(y_train_labels, y_pred_train_labels))

Reloading Tuner from keras_tuner_dir\my_project\tuner0.json
Epoch 1/500
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5542 - loss: 0.8086 - val_accuracy: 0.6833 - val_loss: 0.5986
Epoch 2/500
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6693 - loss: 0.6160 - val_accuracy: 0.7436 - val_loss: 0.5465
Epoch 3/500
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6972 - loss: 0.5791 - val_accuracy: 0.7680 - val_loss: 0.5083
Epoch 4/500
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7250 - loss: 0.5446 - val_accuracy: 0.7715 - val_loss: 0.4879
Epoch 5/500
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7634 - loss: 0.5002 - val_accuracy: 0.7784 - val_loss: 0.4679
Epoch 6/500
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7634 - loss: 0.4932 - va