In [37]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier
%matplotlib inline

In [38]:
df = pd.read_csv('data_to_ml.csv')
df.sample(10)

Unnamed: 0,Faction,Class,KB,D,HK,DD,HD,Honor,Win,Lose,Rol,BE,Class Type,Pets,Armor Type
1288,Horde,Paladin,1,2,30,9302,128000,367,1,0,heal,0,Both,0,Plate
2832,Horde,Rogue,1,8,16,26180,17830,218,0,1,dps,0,Melee,0,Leather
5379,Alliance,Mage,1,4,12,26117,11492,291,0,1,dps,1,Ranged,0,Cloth
433,Horde,Druid,0,0,20,743,64975,359,1,0,heal,0,Both,0,Leather
891,Alliance,Warrior,2,4,27,21209,8950,253,0,1,dps,0,Melee,0,Plate
1357,Alliance,Death Knight,1,5,19,28508,71025,351,0,1,dps,0,Melee,1,Plate
1390,Horde,Paladin,3,3,29,52299,20458,426,1,0,dps,0,Both,0,Plate
1265,Horde,Mage,5,3,24,48825,1987,164,0,1,dps,0,Ranged,0,Cloth
1441,Horde,Mage,7,0,26,22797,5377,501,1,0,dps,0,Ranged,0,Cloth
3107,Horde,Shaman,0,0,36,5009,44391,394,1,0,heal,0,Both,0,Mail


In [39]:
X = df.drop('Faction', axis=1)
y = df['Faction']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_columns = ['Class', 'Rol', 'Class Type', 'Armor Type']

encoder = OneHotEncoder(handle_unknown='ignore')

X_train_encoded = encoder.fit_transform(X_train[categorical_columns])

X_test_encoded = encoder.transform(X_test[categorical_columns])

X_train_encoded_df = pd.DataFrame(X_train_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_columns))
X_test_encoded_df = pd.DataFrame(X_test_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_columns))

X_train.drop(categorical_columns, axis=1, inplace=True)
X_test.drop(categorical_columns, axis=1, inplace=True)

X_train = pd.concat([X_train.reset_index(drop=True), X_train_encoded_df], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), X_test_encoded_df], axis=1)

In [40]:
pipeline_minmax = make_pipeline(MinMaxScaler(), LogisticRegression())
param_grid_minmax = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__penalty': ['l2'],
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'logisticregression__max_iter': [100, 200, 300],
    'logisticregression__class_weight': [None, 'balanced'],
}
grid_search_minmax = GridSearchCV(pipeline_minmax, param_grid_minmax, cv=10, n_jobs=-1)

grid_search_minmax.fit(X_train, y_train)
y_pred_minmax = grid_search_minmax.predict(X_test)
y_train_pred_minmax = grid_search_minmax.predict(X_train)
accuracy_minmax = accuracy_score(y_test, y_pred_minmax)

print("MinMaxScaler best params:", grid_search_minmax.best_params_)
print("MinMaxScaler accuracy:", accuracy_minmax)
print("MinMaxScaler best score:", grid_search_minmax.best_score_)
print("Classification Report for Test Data")
print(classification_report(y_test, y_pred_minmax))
print("Classification Report for Training Data")
print(classification_report(y_train, y_train_pred_minmax))

MinMaxScaler best params: {'logisticregression__C': 100, 'logisticregression__class_weight': 'balanced', 'logisticregression__max_iter': 300, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'sag'}
MinMaxScaler accuracy: 0.8022284122562674
MinMaxScaler best score: 0.8042394647385744
Classification Report for Test Data
              precision    recall  f1-score   support

    Alliance       0.80      0.79      0.80       526
       Horde       0.81      0.81      0.81       551

    accuracy                           0.80      1077
   macro avg       0.80      0.80      0.80      1077
weighted avg       0.80      0.80      0.80      1077

Classification Report for Training Data
              precision    recall  f1-score   support

    Alliance       0.81      0.81      0.81      2153
       Horde       0.81      0.82      0.81      2153

    accuracy                           0.81      4306
   macro avg       0.81      0.81      0.81      4306
weighted avg       0.81

In [41]:
pipeline_standard = make_pipeline(StandardScaler(), LogisticRegression())
param_grid_standard = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__penalty': ['l2'],
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'logisticregression__max_iter': [100, 200, 300],
    'logisticregression__class_weight': [None, 'balanced'],
}
grid_search_standard = GridSearchCV(pipeline_standard, param_grid_standard, cv=10, n_jobs=-1)

grid_search_standard.fit(X_train, y_train)
y_pred_standard = grid_search_standard.predict(X_test)
y_train_pred_standard = grid_search_standard.predict(X_train)
accuracy_standard = accuracy_score(y_test, y_pred_standard)

print("StandardScaler best params:", grid_search_standard.best_params_)
print("StandardScaler accuracy:", accuracy_standard)
print("StandardScaler best score:", grid_search_standard.best_score_)
print("Classification Report for Test Data")
print(classification_report(y_test, y_pred_standard))
print("Classification Report for Training Data")
print(classification_report(y_train, y_train_pred_standard))

StandardScaler best params: {'logisticregression__C': 10, 'logisticregression__class_weight': None, 'logisticregression__max_iter': 100, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'newton-cg'}
StandardScaler accuracy: 0.8031569173630455
StandardScaler best score: 0.8044714833000594
Classification Report for Test Data
              precision    recall  f1-score   support

    Alliance       0.80      0.79      0.80       526
       Horde       0.81      0.81      0.81       551

    accuracy                           0.80      1077
   macro avg       0.80      0.80      0.80      1077
weighted avg       0.80      0.80      0.80      1077

Classification Report for Training Data
              precision    recall  f1-score   support

    Alliance       0.81      0.81      0.81      2153
       Horde       0.81      0.81      0.81      2153

    accuracy                           0.81      4306
   macro avg       0.81      0.81      0.81      4306
weighted avg      

In [42]:
param_grid_decision_tree = {
    'max_depth': [None, 10, 20, 30], 
    'min_samples_split': [2, 5, 10], 
    'min_samples_leaf': [1, 8, 64],
    'max_features': [None, 'sqrt', 'log2'],
}

decision_tree = DecisionTreeClassifier()

grid_search = GridSearchCV(decision_tree, param_grid_decision_tree, cv=10, n_jobs=-1)

grid_search.fit(X_train, y_train)

y_pred_test = grid_search.predict(X_test)
y_pred_train = grid_search.predict(X_train)

accuracy = accuracy_score(y_test, y_pred_test)

print("DecisionTreeClassifier najlepsze parametry:", grid_search.best_params_)
print("DecisionTreeClassifier accuracy:", accuracy)
print("DecisionTreeCLassifier best score:", grid_search.best_score_)
print("Classification Report for Test Data")
print(classification_report(y_test, y_pred_test))
print("Classification Report for Training Data")
print(classification_report(y_train, y_pred_train))

DecisionTreeClassifier najlepsze parametry: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
DecisionTreeClassifier accuracy: 0.8458681522748375
DecisionTreeCLassifier best score: 0.845336966492203
Classification Report for Test Data
              precision    recall  f1-score   support

    Alliance       0.83      0.85      0.84       526
       Horde       0.86      0.84      0.85       551

    accuracy                           0.85      1077
   macro avg       0.85      0.85      0.85      1077
weighted avg       0.85      0.85      0.85      1077

Classification Report for Training Data
              precision    recall  f1-score   support

    Alliance       0.92      0.93      0.93      2153
       Horde       0.93      0.92      0.93      2153

    accuracy                           0.93      4306
   macro avg       0.93      0.93      0.93      4306
weighted avg       0.93      0.93      0.93      4306



In [43]:
param_grid_gradient_boosting = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30], 
    'min_samples_split': [2, 5, 10], 
    'min_samples_leaf': [1, 8, 64],     
    'max_features': [None, 'sqrt', 'log2'],
}

gradient_boosting = GradientBoostingClassifier()

grid_search = GridSearchCV(gradient_boosting, param_grid_gradient_boosting, cv=10, n_jobs=-1)

grid_search.fit(X_train, y_train)

y_pred_test = grid_search.predict(X_test)
y_pred_train = grid_search.predict(X_train)

accuracy = accuracy_score(y_test, y_pred_test)

print("GradientBoostingClassifier best params:", grid_search.best_params_)
print("GradientBoostingClassifier accuracy:", accuracy)
print("GradientBoostingClassifier best score:", grid_search.best_score_)
print("Classification Report for Test Data")
print(classification_report(y_test, y_pred_test))
print("Classification Report for Training Data")
print(classification_report(y_train, y_pred_train))

GradientBoostingClassifier best params: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 8, 'min_samples_split': 5, 'n_estimators': 50}
GradientBoostingClassifier accuracy: 0.8737233054781801
GradientBoostingClassifier best score: 0.8783159769060594
Classification Report for Test Data
              precision    recall  f1-score   support

    Alliance       0.89      0.85      0.87       526
       Horde       0.86      0.90      0.88       551

    accuracy                           0.87      1077
   macro avg       0.87      0.87      0.87      1077
weighted avg       0.87      0.87      0.87      1077

Classification Report for Training Data
              precision    recall  f1-score   support

    Alliance       0.99      0.99      0.99      2153
       Horde       0.99      0.99      0.99      2153

    accuracy                           0.99      4306
   macro avg       0.99      0.99      0.99      4306
weighted avg       0.99      0.99      0.99      4306



In [44]:
param_grid_random_forest = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30], 
    'min_samples_split': [2, 5, 10], 
    'min_samples_leaf': [1, 8, 64],     
    'max_features': [None, 'sqrt', 'log2'],
}
random_forest = RandomForestClassifier()

grid_search = GridSearchCV(random_forest, param_grid_random_forest, cv=10, n_jobs=-1)

grid_search.fit(X_train, y_train)

y_pred_test = grid_search.predict(X_test)
y_pred_train = grid_search.predict(X_train)

accuracy = accuracy_score(y_test, y_pred_test)

print("RandomForestClassifier best params:", grid_search.best_params_)
print("RandomForestClassifier accuracy:", accuracy)
print("RandomForestClassifier best score:", grid_search.best_score_)
print("Classification Report for Test Data")
print(classification_report(y_test, y_pred_test))
print("Classification Report for Training Data")
print(classification_report(y_train, y_pred_train))

RandomForestClassifier best params: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
RandomForestClassifier accuracy: 0.8783658310120706
RandomForestClassifier best score: 0.8753051313872552
Classification Report for Test Data
              precision    recall  f1-score   support

    Alliance       0.89      0.85      0.87       526
       Horde       0.86      0.90      0.88       551

    accuracy                           0.88      1077
   macro avg       0.88      0.88      0.88      1077
weighted avg       0.88      0.88      0.88      1077

Classification Report for Training Data
              precision    recall  f1-score   support

    Alliance       0.99      0.99      0.99      2153
       Horde       0.99      0.99      0.99      2153

    accuracy                           0.99      4306
   macro avg       0.99      0.99      0.99      4306
weighted avg       0.99      0.99      0.99      4306



In [45]:
def create_model(optimizer='adam', init='glorot_uniform', dropout_rate=0.0, neurons=1):
    model = Sequential()
    model.add(Dense(neurons, input_dim=4, kernel_initializer=init, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(3, kernel_initializer=init, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

model = KerasClassifier(build_fn=create_model, verbose=0)
param_grid = {
    'batch_size': [10, 20, 40],
    'epochs': [10, 50, 100],
    'optimizer': ['SGD', 'Adam'],
    'dropout_rate': [0.0, 0.1, 0.2],
    'neurons': [1, 5, 10]
}
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

y_pred_test = grid_result.predict(X_test)
y_pred_train = grid_result.predict(X_train)

accuracy = accuracy_score(y_test, y_pred_test)

print("Sequential best params:", grid_result.best_params_)
print("Sequential accuracy:", accuracy)
print("Sequential best score:", grid_result.best_score_)
print("Classification Report for Test Data")
print(classification_report(y_test, y_pred_test))
print("Classification Report for Training Data")
print(classification_report(y_train, y_pred_train))

ValueError: Invalid parameter dropout_rate for estimator KerasClassifier.
This issue can likely be resolved by setting this parameter in the KerasClassifier constructor:
`KerasClassifier(dropout_rate=0.0)`
Check the list of available parameters with `estimator.get_params().keys()`