In [1]:
# inline plotting instead of popping out
%matplotlib inline

# python 3.6.8
import os, itertools, csv

from IPython.display import Image
from IPython.display import display

# numpy  1.19.5
import numpy as np

# pandas  0.25.3
import pandas as pd

# scikit-learn  0.22
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_moons
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# matplotlib  3.1.2
import matplotlib
# Import the pyplot submodule explicitly: a bare `import matplotlib` does not
# by itself guarantee that the `matplotlib.pyplot` attribute is available.
import matplotlib.pyplot as plt

# Enlarge the default font used by every figure drawn afterwards.
matplotlib.rcParams.update({'font.size': 22})


# Make output directory (no error if it already exists, so the cell can be re-run)
os.makedirs("output/", exist_ok=True)

In [2]:
# Read the dataset from the CSV file located next to the notebook.
df = pd.read_csv('./dataC_2022.csv')

# Preview the first ten rows, then report the (rows, columns) shape.
display(df.iloc[:10])
df.shape

Unnamed: 0,teamname1,Aatrox1,Ahri1,Akali1,Alistar1,Amumu1,Anivia1,Annie1,Ashe1,Aurelion Sol1,...,firstblood,firstdragon,firstherald,firstbaron,firsttower,GDa10,XPDa10,GDa15,XPDa15,result
0,Fnatic Academy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-2265.0,-357.0,-2864.0,-2168.0,0
1,AlienTech eSports,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1778.0,877.0,4520.0,4241.0,1
2,unknown team,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,315.0,729.0,-984.0,-1094.0,1
3,Team Forge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,-505.0,-394.0,-3485.0,-3181.0,0
4,LDLC OL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1623.0,-517.0,-5963.0,-2883.0,0
5,Fnatic Academy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,175.0,-906.0,-3256.0,-2925.0,0
6,AlienTech eSports,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,-694.0,-620.0,-2981.0,-1089.0,0
7,ASUS ROG Army,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,-757.0,-305.0,-41.0,422.0,1
8,Team Kinguin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1667.0,-522.0,-1446.0,-43.0,0
9,EURONICS Gaming,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-2480.0,-91.0,-4367.0,-1490.0,0


(48913, 471)

In [3]:
# Every column except the two team names and the label is used as a feature.
non_feature_columns = ['teamname1', 'teamname2', 'result']
X = df.drop(columns=non_feature_columns).values

# The label vector comes from the 'result' column.
Y = df.loc[:, 'result'].values

display(X.shape)

(48913, 468)

In [4]:
# Count the missing entries of each column.
missing_per_column = df.isna().sum()
display(missing_per_column)

# Shape before dropping anything.
print(df.shape)

# Drop every row that contains at least one missing value and compare shapes.
df_drop_row = df.dropna(axis=0, how='any')
print(df_drop_row.shape)

teamname1    0
Aatrox1      0
Ahri1        0
Akali1       0
Alistar1     0
            ..
GDa10        0
XPDa10       0
GDa15        0
XPDa15       0
result       0
Length: 471, dtype: int64

(48913, 471)
(48913, 471)


In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Both pipelines use the same preprocessing: fill missing entries with the
# most frequent value of the column, then standardize every feature.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
pipe_knn = Pipeline([
    ("imr", SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')),
    ("scl", StandardScaler()),
    ("clf", KNeighborsClassifier(n_neighbors = 10, p = 2, metric = "minkowski"))
])

pipe_svm = Pipeline([
    ("imr", SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')),
    ("scl", StandardScaler()),
    ('clf', SVC(kernel = "rbf", random_state = 0, gamma = 0.001, C = 100.0))
])

# Train each pipeline and report its error count and accuracy on the held-out
# split. After the loop `y_pred` holds the predictions of the last model (SVC).
for header, pipe in (('[KNN]', pipe_knn), ('\n[SVC]', pipe_svm)):
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(header)
    print('Misclassified samples: %d' % (y_test != y_pred).sum())
    print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))

[KNN]
Misclassified samples: 3051
Accuracy: 0.6881


In [None]:
# Feature-only view of the data; its column order matches the columns of X.
df1 = df.drop(columns=['teamname1', 'teamname2', 'result'])

In [None]:
# Number of most important features to show in the chart.
top_k = 10

# Permutation importance of every feature for the fitted SVC pipeline, measured
# on the held-out split. The seed makes the random shuffling repeatable.
perm_importance = permutation_importance(pipe_svm, X_test, y_test, random_state=0)

# argsort is ascending, so the tail of `sorted_idx` holds the top features.
# Slicing from the end avoids hard-coding the total number of features.
sorted_idx = perm_importance.importances_mean.argsort()
top_idx = sorted_idx[-top_k:]

plt.barh(df1.columns[top_idx].tolist(), perm_importance.importances_mean[top_idx])
plt.xlabel("Permutation Importance");

In [None]:
# inline plotting instead of popping out
%matplotlib inline

# python 3.8.8
import os, itertools, csv

from IPython.display import Image
from IPython.display import display

# numpy  1.22.4
import numpy as np

# pandas  1.2.4
import pandas as pd

# scikit-learn  0.24.1
from sklearn import datasets
load_iris = datasets.load_iris
make_moons = datasets.make_moons
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error, roc_curve, auc
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# matplotlib  3.3.4
import matplotlib.pyplot as plt

# Make output directory
if not os.path.exists("output/") : os.mkdir("output/")
    
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Three base learners; logistic regression and KNN get a scaler in front.
pipe1 = Pipeline([
    ['sc', StandardScaler()],
    ['clf', LogisticRegression(C = 10, random_state = 0, solver = "liblinear")],
])
pipe2 = Pipeline([
    ['clf', DecisionTreeClassifier(max_depth = None, random_state = 0)],
])
pipe3 = Pipeline([
    ['sc', StandardScaler()],
    ['clf', KNeighborsClassifier(n_neighbors = 5)],
])

clf_labels = ['LogisticRegression', 'DecisionTree', 'KNN']

# Try every ordering of the weights 0, 1, 2 over the three learners and keep
# the soft-voting ensemble with the best mean 10-fold cross-validation score.
best_vt, best_w, best_score = None, (), -1
for weights in itertools.permutations(range(3)):
    clf = VotingClassifier(
        estimators=[('LogisticRegression', pipe1), ('DecisionTree', pipe2), ('KNN', pipe3)],
        voting='soft',
        weights=list(weights),
    )
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10)
    mean_score, std_score = scores.mean(), scores.std()
    print('%s: %.3f (+/- %.3f)' % (weights, mean_score, std_score))
    if mean_score > best_score:
        best_vt, best_w, best_score = clf, weights, mean_score

In [None]:
print("[Voting]")
print('\nBest %s: %.3f' % (best_w, best_score))

# Fit the selected ensemble on the training split and score both splits.
voting = best_vt.fit(X_train, y_train)
y_train_pred, y_test_pred = voting.predict(X_train), voting.predict(X_test)

voting_train = accuracy_score(y_train, y_train_pred)
voting_test = accuracy_score(y_test, y_test_pred)
print("Train Acc:", voting_train)
print("Test  Acc:", voting_test)

In [None]:
# Fully grown entropy tree used as the base learner of the bagging ensemble.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=0)

# 500 trees, each fitted on a bootstrap sample of 70% of the rows and on all features.
bag = BaggingClassifier(
    base_estimator=tree,
    n_estimators=500,
    max_samples=0.7,
    bootstrap=True,
    max_features=1.0,
    bootstrap_features=False,
    n_jobs=1,
    random_state=1,
)

bag.fit(X_train, y_train)
y_train_pred, y_test_pred = bag.predict(X_train), bag.predict(X_test)

# Accuracy on the training and on the held-out split.
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
print('[Bagging]')
print('accuracy-train = %.3f, accuracy-test = %.3f' % (bag_train, bag_test))

In [None]:
# Feature-only view of the data; its column order matches the columns of X.
df1 = df.drop(columns=['teamname1', 'teamname2', 'result'])

In [None]:
# Number of most important features to show in the chart.
top_k = 10

# Average the per-tree importances over all fitted estimators of the ensemble.
# NOTE(review): this assumes every tree was fitted on all columns in their
# original order (max_features=1.0 on the ensemble) - confirm if that changes.
feature_importances = np.mean([
    estimator.feature_importances_ for estimator in bag.estimators_
], axis=0)

# argsort is ascending, so the tail of `sorted_idx` holds the top features.
# Slicing from the end avoids hard-coding the total number of features.
sorted_idx = feature_importances.argsort()
top_idx = sorted_idx[-top_k:]

plt.barh(df1.columns[top_idx].tolist(), feature_importances[top_idx])
plt.xlabel("Mean Feature Importance");

In [None]:
# AdaBoost over decision trees. Both the base tree and the booster are seeded
# so that repeated runs of the grid search give the same result, matching the
# other models in this notebook.
ada_pipeline = Pipeline(steps = [
    ("clf", AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(random_state=0),
        random_state=0,
    ))
])

# Search over the depth of the base tree and the number of boosting rounds.
params_grid = {
    'clf__base_estimator__max_depth': [5, 7, 9, 10, 20],
    'clf__n_estimators': [1, 10, 50, 100, 150, 200]
}

# 5-fold cross-validated grid search scored by accuracy.
ada_grid = GridSearchCV(
    estimator = ada_pipeline,
    param_grid = params_grid,
    scoring = "accuracy",
    n_jobs = 2,
    cv = 5
)

ada_grid.fit(X_train, y_train)

# Predictions come from the best parameter combination found by the search.
y_train_pred = ada_grid.predict(X_train)
y_test_pred = ada_grid.predict(X_test)

ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print("[Adaboost_BaseTree_MaxDepth=Best]")
print("Train Acc:", ada_train)
print("Test  Acc:", ada_test)
print(ada_grid.best_params_)