In [1]:
import numpy as np
import pandas as pd
import math
import itertools
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [2]:
# Columns excluded from the dataset-A feature matrix (`result` is the target).
A_NON_FEATURE_COLS = ['teamname1', 'teamname2', 'result', 'towers1', 'towers2']


def _load_A_year(year):
    """Load one season of dataset A and split it into features and target.

    Reads ``data/A_data_<year>.csv``, removes columns that are entirely NaN,
    then removes every row that still contains a NaN.

    Parameters
    ----------
    year : int
        Season to load; selects the CSV file name.

    Returns
    -------
    df : pandas.DataFrame
        The cleaned frame (all remaining columns).
    x : pandas.DataFrame
        ``df`` without the columns listed in ``A_NON_FEATURE_COLS``.
    y : numpy.ndarray
        Values of the ``result`` column.
    """
    df = pd.read_csv('data/A_data_%d.csv' % year)
    # Order matters: dropping all-NaN columns first keeps rows that would
    # otherwise be discarded only because of an empty column.
    df = df.dropna(axis=1, how='all')
    df = df.dropna(axis=0, how='any')
    x = df.drop(A_NON_FEATURE_COLS, axis=1)
    y = df['result'].values
    return df, x, y


# One call per season; the module-level names are what later cells refer to.
df2017A, x2017A, y2017A = _load_A_year(2017)
df2018A, x2018A, y2018A = _load_A_year(2018)
df2019A, x2019A, y2019A = _load_A_year(2019)
df2020A, x2020A, y2020A = _load_A_year(2020)
df2021A, x2021A, y2021A = _load_A_year(2021)
df2022A, x2022A, y2022A = _load_A_year(2022)

In [3]:
# Hold out 20% of the 2018 season of dataset A; the seed fixes the split.
X_trainA, X_testA, y_trainA, y_testA = train_test_split(
    x2018A, y2018A, test_size=0.2, random_state=1
)

# NOTE(review): `result` is fitted as a regression target and scored with R^2.
# If it is actually a class label, a classifier and accuracy would be the
# natural choice -- confirm against the data.
forest = RandomForestRegressor(
    n_estimators=1000,
    criterion='squared_error',
    random_state=1,
    n_jobs=-1,
)
forest.fit(X_trainA, y_trainA)

# Predictions on both parts of the split, kept under the names later cells may use.
y_trainA_pred, y_testA_pred = forest.predict(X_trainA), forest.predict(X_testA)

r2_trainA = r2_score(y_trainA, y_trainA_pred)
r2_testA = r2_score(y_testA, y_testA_pred)
print('R^2 trainA: %.2f, testA: %.2f' % (r2_trainA, r2_testA))

R^2 trainA: 0.99, testA: 0.96


In [4]:
# Apply the forest fitted on the 2018 training split to the 2017-2020 seasons.
# The 2018 frame contains the rows the forest was trained on, so its score is
# not an out-of-sample figure. The 2021 and 2022 seasons are not scored here.
y_2017A_pred, y_2018A_pred, y_2019A_pred, y_2020A_pred = (
    forest.predict(season) for season in (x2017A, x2018A, x2019A, x2020A)
)

for template, truth, guess in (
    ('R^2 2017A: %.2f', y2017A, y_2017A_pred),
    ('R^2 2018A: %.2f', y2018A, y_2018A_pred),
    ('R^2 2019A: %.2f', y2019A, y_2019A_pred),
    ('R^2 2020A: %.2f', y2020A, y_2020A_pred),
):
    print(template % r2_score(truth, guess))

R^2 2017A: 0.99
R^2 2018A: 0.99
R^2 2019A: 0.98
R^2 2020A: 0.97


In [5]:
# Columns removed from dataset B immediately after loading.
B_DROPPED_COLS = ['date', 'team1', 'team2', 'visionscorepg1', 'visionscorepg2']


def _load_B_year(year):
    """Load one season of dataset B and split it into features and target.

    Reads ``data/B_data_<year>.csv``, drops the columns in ``B_DROPPED_COLS``
    and keeps only rows whose sum over the remaining columns exceeds 1.

    Parameters
    ----------
    year : int
        Season to load; selects the CSV file name.

    Returns
    -------
    df : pandas.DataFrame
        The filtered frame, still containing ``result``.
    x : pandas.DataFrame
        ``df`` without the ``result`` column.
    y : numpy.ndarray
        Values of the ``result`` column.
    """
    df = pd.read_csv('data/B_data_%d.csv' % year)
    df = df.drop(B_DROPPED_COLS, axis=1)
    # Vectorised row sum (replaces a per-row `apply(np.sum, axis=1)`).
    # The sum includes `result`; presumably this discards rows with no
    # recorded statistics -- TODO confirm the intent of the `> 1` threshold.
    df = df[df.sum(axis=1) > 1]
    x = df.drop(['result'], axis=1)
    y = df['result'].values
    return df, x, y


# One call per season; the module-level names are what later cells refer to.
df2017B, x2017B, y2017B = _load_B_year(2017)
df2018B, x2018B, y2018B = _load_B_year(2018)
df2019B, x2019B, y2019B = _load_B_year(2019)
df2020B, x2020B, y2020B = _load_B_year(2020)
df2021B, x2021B, y2021B = _load_B_year(2021)
df2022B, x2022B, y2022B = _load_B_year(2022)

In [6]:
# Hold out 20% of the 2020 season of dataset B; the seed fixes the split.
X_trainB, X_testB, y_trainB, y_testB = train_test_split(
    x2020B, y2020B, test_size=0.2, random_state=42
)

# The scaler is fitted on the training rows only; every other frame is
# transformed with those same statistics.
sc = StandardScaler().fit(X_trainB)
X_trainB_std, X_testB_std = (sc.transform(part) for part in (X_trainB, X_testB))

(x2017B_std, x2018B_std, x2019B_std,
 x2020B_std, x2021B_std, x2022B_std) = (
    sc.transform(season)
    for season in (x2017B, x2018B, x2019B, x2020B, x2021B, x2022B)
)

In [12]:
# Bagging: 500 unpruned entropy trees, each fitted on a bootstrap sample of
# 80% of the training rows and using all features.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=0)
# The base learner is passed positionally on purpose: its keyword was
# `base_estimator` before scikit-learn 1.2 and is `estimator` since then
# (the old keyword was removed in 1.4). The first positional slot is the base
# learner under both names, so this call runs on either side of the rename.
bag = BaggingClassifier(tree, n_estimators=500,
                        max_samples=0.8, bootstrap=True,
                        max_features=1.0, bootstrap_features=False,
                        n_jobs=1, random_state=1)
bag = bag.fit(X_trainB_std, y_trainB)
y_train_pred = bag.predict(X_trainB_std)
y_test_pred = bag.predict(X_testB_std)

# Accuracy on the training rows and on the held-out 20% of the 2020 season.
bag_train = accuracy_score(y_trainB, y_train_pred)
bag_test = accuracy_score(y_testB, y_test_pred)
print('[Bagging]\naccuracy-train = %.3f, accuracy-test = %.3f' % (bag_train, bag_test))

[Bagging]
accuracy-train = 1.000, accuracy-test = 0.623


In [13]:
# Score the bagged ensemble on every season of dataset B.
# The 2020 frame contains the rows the ensemble was trained on, so its
# accuracy is not an out-of-sample figure.
(y2017B_pred, y2018B_pred, y2019B_pred,
 y2020B_pred, y2021B_pred, y2022B_pred) = (
    bag.predict(season)
    for season in (x2017B_std, x2018B_std, x2019B_std,
                   x2020B_std, x2021B_std, x2022B_std)
)

(bagging2017, bagging2018, bagging2019,
 bagging2020, bagging2021, bagging2022) = (
    accuracy_score(truth, guess)
    for truth, guess in zip(
        (y2017B, y2018B, y2019B, y2020B, y2021B, y2022B),
        (y2017B_pred, y2018B_pred, y2019B_pred,
         y2020B_pred, y2021B_pred, y2022B_pred),
    )
)

print('[Bagging]')
for template, score in (
    ('accuracy-2017 = %.3f', bagging2017),
    ('accuracy-2018 = %.3f', bagging2018),
    ('accuracy-2019 = %.3f', bagging2019),
    ('accuracy-2020 = %.3f', bagging2020),
    ('accuracy-2021 = %.3f', bagging2021),
    ('accuracy-2022 = %.3f', bagging2022),
):
    print(template % score)

[Bagging]
accuracy-2017 = 0.926
accuracy-2018 = 0.925
accuracy-2019 = 0.925
accuracy-2020 = 0.925
accuracy-2021 = 0.835
accuracy-2022 = 0.787


In [14]:
# AdaBoost over decision trees, tuned by 5-fold grid search on tree depth and
# number of boosting rounds.
#
# scikit-learn renamed AdaBoost's `base_estimator` argument to `estimator` in
# 1.2 and removed the old name in 1.4. Look up which name the installed version
# exposes so that the nested grid key below is valid on either side of the rename.
ada_base_param = (
    'estimator'
    if 'estimator' in AdaBoostClassifier().get_params(deep=False)
    else 'base_estimator'
)
ada_depth_key = 'clf__%s__max_depth' % ada_base_param

# The base tree is passed positionally for the same compatibility reason.
# `random_state` makes the boosted trees, and hence the selected parameters and
# reported accuracies, repeatable across runs.
ada_pipeline = Pipeline(
    steps=[("clf", AdaBoostClassifier(DecisionTreeClassifier(), random_state=1))]
)

param_grid = {
    ada_depth_key: [4, 6, 8, 10],
    'clf__n_estimators': [1, 5, 10, 20, 30, 50, 100]
}
ada_grid = GridSearchCV(
    estimator=ada_pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    n_jobs=2,
    cv=5
)
ada_grid.fit(X_trainB_std, y_trainB)

# GridSearchCV refits the best configuration on the full training set by
# default, so `predict` uses that refitted model.
y_train_pred = ada_grid.predict(X_trainB_std)
y_test_pred = ada_grid.predict(X_testB_std)

ada_train = accuracy_score(y_trainB, y_train_pred)
ada_test = accuracy_score(y_testB, y_test_pred)
print('[Adaboost -- max_depth = %d]\naccuracy-train = %.3f, accuracy-test = %.3f'
      % (ada_grid.best_params_.get(ada_depth_key), ada_train, ada_test))

[Adaboost -- max_depth = 4]
accuracy-train = 0.627, accuracy-test = 0.621


In [15]:
# Score the tuned AdaBoost model on every season of dataset B.
# The 2020 frame contains the rows the model was trained on, so its accuracy
# is not an out-of-sample figure. The `y<year>B_pred` names are rebound here.
(y2017B_pred, y2018B_pred, y2019B_pred,
 y2020B_pred, y2021B_pred, y2022B_pred) = (
    ada_grid.predict(season)
    for season in (x2017B_std, x2018B_std, x2019B_std,
                   x2020B_std, x2021B_std, x2022B_std)
)

(boosting2017, boosting2018, boosting2019,
 boosting2020, boosting2021, boosting2022) = (
    accuracy_score(truth, guess)
    for truth, guess in zip(
        (y2017B, y2018B, y2019B, y2020B, y2021B, y2022B),
        (y2017B_pred, y2018B_pred, y2019B_pred,
         y2020B_pred, y2021B_pred, y2022B_pred),
    )
)

print('[Boosting]')
for template, score in (
    ('accuracy-2017 = %.3f', boosting2017),
    ('accuracy-2018 = %.3f', boosting2018),
    ('accuracy-2019 = %.3f', boosting2019),
    ('accuracy-2020 = %.3f', boosting2020),
    ('accuracy-2021 = %.3f', boosting2021),
    ('accuracy-2022 = %.3f', boosting2022),
):
    print(template % score)

[Boosting]
accuracy-2017 = 0.627
accuracy-2018 = 0.629
accuracy-2019 = 0.626
accuracy-2020 = 0.626
accuracy-2021 = 0.622
accuracy-2022 = 0.620
