In [1]:
# Load necessary packages

import os
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
DATA_PATH = "data"

# Load the training and validation tables from DATA_PATH, then preview the training rows
train, valid = (pd.read_csv(os.path.join(DATA_PATH, file_name))
                for file_name in ('trainingData.csv', 'validationData.csv'))
train.head()

Unnamed: 0,deck,nofGames,nOfPlayers,winRate
0,archers;arrows;baby-dragon;balloon;bats;fireba...,44,2,0.58511
1,archers;arrows;baby-dragon;balloon;bomber;free...,143,3,0.53767
2,archers;arrows;baby-dragon;balloon;bomber;gian...,61,3,0.34375
3,archers;arrows;baby-dragon;balloon;cannon;free...,162,1,0.49394
4,archers;arrows;baby-dragon;balloon;electro-wiz...,57,1,0.55833


In [3]:
# Preview the first rows of the validation table
valid.head(n=5)

Unnamed: 0,deck,nofGames,nOfPlayers,winRate
0,archers;arrows;baby-dragon;bandit;elixir-colle...,130,32,0.48496
1,archers;arrows;baby-dragon;elixir-collector;go...,495,75,0.47289
2,archers;arrows;baby-dragon;golem;lightning;meg...,271,25,0.52372
3,archers;arrows;baby-dragon;golem;lightning;meg...,125,20,0.58594
4,archers;arrows;balloon;barbarians;fireball;gia...,130,23,0.5


In [4]:
# Helper functions to preprocess data to bag-of-cards format

def unnest(df, col):
    """Expand a list-valued column into one row per list element.

    df  -- DataFrame whose column ``col`` holds a sequence in every row
    col -- name of the column to expand

    Returns ``df`` without ``col``, joined on the index with a Series named
    ``col`` that carries one element per row. Rows are therefore repeated once
    per element and keep their original index label.
    """
    # One row per input row, one column per position in the sequence
    wide = df.apply(lambda row: pd.Series(row[col]), axis=1)
    # Collapse positions into rows and discard the positional index level
    long = wide.stack().reset_index(level=1, drop=True)
    long.name = col
    remaining = df.drop(col, axis=1)
    return remaining.join(long)

def to_bag_of_cards(df):
    """Return a copy of ``df`` extended with one 0/1 indicator column per card.

    df -- DataFrame with a ``deck`` column of ';'-separated card names

    The result is indexed by ``ind`` (1-based row position in ``df``), keeps
    every original column (``deck`` stays a string) and appends one integer
    column per distinct card, holding 1 where the deck contains that card and
    0 otherwise. Card columns are ordered alphabetically.

    The input frame is left untouched, and a card repeated within a deck is
    encoded as 1 instead of raising.
    """
    # Work on a copy so the caller's frame is not modified
    df_orig = df.copy()
    df_orig.index = pd.Index(np.arange(df.shape[0]) + 1, name='ind')
    # Split on ';' and build the indicator matrix in one vectorised step
    df_bag = df_orig['deck'].str.get_dummies(sep=';').astype('int')
    return pd.concat([df_orig, df_bag], axis=1)

In [None]:
# Replace both tables with their bag-of-cards encoding and preview the training rows.
# Note: the names are rebound, so re-running this cell would encode already-encoded frames.
train, valid = map(to_bag_of_cards, (train, valid))
train.head()

In [None]:
# Preview the first rows of the encoded validation table
valid.head(n=5)

In [None]:
# Order both tables from the most-played deck to the least-played one

train, valid = (table.sort_values(by='nofGames', ascending=False)
                for table in (train, valid))

In [None]:
# Specify example model fitting function and R squared metric

from sklearn.svm import SVR

def R2_1(x, y):
    """R squared as one minus the ratio of residual to total sum of squares.

    x -- values compared against ``y`` (e.g. predictions)
    y -- reference values; their mean defines the total sum of squares
    """
    residual_ss = np.sum(np.square(x - y))
    total_ss = np.sum(np.square(y - np.mean(y)))
    return 1 - residual_ss / total_ss

def R2_2(x, y):
    """R squared as the ratio of explained to total sum of squares.

    x -- values whose squared deviations from the mean of ``y`` are summed
         (e.g. predictions)
    y -- reference values; their mean is the centre for both sums
    """
    centre = np.mean(y)
    explained_ss = np.sum(np.square(x - centre))
    total_ss = np.sum(np.square(y - centre))
    return explained_ss / total_ss

def fit_svm(data):
    """Fit an RBF-kernel SVR that predicts ``winRate`` from the other columns.

    data -- DataFrame containing ``deck``, ``nofGames``, ``nOfPlayers`` and
            ``winRate``; every remaining column is used as a feature

    Returns the fitted SVR instance.
    """
    non_feature_cols = ['deck', 'nofGames', 'nOfPlayers', 'winRate']
    features = data.drop(non_feature_cols, axis=1)
    target = data['winRate']
    model = SVR(kernel='rbf', gamma=1.0/90, C=1.0, epsilon=0.02, shrinking=False)
    model.fit(features, target)
    return model

# Training sizes to evaluate: 600, 700, ..., 1500
sizes = np.arange(600, 1600, 100)

In [None]:
# Fit one model per training size and predict on the validation set.
# Each model is trained on the first `size` rows of `train`; the slice starts at
# row 0 so it matches the `train.index.values[:size]` indices written to the
# submission file (the previous `iloc[1:size]` silently skipped the first row).

valid_features = valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1)
model_list = [fit_svm(train.iloc[:size]) for size in sizes]
pred_list = [model.predict(valid_features) for model in model_list]

In [None]:
# Calculate R squared scores of the stored predictions using the R2_2 definition

valid_target = valid['winRate']
r2 = [R2_2(pred, valid_target) for pred in pred_list]
r2

In [None]:
# Calculate R squared scores on the validation set with each model's own score method

valid_features = valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1)
valid_target = valid['winRate']
r2 = [model.score(valid_features, valid_target) for model in model_list]
r2

In [None]:
# Plot the validation score against the training size, with labelled axes so
# the figure can be read on its own
fig, ax = plt.subplots()
ax.plot(sizes, r2)
ax.set_xlabel('Training size')
ax.set_ylabel('Validation R squared')
ax.set_title('Validation R squared vs. training size');

In [None]:
# Average score over all training sizes
np.asarray(r2).mean()

In [None]:
# Save hyperparameters and selected indices in submission format.
# One line per training size: three ';'-separated hyperparameter values (matching
# the epsilon, C and gamma passed to SVR in fit_svm) followed by the comma-separated
# index labels of the first `size` rows of `train`.
# The file is opened in write mode so re-running the cell replaces the previous
# contents instead of appending duplicate lines.

with open('example_sub_python.txt', 'w') as f:
    for size in sizes:
        ind_text = ','.join(map(str, train.index.values[:size]))
        text = ';'.join(['0.02', '1.0', str(1.0 / 90), ind_text])
        f.write(text + '\n')