In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
try:
    df = pd.read_csv('../data/user_data.csv')
except FileNotFoundError as fnfError:
    print(fnfError)

In [3]:
df

Unnamed: 0,spotted,battles_on_stunning_vehicles,hits,battle_avg_xp,draws,max_xp,survived_battles,wins,losses,capture_points,...,max_frags,shots,frags,max_damage,xp,avg_damage_assisted,piercings,user_id,nickname,wn8
0,807,38,3564,263,15,1116,174,398,396,522,...,5,7319,389,1567,212897,108.00,2126,630675552,greenbirdus,56779
1,1530,3,9382,234,26,1239,611,1014,1029,3847,...,6,19333,1233,2012,483650,111.85,6362,596561562,C_ABTOBA3A,53066
2,128,0,1500,147,2,726,130,265,254,446,...,5,4138,157,1036,76622,42.06,883,634554120,Babay19910403,11749
3,14488,105,56828,382,118,2160,1985,5282,5098,8549,...,10,96897,7643,6255,4005621,297.38,35147,502086999,2803nevs,113425
4,498,0,4965,408,3,2181,202,321,197,348,...,10,8304,895,2232,212310,80.20,3878,534287717,ce3peo,217230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2172,813,1,3940,250,8,1269,207,529,568,357,...,7,6900,641,1958,276665,133.62,2914,634229130,xeno2_77095897,48250
2173,3627,0,19750,272,31,2067,830,1784,1955,5536,...,7,38619,2159,3721,1023769,181.33,11021,511024297,Lotr144,59699
2174,754,42,5213,279,22,1238,244,638,752,508,...,4,10519,632,2441,393485,122.60,3377,559642399,rozpieracz_zbieracz_pl,50049
2175,10667,0,93332,307,69,1510,3041,6226,5921,9428,...,8,173847,9041,3848,3752134,152.43,49361,534037002,vikingogo,87173


In [4]:
def addFeatures(df):
    df['win_ratio'] = df['wins']/df['battles']
    df['lose_ratio'] = df['losses']/df['battles']
    df['accuracy'] = df['hits']/df['shots']
    df['efficiency'] = df['piercings']/df['hits']
    df['frags_per_battle'] = df['frags']/df['battles']
    df['cap_points_per_battle'] = df['capture_points']/df['battles']
    df['damage_dealt_per_battle'] = df['damage_dealt']/df['battles']
    df['survival_rate'] = df['survived_battles']/df['battles']
    df['spots_per_battle'] = df['spotted']/df['battles']
    
    return df

df = addFeatures(df)
df.columns

Index(['spotted', 'battles_on_stunning_vehicles', 'hits', 'battle_avg_xp',
       'draws', 'max_xp', 'survived_battles', 'wins', 'losses',
       'capture_points', 'battles', 'damage_dealt', 'damage_received',
       'max_frags', 'shots', 'frags', 'max_damage', 'xp',
       'avg_damage_assisted', 'piercings', 'user_id', 'nickname', 'wn8',
       'win_ratio', 'lose_ratio', 'accuracy', 'efficiency', 'frags_per_battle',
       'cap_points_per_battle', 'damage_dealt_per_battle', 'survival_rate',
       'spots_per_battle'],
      dtype='object')

In [5]:
df

Unnamed: 0,spotted,battles_on_stunning_vehicles,hits,battle_avg_xp,draws,max_xp,survived_battles,wins,losses,capture_points,...,wn8,win_ratio,lose_ratio,accuracy,efficiency,frags_per_battle,cap_points_per_battle,damage_dealt_per_battle,survival_rate,spots_per_battle
0,807,38,3564,263,15,1116,174,398,396,522,...,56779,0.491965,0.489493,0.486952,0.596521,0.480841,0.645241,233.273177,0.215080,0.997528
1,1530,3,9382,234,26,1239,611,1014,1029,3847,...,53066,0.490092,0.497342,0.485284,0.678107,0.595940,1.859352,252.511358,0.295312,0.739488
2,128,0,1500,147,2,726,130,265,254,446,...,11749,0.508637,0.487524,0.362494,0.588667,0.301344,0.856046,99.529750,0.249520,0.245681
3,14488,105,56828,382,118,2160,1985,5282,5098,8549,...,113425,0.503143,0.485616,0.586478,0.618480,0.728043,0.814346,676.558678,0.189084,1.380072
4,498,0,4965,408,3,2181,202,321,197,348,...,217230,0.616123,0.378119,0.597905,0.781067,1.717850,0.667946,449.168906,0.387716,0.955854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2172,813,1,3940,250,8,1269,207,529,568,357,...,48250,0.478733,0.514027,0.571014,0.739594,0.580090,0.323077,241.606335,0.187330,0.735747
2173,3627,0,19750,272,31,2067,830,1784,1955,5536,...,59699,0.473210,0.518568,0.511406,0.558025,0.572679,1.468435,363.927851,0.220159,0.962069
2174,754,42,5213,279,22,1238,244,638,752,508,...,50049,0.451841,0.532578,0.495579,0.647804,0.447592,0.359773,366.162181,0.172805,0.533994
2175,10667,0,93332,307,69,1510,3041,6226,5921,9428,...,87173,0.509659,0.484692,0.536863,0.528875,0.740095,0.771775,417.754257,0.248936,0.873199


In [6]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2)

In [7]:
def dropIrrelevantCols(df, cols):
    return df.drop(columns=cols)
df_train = dropIrrelevantCols(df_train, ['user_id', 'nickname', 'battles_on_stunning_vehicles', 'hits', 'draws', 'wins', 'losses', 'capture_points', 'piercings', 'shots', 'max_frags', 'max_damage', 'max_xp', 'spotted', 'damage_dealt', 'damage_received'])
df_test = dropIrrelevantCols(df_test, ['user_id', 'nickname', 'battles_on_stunning_vehicles', 'hits', 'draws', 'wins', 'losses', 'capture_points', 'piercings', 'shots', 'max_frags', 'max_xp', 'max_damage', 'spotted', 'damage_dealt', 'damage_received'])

In [8]:
def convertColsToFloat(df, cols):
    for column in cols:
        try:
            df[column] = df[column].astype(float)
        except:
            df[column] = df[column].str.replace(',', '.')
            df[column] = df[column].astype(float)
            
    return df
df_train = convertColsToFloat(df_train, list(df_train.columns))
df_test = convertColsToFloat(df_test, list(df_test.columns))

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.compose import make_column_transformer

X_train, y_train = df_train.drop(columns=['wn8']), df_train['wn8']
X_test, y_test = df_test.drop(columns=['wn8']), df_test['wn8']
numericalAttribs = X_train.select_dtypes(np.number)
numericalAttribsNames = list(numericalAttribs.columns)
numericalPipeline = make_pipeline(SimpleImputer(strategy='median'),
                                  StandardScaler())

preprocessing = make_column_transformer((numericalPipeline, numericalAttribsNames))

In [10]:
df_train.columns

Index(['battle_avg_xp', 'survived_battles', 'battles', 'frags', 'xp',
       'avg_damage_assisted', 'wn8', 'win_ratio', 'lose_ratio', 'accuracy',
       'efficiency', 'frags_per_battle', 'cap_points_per_battle',
       'damage_dealt_per_battle', 'survival_rate', 'spots_per_battle'],
      dtype='object')

In [11]:
numericalAttribsNames

['battle_avg_xp',
 'survived_battles',
 'battles',
 'frags',
 'xp',
 'avg_damage_assisted',
 'win_ratio',
 'lose_ratio',
 'accuracy',
 'efficiency',
 'frags_per_battle',
 'cap_points_per_battle',
 'damage_dealt_per_battle',
 'survival_rate',
 'spots_per_battle']

In [12]:
from sklearn.metrics import mean_absolute_error, max_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, ElasticNet, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

def trainAndScore(models, X_train, X_test, y_train, y_test):
    modelsScore = {}
    for name, model in models.items():
        metricsScore = {}
        model.fit(X_train, y_train)
        
        meanAbsErr = abs(cross_val_score(model, X_test, y_test, scoring='neg_mean_absolute_error', cv=30)).mean()
        maxErr = abs(cross_val_score(model, X_test, y_test, scoring='max_error', cv=30)).mean()
        metricsScore.update({'Mean Absolute Error':meanAbsErr})
        metricsScore.update({'Max Error':maxErr})
        
        modelsScore.update({name:metricsScore})
    
    return modelsScore

def applyPreprocessing(model, preprocessing):
    modelWithPreprocessing = make_pipeline(preprocessing,
                                           model)
    return modelWithPreprocessing

candidateModels = {'Lasso':applyPreprocessing(Lasso(), preprocessing),
                   'Liner Regression':applyPreprocessing(LinearRegression(), preprocessing),
                   'Elastic Net':applyPreprocessing(ElasticNet(), preprocessing),
                   'Random Forest':applyPreprocessing(RandomForestRegressor(), preprocessing),
                   'Ridge':applyPreprocessing(Ridge(), preprocessing),
                   'SVR (Linear)':applyPreprocessing(SVR(kernel='linear'), preprocessing),
                   'SVR (rbf)':applyPreprocessing(SVR(kernel='rbf'), preprocessing)}

scores = trainAndScore(candidateModels, X_train, X_test, y_train, y_test)

In [13]:
scores

{'Lasso': {'Mean Absolute Error': 62.45702463076472,
  'Max Error': 198.91820539420434},
 'Liner Regression': {'Mean Absolute Error': 62.07101954383441,
  'Max Error': 196.37168585287105},
 'Elastic Net': {'Mean Absolute Error': 87.75630821730405,
  'Max Error': 270.4535688997419},
 'Random Forest': {'Mean Absolute Error': 63.858958650793646,
  'Max Error': 265.7529766666669},
 'Ridge': {'Mean Absolute Error': 62.13079887565839,
  'Max Error': 196.55776262408574},
 'SVR (Linear)': {'Mean Absolute Error': 82.91311141471543,
  'Max Error': 260.17979062520845},
 'SVR (rbf)': {'Mean Absolute Error': 367.7435947793473,
  'Max Error': 1173.43232056561}}

In [14]:
from sklearn.model_selection import GridSearchCV
randomForestGrid = {'n_estimators':[100, 300, 500, 1000],
                    'criterion':['absolute_error'],
                    'max_depth':[None, 8, 10, 12, 15],
                    'min_samples_split':[2, 5, 10]}
ridgeGrid = {'alpha':[0, 0.1, 0.5, 1.0, 2, 5, 10],
             'max_iter':[1000, 5000, 10000, 15000, 20000]}

randomForestModel = GridSearchCV(RandomForestRegressor(),
                                 param_grid=randomForestGrid,
                                 cv=5,
                                 verbose=True)
ridgeModel = GridSearchCV(Ridge(),
                          param_grid=ridgeGrid,
                          cv=5,
                          verbose=True)
randomForestModel.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
