In [1]:
from __future__ import print_function
from distutils.version import LooseVersion as Version
import sys

# Status tags printed in front of every check result. The \x1b[...m sequences
# are ANSI escape codes (42 = green background, 41 = red background,
# 0 = reset), so the tags only show up coloured on ANSI-aware outputs.
OK = '\x1b[42m[ OK ]\x1b[0m'
FAIL = "\x1b[41m[FAIL]\x1b[0m"

# importlib is what import_version() uses to import packages by name.
# A failed import is reported as an unsupported Python version.
# NOTE(review): nothing is raised here, so execution continues after the
# message and later uses of importlib would still fail.
try:
    import importlib
except ImportError:
    print(FAIL, "Python version 3.9 is required,"
                " but %s is installed." % sys.version)

def import_version(pkg, min_ver, fail_msg=""):
    """Import ``pkg`` and report whether its version equals ``min_ver``.

    Exactly one status line is printed, prefixed with the ``OK`` or ``FAIL``
    tag: success when the installed version compares equal to ``min_ver``,
    failure when it differs, cannot be determined, or the package cannot be
    imported.

    Parameters
    ----------
    pkg : str
        Importable module name, e.g. ``'numpy'``.
    min_ver : str
        Version string the installed package is compared against (equality,
        not "at least").
    fail_msg : str, optional
        Extra hint appended to the message printed when the import fails.

    Returns
    -------
    module or None
        The imported module, or ``None`` when the import failed.
    """
    mod = None
    try:
        mod = importlib.import_module(pkg)
    except ImportError:
        print(FAIL, '%s not installed. %s' % (pkg, fail_msg))
        return mod

    # PIL is special-cased to expose its version as ``VERSION``; every other
    # package is read through ``__version__``.
    ver_attr = 'VERSION' if pkg in {'PIL'} else '__version__'
    ver = getattr(mod, ver_attr, None)

    # All messages name ``pkg`` (the argument) rather than a variable from the
    # calling scope, so the function also works outside the requirements loop.
    if ver is None:
        print(FAIL, "%s version %s is required, but its version could not "
                    "be determined." % (pkg, min_ver))
    elif Version(ver) == min_ver:
        print(OK, "%s version %s is installed."
              % (pkg, min_ver))
    else:
        print(FAIL, "%s version %s is required, but %s installed."
              % (pkg, min_ver, ver))
    return mod


# first check the python version
# Compare the numeric (major, minor) tuple instead of parsing the free-form
# sys.version string (which also carries build and compiler text). A tuple
# comparison has exactly two outcomes, so no "unknown version" branch is needed.
pyversion = sys.version_info[:2]
if pyversion >= (3, 9):
    print(OK, "Python version is %s" % sys.version)
else:
    print(FAIL, "Python version 3.9 is required,"
                " but %s is installed." % sys.version)

    
print()

# Package name -> version string that each installed package is checked against.
requirements = {
    'numpy': "1.21.1",
    'matplotlib': "3.4.2",
    'sklearn': "0.24.2",
    'pandas': "1.3.1",
    'xgboost': "1.3.3",
    'shap': "0.39.0",
}

# now the dependencies
for lib, required_version in requirements.items():
    import_version(lib, required_version)

[42m[ OK ][0m Python version is 3.9.7 | packaged by conda-forge | (default, Sep  2 2021, 17:55:20) [MSC v.1916 64 bit (AMD64)]

[42m[ OK ][0m numpy version 1.21.1 is installed.
[42m[ OK ][0m matplotlib version 3.4.2 is installed.
[42m[ OK ][0m sklearn version 0.24.2 is installed.
[42m[ OK ][0m pandas version 1.3.1 is installed.
[42m[ OK ][0m xgboost version 1.3.3 is installed.
[42m[ OK ][0m shap version 0.39.0 is installed.


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from matplotlib import pylab as plt
%pylab inline
from matplotlib import rcParams
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from  sklearn.linear_model import Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ParameterGrid

Populating the interactive namespace from numpy and matplotlib


pylab import has clobbered these variables: ['plt']
`%matplotlib` prevents importing * from pylab and numpy


In [3]:
# Shared matplotlib defaults applied to every figure drawn after this cell.
fontsize = 14
rcParams.update({
    'font.size': fontsize,
    'font.family': 'sans-serif',
    'axes.labelsize': fontsize,
    'axes.titlesize': fontsize,
    'xtick.labelsize': 12,
    'ytick.labelsize': fontsize,
    'legend.fontsize': fontsize,
    'figure.figsize': (10, 7.5),
    'axes.grid': True,
})

In [4]:
# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Load the full training table from the local data directory.
df = pd.read_csv('data/train_V2.csv')
# Preview 10 random rows; no random_state is passed, so the rows shown differ
# from run to run.
df.sample(10)

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
# Drop every row (axis=0) that contains a missing value.
# inplace=True mutates `df` itself, so the summaries displayed by earlier
# cells describe the frame before this step.
df.dropna(axis=0, inplace=True)

In [None]:
df.shape

In [None]:
# Players per match: number of non-null Id values within each matchId group.
match_df = df.groupby('matchId')['Id'].count()

In [None]:
# Violin plot of the per-match player counts held in match_df.
fig, ax = plt.subplots()
ax.violinplot(match_df)
ax.set_ylabel('number of players')
ax.set_xlabel('match')
ax.set_title('Number of Players in Each Match')
plt.show()
fig.savefig('figures/Number of Players in Each Match.jpg', dpi=300)

In [None]:
# Annotated heatmap of the pairwise correlations returned by df.corr().
fig, ax = plt.subplots(figsize=(16, 12))
ax = sns.heatmap(
    df.corr(),
    annot=True,
    ax=ax,
)
fig.savefig('figures/Correlation.jpg', dpi=300)

In [None]:
# Histogram of the target column winPlacePerc, 100 bins.
fig, ax = plt.subplots()
df['winPlacePerc'].plot.hist(bins=100, ax=ax)
ax.set_xlabel('win place percentage')
ax.set_ylabel('count')
ax.set_title('Distribution of Win Place')
plt.show()
fig.savefig('figures/Distribution of Win Place.jpg', dpi=300)

In [None]:
# Scatter of winPlacePerc against killPlace.
# DataFrame.plot.scatter opens a figure of its own unless it is handed an
# Axes, which left `fig` empty and made savefig write a blank image. Drawing
# on an explicit Axes guarantees the saved figure is the one that was plotted.
fig, ax = plt.subplots()
df.plot.scatter('killPlace', 'winPlacePerc', s=1, alpha=0.1, ax=ax)
ax.set_ylabel('win place percentage')
ax.set_xlabel('kill place')
ax.set_title('Distribution of Win Place vs. Kill Place')
# Save before show so the file is written while the figure is still open.
fig.savefig('figures/Distribution of Win Place vs. Kill Place.jpg', dpi = 300)
plt.show()

In [None]:
# Histogram of walkDistance, 50 bins, logarithmic count axis.
fig, ax = plt.subplots()
df['walkDistance'].plot.hist(bins=50, log=True, ax=ax)
ax.set_xlabel('walk distance')
ax.set_ylabel('count')
ax.set_title('Distribution of Walk Distance')
plt.show()
fig.savefig('figures/Distribution of Walk Distance.jpg', dpi=300)

In [None]:
# Histogram of kills with one bin per distinct kill count, logarithmic count axis.
fig, ax = plt.subplots()
kills = df['kills']
kills.plot.hist(bins=kills.nunique(), log=True, ax=ax)
ax.set_xlabel('kills')
ax.set_ylabel('count')
ax.set_title('Distribution of Kills')
plt.show()
fig.savefig('figures/Distribution of Kills.jpg', dpi=300)

In [None]:
# Box plot of winPlacePerc grouped by matchType.
# DataFrame.boxplot(by=...) creates a new figure when no Axes is supplied, so
# the previously created `fig` stayed empty and a blank image was saved. The
# figure size now goes to plt.subplots and the plot is drawn on that Axes.
fig, ax = plt.subplots(figsize=(22,16.5))
df[['winPlacePerc','matchType']].boxplot(by='matchType', ax=ax)
ax.set_ylabel('win place percentage')
ax.set_xlabel('match type')
ax.set_title('Distribution of Win Place')
# Save before show so the file is written while the figure is still open.
fig.savefig('figures/Distribution of Win Place by Match Type.jpg', dpi = 300)
plt.show()

In [None]:
# Bar chart of the share of rows belonging to each matchType.
fig, ax = plt.subplots()
match_type_share = df['matchType'].value_counts(normalize=True)
match_type_share.plot.bar(ax=ax)
ax.set_ylabel('fraction')
ax.set_xlabel('match type')
ax.set_title('Distribution of Match Type')
plt.show()
fig.savefig('figures/Distribution of Match Type.jpg', dpi=300)

In [None]:
# Pairwise scatter matrix of a few combat statistics and the target.
# scatter_matrix builds its own figure when no Axes is passed, so a figure
# created beforehand stays empty and would be saved blank. Take the figure
# from the returned grid of Axes instead.
axes = pd.plotting.scatter_matrix(
    df[['assists', 'boosts', 'killPlace', 'kills', 'winPlacePerc']],
    marker='o', s=1, alpha=0.1)
fig = axes[0, 0].get_figure()
# Save before show so the file is written while the figure is still open.
fig.savefig('figures/Scatter Matrix.jpg', dpi = 300)
plt.show()

In [None]:
# Number of distinct values in the Id column of df.
n = len(pd.unique(df['Id']))
n

In [None]:
# Columns carried forward into the modelling frame. Identifier columns such as
# Id and matchId are left out; winPlacePerc is kept because it is the column
# later split off as the prediction target.
column_list = ['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills', 
               'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace', 'revives', 'rideDistance', 
               'roadKills', 'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints', 
               'winPlacePerc']

In [None]:
# Restrict df to the columns named in column_list.
ndf = df[column_list]
# Preview 10 random rows (unseeded, so the selection changes between runs).
ndf.sample(10)

In [None]:
ndf.shape

In [None]:
# Draw a 1% row sample of ndf; the fixed random_state makes the draw repeatable.
sdf = ndf.sample(round(0.01*len(ndf)), random_state=11)
# Persist the sample. to_csv also writes the row index by default, which is
# why the file gains an unnamed first column when it is read back.
sdf.to_csv('data/sample.csv')

In [5]:
# Reload the persisted sample from disk. The row index saved alongside the
# data comes back as an extra 'Unnamed: 0' column.
sdf = pd.read_csv('data/sample.csv')
sdf.head()

Unnamed: 0.1,Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,2069048,0,0,57.33,0,0,0,82,0,0,0,0.0,1382,squad,27,0,0.0,0,0.0,0,0,199.2,2,0,0.1538
1,25967,3,5,217.1,1,0,2,22,1231,1,1,141.8,2091,squad,26,0,2405.0,0,0.0,0,1,3711.0,5,1530,0.92
2,2896533,0,0,242.7,0,0,0,63,0,0,0,0.0,1361,squad-fpp,28,0,0.0,0,0.0,0,0,593.8,2,0,0.3704
3,3314281,0,0,100.0,1,0,0,31,0,1,1,13.73,1387,duo-fpp,48,0,0.0,0,11.3,0,0,1150.0,3,0,0.7234
4,2707511,0,0,63.21,0,0,0,81,0,0,0,0.0,1800,duo-fpp,49,0,0.0,0,0.0,0,0,94.5,2,0,0.1667


In [6]:
# Discard the leftover CSV index column. errors='ignore' keeps this a no-op
# when the column is absent, matching a boolean column mask.
nsdf = sdf.drop(columns='Unnamed: 0', errors='ignore')
nsdf.head()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,0,0,57.33,0,0,0,82,0,0,0,0.0,1382,squad,27,0,0.0,0,0.0,0,0,199.2,2,0,0.1538
1,3,5,217.1,1,0,2,22,1231,1,1,141.8,2091,squad,26,0,2405.0,0,0.0,0,1,3711.0,5,1530,0.92
2,0,0,242.7,0,0,0,63,0,0,0,0.0,1361,squad-fpp,28,0,0.0,0,0.0,0,0,593.8,2,0,0.3704
3,0,0,100.0,1,0,0,31,0,1,1,13.73,1387,duo-fpp,48,0,0.0,0,11.3,0,0,1150.0,3,0,0.7234
4,0,0,63.21,0,0,0,81,0,0,0,0.0,1800,duo-fpp,49,0,0.0,0,0.0,0,0,94.5,2,0,0.1667


In [7]:
len(nsdf)

44470

In [8]:
# Target vector and feature matrix (every column except the target).
y = nsdf['winPlacePerc']
X = nsdf.drop(columns='winPlacePerc')

# Feature groups, one per preprocessing strategy.
onehot_ftrs = ['matchType']
minmax_ftrs = ['killPlace', 'maxPlace']
std_ftrs = [
    'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
    'killPoints', 'kills', 'killStreaks', 'longestKill', 'matchDuration',
    'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
    'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
]

# One-hot encode the categorical column, min-max scale the two place columns
# and standardise the remaining numeric columns.
transformers = [
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), onehot_ftrs),
    ('minmax', MinMaxScaler(), minmax_ftrs),
    ('std', StandardScaler(), std_ftrs),
]
preprocessor = ColumnTransformer(transformers=transformers)

# Single-step pipeline; the step name 'preprocessor' is used to look the
# fitted transformer up again later.
prep = Pipeline([('preprocessor', preprocessor)])

In [9]:
# Baseline: predict the training-set median of the target for every test row
# and record the mean absolute error over 20 differently seeded 80/20 splits.
nr_states = 20
baseline_scores = np.zeros(nr_states)

for i in range(nr_states):
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 0.8,random_state=17*i)

    # The baseline never looks at the features, so the preprocessor is not
    # fitted here. The median is computed once per split and referenced as
    # np.median explicitly rather than through a bare `median` that only
    # exists when %pylab has populated the namespace.
    y_baseline = np.full(len(y_test), np.median(y_train))

    baseline_scores[i] = mean_absolute_error(y_test, y_baseline)

In [10]:
baseline_scores

array([0.26501269, 0.26663633, 0.26826089, 0.26715819, 0.26986714,
       0.26850037, 0.26622114, 0.26809875, 0.26789139, 0.26744017,
       0.26961851, 0.26684239, 0.27017157, 0.26992146, 0.26668542,
       0.26645011, 0.26664531, 0.26989961, 0.26537013, 0.26636761])

In [15]:
round(np.mean(baseline_scores), 3)

0.268

In [16]:
round(np.std(baseline_scores), 3)

0.002

In [17]:
# Random-forest evaluation: mean absolute error on the held-out 20% for each
# of 20 differently seeded splits.
nr_states = 20
test_scores = np.zeros(nr_states)

for i in range(nr_states):
    # The same seed drives both the split and the forest.
    seed = 17*i
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=seed)

    # Fit the preprocessing on the training part only, then apply it to both.
    X_train_prep = prep.fit_transform(X_train)
    X_test_prep = prep.transform(X_test)

    clf = RandomForestRegressor(n_estimators=600, max_depth=14, random_state=seed)
    clf.fit(X_train_prep, y_train)

    # Clamp predictions into [0, 1] before scoring.
    raw_pred = clf.predict(X_test_prep)
    y_test_pred = np.where(raw_pred > 1.0, 1.0,
                           np.where(raw_pred < 0.0, 0.0, raw_pred))
    test_scores[i] = mean_absolute_error(y_test, y_test_pred)

In [18]:
test_scores

array([0.06735226, 0.06546101, 0.06593442, 0.06642587, 0.06580476,
       0.06506561, 0.06533741, 0.06595017, 0.06704758, 0.06576422,
       0.0658573 , 0.06567997, 0.06545404, 0.06633509, 0.06522519,
       0.0652093 , 0.06564186, 0.06555016, 0.06582283, 0.06644428])

In [19]:
round(np.mean(test_scores), 3)

0.066

In [20]:
round(np.std(test_scores), 4)

0.0006

In [34]:
# Column names of the preprocessed matrix: the expanded one-hot names followed
# by the min-max and standardised feature names, in transformer order.
onehot_encoder = prep.named_steps['preprocessor'].named_transformers_['onehot']
features = list(onehot_encoder.get_feature_names(onehot_ftrs)) + minmax_ftrs + std_ftrs

# Preprocessed test features next to the true and predicted target.
# NOTE(review): X_test_prep, y_test and y_test_pred are whatever the kernel
# currently holds - as written, the values from the final iteration of the
# evaluation loop, assuming no other cell rebinds them.
pred_df = pd.DataFrame(X_test_prep, columns=features).assign(
    true=list(y_test),
    pred=y_test_pred,
)
pred_df.head()

Unnamed: 0,matchType_crashfpp,matchType_crashtpp,matchType_duo,matchType_duo-fpp,matchType_flarefpp,matchType_flaretpp,matchType_normal-duo-fpp,matchType_normal-solo-fpp,matchType_normal-squad,matchType_normal-squad-fpp,matchType_solo,matchType_solo-fpp,matchType_squad,matchType_squad-fpp,killPlace,maxPlace,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPoints,kills,killStreaks,longestKill,matchDuration,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,true,pred
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.69697,0.255102,-0.402246,-0.646003,-0.186635,0.286126,-0.362518,-0.51382,0.879563,-0.59526,-0.770012,-0.456577,1.457196,-0.356454,-0.405341,-0.045491,-0.154929,5.831737,11.088872,0.018756,1.798632,1.251398,0.3462,0.279884
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.27551,-0.402246,-0.646003,-0.105438,0.286126,1.192267,-0.51382,1.51554,0.039313,0.626926,1.92469,-0.805783,-0.356454,-0.32208,-0.045491,-0.154929,-0.142573,-0.083852,-0.50671,0.139726,1.148546,0.3214,0.414341
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.727273,0.265306,-0.402246,-0.067791,-0.557373,-0.576886,-0.362518,-0.140163,-0.799864,-0.59526,-0.770012,-0.456577,1.272148,-0.356454,-0.405341,-0.045491,-0.154929,-0.142573,-0.083852,-0.752,0.139726,-0.815124,0.2593,0.236705
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.343434,0.265306,-0.402246,-0.646003,0.331639,2.012152,-0.362518,-0.51382,-0.799864,0.039313,0.626926,2.910378,-1.148892,-0.356454,-0.405341,-0.045491,-0.154929,-0.142573,-0.083852,-0.519518,-0.275,-0.815124,0.2963,0.388867
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.171717,0.285714,1.281516,-0.646003,0.59711,1.149139,-0.362518,1.354464,-0.799864,0.673886,0.626926,-0.130227,1.017708,1.753212,2.921137,-0.045491,-0.154929,-0.142573,-0.083852,0.590062,0.969179,-0.815124,0.7241,0.732863


In [36]:
pred_df.to_csv('results/prediction.csv')