In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import lightgbm as lgbm
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
import gc

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [2]:
def means_for_teams(data):
    g_data = data.groupby(['groupId']).mean()
    print(g_data.shape)

    for i, groupId in zip(data.index, data['groupId']):
        data.iloc[i][g_data.columns] = g_data.loc[groupId]
    return data

# features for duo or squad mode, these features are valued 0 in solo mode games
def drop_team_features(data):
    team_features = ['assists', 'DBNOs', 'revives', 'teamkills']
    return data.drop(team_features, axis=1)

def xy(data):
    X = data.drop('winPlacePerc', axis = 1).select_dtypes(['number'])
    y = data['winPlacePerc']
    return X, y

def fill_rankPoints(input_data, model = LinearRegression(), dropWinKillPoints=True):
    work_data = input_data.copy()
    work_data = pd.DataFrame(work_data.select_dtypes(['number']))

    #drop the other point features, since these will not be useful in predicting rankPoints
    work_data.drop(['winPoints', 'killPoints'], axis=1, inplace=True)

    #train the model with data where there are rankPoints
    train = pd.DataFrame(work_data.loc[data['rankPoints'] > 0, :])
    X_train = train.drop('rankPoints', axis=1)
    y_train = train['rankPoints']
    model.fit(X_train, y_train)

    #use model to predict missing rankPoints
    use = pd.DataFrame(work_data.loc[data['rankPoints'] <= 0, :])
    X_use = use.drop('rankPoints', axis=1)
    y_use = model.predict(X_use)

    #fill in the missing data
    work_data.loc[work_data['rankPoints'] <= 0, 'rankPoints'] = y_use

    return work_data


def print_top_feature_correlations_to_target_by_matchType(data, nrows):
    matchTypes = data['matchType'].value_counts().index.tolist()
    matchCounts = data['matchType'].value_counts().values.tolist()

    print("-----------------------------------------------------")
    print("Highest correlations to target feature BY matchType")
    print("Number of games:", nrows, "\n")
    for cnt, mt in zip(matchCounts, matchTypes):
        # for each matchtype
        # look at all the rows for that matchtype
        # and build a correlation matrix
        corr = data \
            .drop('winPlacePerc', axis=1) \
            .loc[data['matchType'] == mt] \
            .corrwith(data.loc[data['matchType'] == mt]['winPlacePerc'])
        corr = corr.sort_values(ascending=False)
        print("Type:", mt)
        print("Nr of games:", cnt)
        print(corr.head())
        print()


def scoreSets(dfs, random_state=1):
    for i, df in enumerate(dfs):
        lr = LinearRegression()
        x, y = xy(df)
        xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=random_state)
        lr.fit(xtrain, ytrain)
        print("Score for set nr: ", i+1, lr.score(xtest, ytest))

def divide_by_matchType_and_aggregate_by_groupId(data):
    matchTypes = data['matchType'].value_counts().index.tolist()
    squadMatches = []
    duoMatches = []
    soloMatches = []
    otherMatches = []
    for match in matchTypes:
        if 'squad' in match or 'flare' in match:
            squadMatches.append(match)
        elif 'duo' in match or 'crash' in match:
            duoMatches.append(match)
        elif 'solo' in match:
            soloMatches.append(match)
        else:
            otherMatches.append(match)

    squadSet = pd.DataFrame(data.loc[data['matchType'].isin(squadMatches),:])
    duoSet = pd.DataFrame(data.loc[data['matchType'].isin(duoMatches),:])
    soloSet = pd.DataFrame(data.loc[data['matchType'].isin(soloMatches),:])

    squadMeanSet = pd.DataFrame(squadSet.groupby('groupId').mean(), dtype='float16')
    duoMeanSet = pd.DataFrame(duoSet.groupby('groupId').mean(), dtype='float16')

    squadSets = [squadSet, squadMeanSet]
    duoSets = [duoSet, duoMeanSet]

    #print("Returning [[squadSet, squadMeanSet], [duoSet, duoMeanSet], soloSet]")
    return [squadSets, duoSets, soloSet]


In [3]:
# assign dtypes to minimize memory usage
dtypes = {
    'Id': 'object',
    'groupId': 'object',
    'matchId': 'object',
    'assists': 'uint8',
    'boosts': 'uint8',
    'damageDealt': 'float16',
    'DBNOs': 'uint8',
    'headshotKills': 'uint8',
    'heals': 'uint8',
    'killPlace': 'uint8',
    'killPoints': 'uint16',
    'kills': 'uint8',
    'killStreaks': 'uint8',
    'longestKill': 'float16',
    'maxPlace': 'uint8',
    'numGroups': 'uint8',
    'revives': 'uint8',
    'rideDistance': 'float16',
    'roadKills': 'uint8',
    'swimDistance': 'float16',
    'teamKills': 'uint8',
    'vehicleDestroys': 'uint8',
    'walkDistance': 'float16',
    'weaponsAcquired': 'uint8',
    'winPoints': 'uint8',
    'winPlacePerc': 'float16'
}

In [61]:
train_data = pd.read_csv("train_V2.csv", dtype=dtypes)
test_data = pd.read_csv("test_V2.csv", dtype=dtypes)

print(train_data.loc[:, train_data.isnull().any()].columns)
print(test_data.loc[:, test_data.isnull().any()].columns)

train_data[train_data['winPlacePerc'].isnull()]
train_data = train_data.dropna(subset=['winPlacePerc'])
train_data[train_data['winPlacePerc'].isnull()]

Index(['winPlacePerc'], dtype='object')
Index([], dtype='object')


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc


In [5]:
# train_data - Training data with feature "winPlacePerc"
# test_data - Testing data without feature "winPlacePerc"
# models - models for solo, dua and squad prediction as list [model for solo, model for duo, model for squad]
# test_data_y - Testing data feature "winPlacePerc" values to find MSE, if not added, function returns only dataframe with predicted values

def modelMSE(train_data, test_data, models, test_data_y=None):

    train = train_data
    test = test_data
   
    trainSets = divide_by_matchType_and_aggregate_by_groupId(train)
    testSets = divide_by_matchType_and_aggregate_by_groupId(test)
    
    all_predictions = pd.DataFrame()
    for train, test, model in zip(trainSets, testSets, models):
        if len(train) == 2:
            trainset = train[0]
            meanTrainSet = train[1] #only numeric values, the index is 'groupId'
            testset = test[0]
            X_test = test[1]

            X_train = meanTrainSet.drop('winPlacePerc', axis=1)
            y_train = meanTrainSet.winPlacePerc

            model.fit(X_train, y_train)

            yhat = model.predict(X_test)

            #now I will rejoin the prediction with the original row index
            yhat = pd.DataFrame(yhat, index=X_test.index,columns=['prediction'])
            testset = testset.join(yhat, on='groupId')
            yhat = testset.drop(testset.columns.difference(['prediction']), axis=1)

        else:
            X_train = train.select_dtypes(['number']).drop('winPlacePerc', axis=1)
            y_train = train.winPlacePerc
            X_test = pd.DataFrame(test.select_dtypes(['number']))

            model.fit(X_train, y_train)
            yhat = model.predict(X_test)

            X_test['prediction'] = yhat
            yhat = pd.DataFrame(X_test['prediction'], index=X_test.index, dtype='float32')

        all_predictions = all_predictions.append(yhat)

    all_predictions.sort_index(inplace=True)

    

    if (test_data_y is None):
        return all_predictions, 0
    else:
        MSE = mean_squared_error(all_predictions,test_data_y)
        return all_predictions, MSE


### Linear Regression

In [73]:
#split the data
train_data_copy = train_data.copy()
train_data_copy.drop(['Id','groupId','matchId', 'matchType'],axis=1,inplace=True)
X=train_data_copy.drop(['winPlacePerc'],axis=1)
y=train_data_copy['winPlacePerc']

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10,random_state=420)


In [75]:
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)


LinearRegression()

In [76]:
predictions = linear_regression.predict(X_test)
predictions

array([0.11544423, 0.28815191, 0.66781507, ..., 0.69298808, 0.15335417,
       0.24849936])

In [77]:
mean_squared_error(y_test.values,predictions)

0.0161167674846259

### Linear Regression with placement for Solo, Duo and Squad games are found separately

In [87]:
solo_mod = LinearRegression()
duo_mod = LinearRegression()
squad_mod = LinearRegression()

models = [solo_mod, duo_mod, squad_mod]

train_d = train_data.copy()[:2223482]
test_d = train_data.copy()[2223482:-1]

test_data_y = test_d.winPlacePerc
test_d = test_d.drop('winPlacePerc', axis=1)

In [88]:
modelMSE(train_d, test_d, models, test_data_y)[1]

0.0114928745

###  Linear Regression Parameter Tuning 

In [150]:
params={'fit_intercept':[True,False],'normalize':[True,False]}

In [151]:
linear_regression = LinearRegression()
grid_search= GridSearchCV(linear_regression,params,cv=3,scoring='neg_mean_squared_error')

In [152]:
data = train_data.copy()
data.drop(['matchType','Id','groupId','matchId'],axis=1,inplace=True)
X=data.drop(['winPlacePerc'],axis=1)
y=data['winPlacePerc']

In [162]:
grid_search.fit(X[:1000000],y[:1000000])

GridSearchCV(cv=3, estimator=LinearRegression(),
             param_grid={'fit_intercept': [True, False],
                         'normalize': [True, False]},
             scoring='neg_mean_squared_error')

In [163]:
grid_search.best_params_

{'fit_intercept': True, 'normalize': False}

In [90]:
solo_mod = LinearRegression(fit_intercept = True, normalize = False, n_jobs = -1)
duo_mod = LinearRegression(fit_intercept = True, normalize = False, n_jobs = -1)
squad_mod = LinearRegression(fit_intercept = True, normalize = False, n_jobs = -1)

models = [solo_mod, duo_mod, squad_mod]

train_d = train_data.copy()[:2223482]
test_d = train_data.copy()[2223482:-1]

test_data_y = test_d.winPlacePerc
test_d = test_d.drop('winPlacePerc', axis=1)

In [91]:
modelMSE(train_d, test_d, models, test_data_y)[1]

0.0114928745

In [20]:
test_data_y

3500001    0.718750
3500002    0.208252
3500003    0.104187
3500004    0.911133
3500005    0.115417
             ...   
4446960    0.241455
4446961    0.178589
4446962    0.293457
4446963    0.481445
4446964    0.799805
Name: winPlacePerc, Length: 946964, dtype: float16

### Lasso

In [92]:
solo_mod = Lasso()
duo_mod = Lasso()
squad_mod = Lasso()

models = [solo_mod, duo_mod, squad_mod]

train_d = train_data.copy()[:2223482]
test_d = train_data.copy()[2223482:-1]

test_data_y = test_d.winPlacePerc
test_d = test_d.drop('winPlacePerc', axis=1)

In [93]:
modelMSE(train_d, test_d, models, test_data_y)[1]

0.020356419178780123

### Lasso Parameter Tuning

In [172]:
params={'alpha':[1,0.1,0.001,0.0001], 'normalize':[True,False]}

In [173]:
lasso_regression = Lasso()
grid_search= GridSearchCV(lasso_regression,params,cv=3,scoring='neg_mean_squared_error')

In [174]:
data = train_data.copy()
data.drop(['matchType','Id','groupId','matchId'],axis=1,inplace=True)
X=data.drop(['winPlacePerc'],axis=1)
y=data['winPlacePerc']

In [175]:
grid_search.fit(X[:1000000],y[:1000000])

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(cv=3, estimator=Lasso(),
             param_grid={'alpha': [1, 0.1, 0.001, 0.0001],
                         'normalize': [True, False]},
             scoring='neg_mean_squared_error')

In [176]:
grid_search.best_params_

{'alpha': 0.0001, 'normalize': False}

In [94]:
solo_mod = Lasso(fit_intercept = True, normalize = False, alpha = 0.0001)
duo_mod = Lasso(fit_intercept = True, normalize = False, alpha = 0.0001)
squad_mod = Lasso(fit_intercept = True, normalize = False, alpha = 0.0001)

models = [solo_mod, duo_mod, squad_mod]

train_d = train_data.copy()[:2223482]
test_d = train_data.copy()[2223482:-1]

test_data_y = test_d.winPlacePerc
test_d = test_d.drop('winPlacePerc', axis=1)

In [95]:
modelMSE(train_d, test_d, models, test_data_y)[1]

0.011497869267820248

### Ridge

In [96]:
solo_mod = Ridge()
duo_mod = Ridge()
squad_mod = Ridge()

models = [solo_mod, duo_mod, squad_mod]

train_d = train_data.copy()[:2223482]
test_d = train_data.copy()[2223482:-1]

test_data_y = test_d.winPlacePerc
test_d = test_d.drop('winPlacePerc', axis=1)

In [97]:
modelMSE(train_d, test_d, models, test_data_y)[1]


0.011492588506364029

### Ridge Parameter Tuning

In [219]:
params={'alpha':[1,0.1,0.001,0.0001], 'solver':['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], 'normalize':[True,False], 'tol':[0.1,0.01, 0.001]}

In [220]:
ridge_regression = Ridge()
grid_search= GridSearchCV(ridge_regression,params,cv=3,scoring='neg_mean_squared_error')

In [221]:
data = train_data.copy()
data.drop(['matchType','Id','groupId','matchId'],axis=1,inplace=True)
X=data.drop(['winPlacePerc'],axis=1)
y=data['winPlacePerc']

In [222]:
grid_search.fit(X[:1000000],y[:1000000])

GridSearchCV(cv=3, estimator=Ridge(),
             param_grid={'alpha': [1, 0.1, 0.001, 0.0001],
                         'normalize': [True, False], 'solver': ['auto', 'svd'],
                         'tol': [0.1, 0.01, 0.001]},
             scoring='neg_mean_squared_error')

In [223]:
grid_search.best_params_

{'alpha': 1, 'normalize': False, 'solver': 'auto', 'tol': 0.1}

In [98]:
solo_mod = Ridge(fit_intercept = True, normalize = False, alpha = 1, tol =  0.01, solver = 'auto')
duo_mod = Ridge(fit_intercept = True, normalize = False, alpha = 1, tol =  0.01, solver = 'auto')
squad_mod = Ridge(fit_intercept = True, normalize = False, alpha = 1, tol = 0.01, solver = 'auto')

models = [solo_mod, duo_mod, squad_mod]

train_d = train_data.copy()[:2223482]
test_d = train_data.copy()[2223482:-1]

test_data_y = test_d.winPlacePerc
test_d = test_d.drop('winPlacePerc', axis=1)

In [226]:
modelMSE(train_d, test_d, models, test_data_y)[1]

0.013325663065541424

### Random Forest Regressor

In [99]:
#solo_mod = RandomForestRegressor(n_estimators=10,  min_samples_split=2, min_samples_leaf = 1, max_features='sqrt', max_depth = None, bootstrap = False,  n_jobs=-1, verbose=1) #max_features=0.5,
#duo_mod = RandomForestRegressor(n_estimators=10,  min_samples_split=2, min_samples_leaf = 1, max_features='sqrt', max_depth = None, bootstrap = False,  n_jobs=-1, verbose=1)
#squad_mod = RandomForestRegressor(n_estimators=10,  min_samples_split=2, min_samples_leaf = 1, max_features='sqrt', max_depth = None, bootstrap = False,  n_jobs=-1, verbose=1)

solo_mod  = RandomForestRegressor(n_estimators=120, max_features=0.5, min_samples_leaf=3, n_jobs=-1)
duo_mod  = RandomForestRegressor(n_estimators=120, max_features=0.5, min_samples_leaf=3, n_jobs=-1)
squad_mod = RandomForestRegressor(n_estimators=120, max_features=0.5, min_samples_leaf=3, n_jobs=-1)
 
models = [solo_mod, duo_mod, squad_mod]

train_d = train_data.copy()[:2223482]
test_d = train_data.copy()[2223482:-1]

test_data_y = test_d.winPlacePerc
test_d = test_d.drop('winPlacePerc', axis=1)

In [100]:
modelMSE(train_d, test_d, models, test_data_y)[1]

(         prediction
 2223482    0.894972
 2223483    0.628783
 2223484    0.694081
 2223485    0.940858
 2223486    0.072896
 ...             ...
 4446960    0.258654
 4446961    0.387337
 4446962    0.319424
 4446963    0.393011
 4446964    0.865053
 
 [2223482 rows x 1 columns],
 0.0060317956063010575)

### Random Forest Regressor Parameter Tuning

In [11]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 150, num = 10)]
# Number of features to consider at every split
max_features = ['sqrt', '0.5']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 50, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 4, 6]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [14]:
data = train_data.copy()
data.drop(['matchType','Id','groupId','matchId'],axis=1,inplace=True)
X=data.drop(['winPlacePerc'],axis=1)
y=data['winPlacePerc']

train_features = X[:100000]
train_labels = y[:100000]

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  47 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  8.4min finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 14, 18, 23, 27, 32,
                                                      36, 41, 45, 50, None],
                                        'max_features': ['sqrt', '0.5'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 4, 6],
                                        'n_estimators': [1, 17, 34, 50, 67, 83,
                                                         100, 116, 133, 150]},
                   random_state=42, verbose=2)

In [15]:
rf_random.best_params_

#1000- {'n_estimators': 155,'min_samples_split': 2,'min_samples_leaf': 1,'max_features': 'auto','max_depth': 32,'bootstrap': True}

{'n_estimators': 150,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [102]:
solo_mod  = RandomForestRegressor(n_estimators=150, max_features='sqrt',min_samples_split = 4, min_samples_leaf=3, n_jobs=-1)
duo_mod  = RandomForestRegressor(n_estimators=150, max_features='sqrt', min_samples_split = 4, min_samples_leaf=3, n_jobs=-1)
squad_mod = RandomForestRegressor(n_estimators=150, max_features='sqrt',min_samples_split = 4, min_samples_leaf=3, n_jobs=-1)
 
models = [solo_mod, duo_mod, squad_mod]

train_d = train_data.copy()[:2223482]
test_d = train_data.copy()[2223482:-1]

test_data_y = test_d.winPlacePerc
test_d = test_d.drop('winPlacePerc', axis=1)

In [103]:
modelMSE(train_d, test_d, models, test_data_y)[1]

0.006475478893843346

### LightGBM

In [101]:
solo_mod = lgbm.LGBMRegressor(learning_rate = 0.05,objective = "mae",metric = "mae",num_leaves =  128,verbose =  1,random_state = 42,bagging_fraction =  0.7,feature_fraction =  0.7, n_estimators=100)
duo_mod = lgbm.LGBMRegressor(learning_rate = 0.05,objective = "mae",metric = "mae",num_leaves =  128,verbose =  1,random_state = 42,bagging_fraction =  0.7,feature_fraction =  0.7, n_estimators=100)
squad_mod = lgbm.LGBMRegressor(learning_rate = 0.05,objective = "mae",metric = "mae",num_leaves =  128,verbose =  1,random_state = 42,bagging_fraction =  0.7,feature_fraction =  0.7, n_estimators=100)

models = [solo_mod, duo_mod, squad_mod]

train_d = train_data.copy()[:2223482]
test_d = train_data.copy()[2223482:-1]

test_data_y = test_d.winPlacePerc
test_d = test_d.drop('winPlacePerc', axis=1)

In [31]:
modelMSE(train_d, test_d, models, test_data_y)[1]

(         prediction
 4000001    0.045943
 4000002    0.912115
 4000003    0.314285
 4000004    0.030677
 4000005    0.043063
 ...             ...
 4446960    0.384135
 4446961    0.244435
 4446962    0.319853
 4446963    0.401720
 4446964    0.926213
 
 [446964 rows x 1 columns],
 0.007930814029814782)

### LightGBM Parameter Tuning

In [None]:
#LinearRegression # Ridge # Lasso # RandomForestRegressor #
#lgbreg = lgb.LGBMRegressor(learning_rate = 0.05,objective = "mae",metric = "mae",num_leaves =  128,verbose =  1,random_state = 42,bagging_fraction =  0.7,feature_fraction =  0.7, n_estimators=100)
#solo_mod = RandomForestRegressor(n_estimators=120, max_features=0.5, min_samples_leaf=3, n_jobs=-1)
#duo_mod = RandomForestRegressor(n_estimators=120, max_features=0.5, min_samples_leaf=3, n_jobs=-1)
#squad_mod = RandomForestRegressor(n_estimators=120, max_features=0.5, min_samples_leaf=3, n_jobs=-1)