In [48]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
%cd /content/drive/MyDrive/RPI/Intro to ML/PUBG_Placement_Prediction/
%ls

/content/drive/MyDrive/RPI/Intro to ML/PUBG_Placement_Prediction
 Final_Project_Report_HaolinXiong_PUBG.gdoc
 final_testing_data.csv
 final_training_data.csv
'PlayerUnknown’s Battle Ground  Placement Prediction.gslides'
 Project_Report-1_HaolinXiong_PUBG.gdoc
'Pubg Game Prediction.txt'
 test_V2.csv
 train_sample.csv
 train_V2.csv


# Loading Dataset

In [51]:
df = pd.read_csv('final_training_data.csv')

In [52]:
cor = df.corr()

In [53]:
#rankPoints are deprecated so dropped
df.drop('rankPoints',axis=1,inplace=True)

In [54]:
df.shape

(3334272, 29)

**Simple Linear Regression**

In [55]:
df.columns

Index(['Unnamed: 0', 'Id', 'groupId', 'matchId', 'assists', 'boosts',
       'damageDealt', 'DBNOs', 'headshotKills', 'heals', 'killPlace',
       'killPoints', 'kills', 'killStreaks', 'longestKill', 'matchDuration',
       'matchType', 'maxPlace', 'numGroups', 'revives', 'rideDistance',
       'roadKills', 'swimDistance', 'teamKills', 'vehicleDestroys',
       'walkDistance', 'weaponsAcquired', 'winPoints', 'winPlacePerc'],
      dtype='object')

In [56]:
abs(cor['winPlacePerc']).sort_values(ascending=False)

winPlacePerc       1.000000
walkDistance       0.810636
killPlace          0.718968
boosts             0.634322
weaponsAcquired    0.581914
damageDealt        0.439718
heals              0.427799
kills              0.419019
longestKill        0.409532
killStreaks        0.377322
rideDistance       0.343153
assists            0.298705
DBNOs              0.279267
headshotKills      0.277717
revives            0.240311
swimDistance       0.149434
vehicleDestroys    0.073312
numGroups          0.039583
maxPlace           0.037451
roadKills          0.034842
teamKills          0.015818
killPoints         0.013309
rankPoints         0.013111
winPoints          0.007471
matchDuration      0.005017
Unnamed: 0         0.000159
Name: winPlacePerc, dtype: float64

In [57]:
corr_list = list(abs(cor['winPlacePerc'].drop('winPlacePerc',axis=0)).sort_values(ascending=False).keys())

In [58]:
corr_list

['walkDistance',
 'killPlace',
 'boosts',
 'weaponsAcquired',
 'damageDealt',
 'heals',
 'kills',
 'longestKill',
 'killStreaks',
 'rideDistance',
 'assists',
 'DBNOs',
 'headshotKills',
 'revives',
 'swimDistance',
 'vehicleDestroys',
 'numGroups',
 'maxPlace',
 'roadKills',
 'teamKills',
 'killPoints',
 'rankPoints',
 'winPoints',
 'matchDuration',
 'Unnamed: 0']

In [59]:
df['winPlacePerc'] = df['winPlacePerc'].fillna(0)

In [60]:
df.shape

(3334272, 29)

In [61]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# **Deal With Cheaters**

All teams with at least one player with >20 kills are removed

3334272 original

3333169 players left

In [62]:
df['kills'].describe()

count    3.334272e+06
mean     9.252910e-01
std      1.562194e+00
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.000000e+00
max      7.200000e+01
Name: kills, dtype: float64

In [63]:
true_df = df.copy()

In [64]:
cheater_team = df.loc[df['kills']>20]

In [65]:
cheater_team

Unnamed: 0.1,Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
1907,1907,7711e1d39407da,a9fbc63469296f,45fc89f9b11eaf,1,4,2325.0,13,8,2,1,1142,21,2,192.10,1376,duo-fpp,49,46,1,2012.0,0,0.0,0,0,2697.0,5,1505,1.0000
28177,28177,44513e16340a63,c1e3f56e09b9a4,cf5dd782577ea9,1,2,2767.0,24,17,1,1,1355,22,3,670.90,1877,squad,27,26,0,2589.0,0,0.0,0,0,3290.0,7,1509,0.9615
30094,30094,8d8d78ba4e9b55,6634da8e3940de,8a728def0644be,3,0,1988.0,22,6,5,5,1000,22,3,103.10,1198,normal-squad-fpp,14,14,1,1668.0,0,0.0,0,0,1864.0,33,1500,1.0000
40129,40129,f7a740ffbc2c43,418f6120b2e193,f2614050e9046a,2,0,1906.0,0,4,19,2,0,22,3,124.00,998,normal-duo-fpp,7,7,0,0.0,0,0.0,0,0,116.7,16,0,1.0000
43197,43197,367599b150318a,0370e2f3be345b,5f2f9fcb9705af,6,0,2459.0,0,3,0,2,0,23,3,100.00,910,normal-duo-fpp,7,6,0,0.0,0,0.0,0,0,208.5,12,0,0.8333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3292152,3292152,310cdfbe701f03,02b0ccfda16b93,f755207f23cf88,2,4,2645.0,16,8,0,1,0,24,5,436.70,1319,squad,27,27,0,0.0,0,0.0,0,0,3180.0,7,0,1.0000
3298784,3298784,8800992f7dd530,87e52e1a6b0a73,1e573980b3ee3c,1,0,1999.0,0,1,8,5,1000,26,3,33.05,1559,normal-squad-fpp,7,7,0,0.0,0,0.0,0,0,111.8,59,1500,0.3333
3301038,3301038,f69c34cce4d46a,ea800ea1aa6efa,ed092b499ac27d,3,0,2817.0,0,6,2,3,1000,28,4,81.38,1059,normal-squad-fpp,6,6,0,0.0,0,0.0,0,0,1147.0,31,1500,0.4000
3314427,3314427,0a2721fcf1cdd9,1708576031cf96,fcd20a43cb81ea,6,0,3297.0,0,5,4,1,0,30,5,203.40,896,normal-solo-fpp,20,13,0,0.0,0,0.0,0,0,253.0,32,0,1.0000


In [66]:
#drop the cheaters' team
cheater_team_id = list(cheater_team['groupId'])
true_df['cheater'] = df['groupId'].apply(lambda x: x in cheater_team_id)
true_df.drop(true_df.loc[true_df['cheater']].index,inplace=True)
true_df.reset_index(inplace=True)

In [20]:
len(df)

3334272

In [21]:
len(true_df)

3333169

# Implement Random Forest, XGBoost, KNN, SVM

Train Test Split


In [67]:
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold

In [68]:
target_label = ['winPlacePerc']
feature_label = corr_list[0:9]
X_train = pd.DataFrame(true_df,columns= feature_label)
y_train = pd.DataFrame(true_df,columns= target_label)

In [24]:
X_train.shape, y_train.shape

((3333169, 9), (3333169, 1))

In [25]:
df_test = pd.read_csv('final_testing_data.csv')
df_test['winPlacePerc'] = df_test['winPlacePerc'].fillna(0)

In [26]:
X_test = pd.DataFrame(df_test,columns= feature_label)
y_test = pd.DataFrame(df_test,columns= target_label)

In [27]:
X_test.shape, y_test.shape

((1112694, 9), (1112694, 1))

In [31]:
cheater_team_test = df_test.loc[df_test['kills']>20]
cheater_team_id_test = list(cheater_team_test['groupId'])
df_test['cheater'] = df_test['groupId'].apply(lambda x: x in cheater_team_id_test)
cheater_index = np.array((df_test.loc[df_test['cheater']]).index)

Random Forest Regressor

In [32]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
def my_Random_Forest(X_train, X_test, y_train, y_test):
  from sklearn.ensemble import RandomForestRegressor
  forest = RandomForestRegressor(n_estimators = 100, max_depth=5, random_state = 0)
  forest.fit(X_train, y_train)
  y_pred = forest.predict(X_test)
  y_pred[cheater_index] = 1.0
  print('Random Forest MAE:', mean_absolute_error(y_test, y_pred))
  forest_importance = forest.feature_importances_
  for i in range(len(forest_importance)):
    print(feature_label[i],':',forest_importance[i])

In [None]:
my_Random_Forest(X_train, X_test, y_train, y_test)

  after removing the cwd from sys.path.


Random Forest MAE: 0.08764260715216278
walkDistance : 0.8102502153759079
killPlace : 0.1780354167236248
boosts : 0.003448885825669678
weaponsAcquired : 0.0
damageDealt : 0.0
heals : 0.0
kills : 0.005357766057693978
longestKill : 0.0
killStreaks : 0.002907716017103553


XGBoost Regression

In [None]:
from sklearn.preprocessing import StandardScaler
data = [[0, 0], [0, 0], [1, 1], [1, 1]]
scaler = StandardScaler()

In [46]:
def my_xgb(X_train, X_test, y_train, y_test):
  import xgboost as xg
  from sklearn.model_selection import KFold
  from sklearn.model_selection import cross_val_score
  from sklearn.preprocessing import StandardScaler
  scaler = StandardScaler()
  X_train_std = scaler.fit_transform(X_train.copy())
  param = {"booster":"gblinear", "objective":"reg:squarederror"}
  xgb_reg = xg.XGBRegressor(booster = "gblinear",objective = "reg:squarederror",n_estimators = 100,max_depth=5)
  xgb_reg.fit(X_train_std,y_train)
  print(xgb_reg.coef_)
  y_pred = xgb_reg.predict(scaler.fit_transform(X_test.copy()))
  y_pred[cheater_index] = 1.0
  print('XGBoost MAE:', mean_absolute_error(y_test, y_pred))
  # xgb_importance = xgb_reg.feature_importances_
  # for i in range(len(xgb_importance)):
  #   print(feature_label[i],':',xgb_importance[i])

In [47]:
my_xgb(X_train, X_test, y_train, y_test)

[ 0.0753954  -0.056491    0.0386121   0.041611    0.00856005  0.0182801
  0.00325765  0.0125268  -0.00086056]
XGBoost MAE: 0.1417668324814438


KNN Regressor with 5-fold cross validation

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
def KNN_CV(X_train, y_train):
  knn_benchmark = list()
  for i in range(11,41,2):
    knn_reg = KNeighborsRegressor(n_neighbors = i)
    scores = cross_val_score(knn_reg,X_train,y_train,cv = 5,scoring='neg_mean_absolute_error')
    knn_benchmark.append(scores.mean())
  #from plot, k = 19 is the best
  plt.plot(range(11,41,2),knn_benchmark)
  plt.xlabel('K')
  plt.ylabel('MAE')
  print('k = 19, MAE=',knn_benchmark[4]*-1)

In [None]:
# KNN_CV(X_train, y_train)

In [None]:
def my_KNN(X_train, X_test, y_train, y_test):
  from sklearn.preprocessing import StandardScaler
  standardized_X_train = StandardScaler().fit_transform(X_train)
  knn_reg = KNeighborsRegressor(n_neighbors = 19)
  knn_reg.fit(standardized_X_train, y_train)
  standardized_X_test = StandardScaler().fit_transform(X_test)
  y_pred = knn_reg.predict(standardized_X_test)
  y_pred[cheater_index] = 1.0
  print('KNN MAE:', mean_absolute_error(y_test, y_pred))

In [None]:
my_KNN(X_train, X_test, y_train, y_test)

KNN MAE: 0.07760437645740403


SVM Regression (SVR) failed since it takes too much time

In [None]:
def my_SVM(X_train, X_test, y_train, y_test):
  from sklearn.preprocessing import StandardScaler
  from sklearn.svm import SVR
  standardized_X_train = StandardScaler().fit_transform(X_train)
  standardized_X_test = StandardScaler().fit_transform(X_test)
  svr_reg = SVR(kernel = 'rbf')
  svr_reg.fit(standardized_X_train, y_train)
  y_pred = svr_reg.predict(standardized_X_test)
  y_pred[cheater_index] = 1.0
  print('SVR MAE:', mean_absolute_error(y_test, y_pred))

In [None]:
# my_SVM(X_train, X_test, y_train, y_test)

Neutral Network

In [None]:
X_train.shape

(3333169, 9)

In [None]:
def my_neutral_network(X_train, X_test, y_train, y_test):
  import tensorflow as tf
  from keras.models import Sequential
  from keras.layers import Dense
  from sklearn.preprocessing import StandardScaler
  standardized_X_train = StandardScaler().fit_transform(X_train)
  standardized_X_test = StandardScaler().fit_transform(X_test)
  my_neutral_net = Sequential()

  my_neutral_net.add(Dense(8, activation='relu', input_shape=(9,)))
  my_neutral_net.add(Dense(8, activation='relu'))
  my_neutral_net.add(Dense(1, activation='sigmoid'))

  my_neutral_net.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['mean_absolute_error'])
  my_neutral_net.fit(standardized_X_train, y_train,epochs=10, batch_size=64)
  y_pred = my_neutral_net.predict(standardized_X_test)
  y_pred[cheater_index] = 1.0
  print('Neural Net MAE:', mean_absolute_error(y_test, y_pred))

In [None]:
my_neutral_network(X_train, X_test, y_train, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Net MAE: 0.07438011258221001
