In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
import gc
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
%matplotlib inline
warnings.filterwarnings("ignore")

In [0]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds them.

    Signed-integer columns are narrowed to int8/int16/int32 when their observed
    minimum and maximum fit (bounds inclusive). Floating-point columns are
    narrowed to float16 or float32 by the same range test and otherwise stored
    as float64. Every other column (object/string, bool, unsigned integer,
    datetime, categorical, nullable extension dtypes, ...) is left untouched.

    The float test looks only at the value range, not at precision, so
    narrowing to float16/float32 can round the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned, so the caller's object
        is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy signed-int and float columns are downcast. Bool and
        # unsigned columns must not fall through to the float branch, and
        # pandas extension dtypes do not support the comparisons below.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            # Narrowest first; a column that fits none of these stays as it is.
            for int_type in (np.int8, np.int16, np.int32):
                limits = np.iinfo(int_type)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            # NaN or infinite extremes fail every comparison and land in the
            # float64 fallback.
            for float_type in (np.float16, np.float32):
                limits = np.finfo(float_type)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(float_type)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [40]:
# Load the training rows, keeping only those whose match has more than one
# placement slot and whose target value is present.
train = pd.read_csv('train_V2.csv')
keep_rows = (train['maxPlace'] > 1) & train['winPlacePerc'].notnull()
train = reduce_mem_usage(train[keep_rows])

# The test file is loaded unfiltered so every row receives a prediction.
test = reduce_mem_usage(pd.read_csv('test_V2.csv'))
print(train.shape, test.shape)

Memory usage of dataframe is 1017.83 MB
Memory usage after optimization is: 322.31 MB
Decreased by 68.3%
Memory usage of dataframe is 413.18 MB
Memory usage after optimization is: 121.74 MB
Decreased by 70.5%
(4446965, 29) (1934174, 28)


In [0]:
# Both frames in one list so the feature-engineering loops below apply the
# same steps to train and test.
alldata = [train, test]



---


## Feature Engineering

### 추가한 변수들
* 매치에 참가한 사람 수 
* 힐+부스트
* 전체 움직인 거리 (걸어서, 차량 타고, 수영 통틀어; 차량 이동 거리는 0.25배로 반영)
* 시간 대비 움직인 거리
* 움직인 거리 대비 킬 수
* 헤드샷 비율
* 움직인 거리 대비 무기 획득 수
* 헤드샷, 로드킬, 차량 파괴 모두 어느 정도의 실력 필요 -> skill

In [0]:
for data in alldata:
  # Number of rows that share each matchId.
  data['playersJoined'] = data.groupby('matchId')['matchId'].transform('count')
  data['healsAndBoosts'] = data['heals'] + data['boosts']
  # Ride distance enters the total at a quarter of its value.
  data['totalDistance'] = data['walkDistance'] + 0.25 * data['rideDistance'] + data['swimDistance']
  data['distancePerDuration'] = data['totalDistance'] / data['matchDuration']
  # The +1 in the denominator keeps the ratio finite for zero distance.
  # fillna is applied to the expression and assigned back: a chained
  # `data[col].fillna(..., inplace=True)` works on a temporary and does not
  # update the frame when pandas Copy-on-Write is active.
  data['killsPerDistance'] = (data['kills'] / (data['totalDistance'] + 1)).fillna(0)
  # These two ratios are NaN/inf for rows with zero kills; they are not
  # cleaned here.
  data['headshotKillRate'] = data['headshotKills'] / data['kills']
  data['killStreakRate'] = data['killStreaks'] / data['kills']
  data['weaponsPerDistance'] = (data['weaponsAcquired'] / (data['totalDistance'] + 1)).fillna(0)
  data['skill'] = data['headshotKills'] + data['roadKills'] + data['vehicleDestroys']

In [43]:
# matchType strings in the order of their integer codes (1-based).
match_type_names = [
    'crashfpp', 'crashtpp',
    'duo', 'duo-fpp',
    'flarefpp', 'flaretpp',
    'normal-duo', 'normal-duo-fpp',
    'normal-solo', 'normal-solo-fpp',
    'normal-squad', 'normal-squad-fpp',
    'solo', 'solo-fpp',
    'squad', 'squad-fpp',
]
match_type_codes = {name: code for code, name in enumerate(match_type_names, start=1)}

for data in alldata:
  # Strings missing from the table map to NaN.
  data['matchType'] = data['matchType'].map(match_type_codes)
  # reduce_mem_usage reassigns the columns of the frame it is given, so the
  # return value does not need to be kept.
  reduce_mem_usage(data)

Memory usage of dataframe is 483.47 MB
Memory usage after optimization is: 356.24 MB
Decreased by 26.3%
Memory usage of dataframe is 191.84 MB
Memory usage after optimization is: 136.50 MB
Decreased by 28.8%


In [0]:
for data in alldata:
  # A rankPoints value of -1 is replaced by 0 before being summed.
  unranked = data['rankPoints'] == -1
  data.loc[unranked, 'rankPoints'] = 0
  data['points'] = data['rankPoints'] + data['killPoints']

In [0]:
# (source column, new column): percentile rank of the source within its match.
percentile_columns = (
    ('kills', 'killsPerc'),
    ('killPlace', 'killPlacePerc'),
    ('walkDistance', 'walkDistancePerc'),
)

for data in alldata:
  match = data.groupby('matchId')
  for source_col, perc_col in percentile_columns:
    # .values assigns by position, bypassing index alignment.
    data[perc_col] = match[source_col].rank(pct=True).values

In [46]:
for data in alldata:
  # Turn +/-inf (from the ratio features) into NaN, then zero-fill every NaN.
  # Uses np.inf / np.nan: the aliases np.Inf, np.NINF and np.NaN were removed
  # in NumPy 2.0.
  data.replace([np.inf, -np.inf], np.nan, inplace=True)
  data.fillna(0, inplace=True)

print(train.shape, test.shape)

(4446965, 42) (1934174, 41)


#### Group & Match
* 개인 변수뿐 아니라 그룹/매치 변수가 중요하다.
* 각 변수에 대해 그룹/매치 별 평균값 데이터 구하기

In [0]:
# Columns that are not aggregated per group: identifiers, match-level fields
# and the target.
excl_col = ['Id','matchId','groupId','playersJoined','matchType','winPlacePerc']
features = train.columns.tolist()
for c in excl_col:
    # list.index raises ValueError if an expected column is absent.
    del features[features.index(c)]

In [48]:
def meandata(data):
  """Append per-group mean features and their in-match percentile ranks.

  For every (matchId, groupId) pair the mean of each column in the
  module-level ``features`` list is computed. Those group means are then
  percentile-ranked within each match. Both tables are left-merged back onto
  ``data``; overlapping column names receive the suffixes ``_mean`` and
  ``_meanRank`` respectively. A fixed set of the new columns is dropped
  afterwards.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame containing ``matchId``, ``groupId`` and every column in ``features``.

  Returns
  -------
  pandas.DataFrame
      A new merged frame, passed through ``reduce_mem_usage``.
  """
  meanData = data.groupby(['matchId','groupId'])[features].agg('mean')
  meanData = reduce_mem_usage(meanData)
  # -np.inf instead of np.NINF: that alias was removed in NumPy 2.0.
  meanData = meanData.replace([np.inf, -np.inf, np.nan], 0)
  # Percentile rank of each group's mean among the groups of the same match.
  meanDataRank = meanData.groupby('matchId')[features].rank(pct=True).reset_index()
  meanDataRank = reduce_mem_usage(meanDataRank)
  data = pd.merge(data, meanData.reset_index(), suffixes=["", "_mean"], how='left', on=['matchId', 'groupId'])
  # Free each aggregate as soon as it has been merged.
  del meanData
  gc.collect()
  data.drop(["vehicleDestroys_mean","rideDistance_mean","roadKills_mean","rankPoints_mean"], axis=1, inplace=True)
  data = pd.merge(data, meanDataRank, suffixes=["", "_meanRank"], how='left', on=['matchId', 'groupId'])
  del meanDataRank
  gc.collect()
  data.drop(["numGroups_meanRank","rankPoints_meanRank"], axis=1, inplace=True)
  data = reduce_mem_usage(data)
  return data

train = meandata(train)
test = meandata(test)

Memory usage of dataframe is 460.39 MB
Memory usage after optimization is: 170.46 MB
Decreased by 63.0%
Memory usage of dataframe is 587.59 MB
Memory usage after optimization is: 170.09 MB
Decreased by 71.1%
Memory usage of dataframe is 1026.31 MB
Memory usage after optimization is: 949.97 MB
Decreased by 7.4%
Memory usage of dataframe is 199.62 MB
Memory usage after optimization is: 72.84 MB
Decreased by 63.5%
Memory usage of dataframe is 256.94 MB
Memory usage after optimization is: 74.38 MB
Decreased by 71.1%
Memory usage of dataframe is 442.70 MB
Memory usage after optimization is: 409.49 MB
Decreased by 7.5%


In [49]:
def groupsize(data):
  """Return ``data`` with a ``group_size`` column: rows per (matchId, groupId)."""
  keys = ['matchId', 'groupId']
  counts = data.groupby(keys).size().to_frame('group_size').reset_index()
  counts = reduce_mem_usage(counts)
  merged = data.merge(counts, how='left', on=keys)
  # Release the helper table before returning.
  del counts
  gc.collect()
  return merged

train = groupsize(train)
test = groupsize(test)

Memory usage of dataframe is 46.39 MB
Memory usage after optimization is: 32.86 MB
Decreased by 29.2%
Memory usage of dataframe is 20.28 MB
Memory usage after optimization is: 14.37 MB
Decreased by 29.2%


In [0]:
# The first 41 columns of `test` are the columns it had before the group-level
# merges (its shape was (.., 41) at that point, and a left merge keeps the
# left frame's columns first).
matchMeanFeatures = list(test.columns)[:41]
# The string identifiers Id / groupId / matchId are excluded explicitly:
# a mean over them is not defined, and pandas >= 2.0 raises instead of
# silently dropping such columns.
excl_col = ['Id','groupId','matchId','killPlacePerc','matchDuration','maxPlace','numGroups']
for c in excl_col:
  matchMeanFeatures.remove(c)

In [58]:
def matchdata(data):
  """Append per-match mean features to ``data``.

  The mean of every column in the module-level ``matchMeanFeatures`` list is
  computed per ``matchId`` and left-merged back onto ``data``; overlapping
  column names receive the suffix ``_matchMean``.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame containing ``matchId`` and the columns in ``matchMeanFeatures``.

  Returns
  -------
  pandas.DataFrame
      A new merged frame.
  """
  meanData = data.groupby(['matchId'])[matchMeanFeatures].agg('mean')
  meanData = reduce_mem_usage(meanData)
  # -np.inf instead of np.NINF: that alias was removed in NumPy 2.0.
  meanData = meanData.replace([np.inf, -np.inf, np.nan], 0)
  data = pd.merge(data, meanData.reset_index(), suffixes=["", "_matchMean"], how='left', on=['matchId'])
  del meanData
  gc.collect()
  return data

train = matchdata(train)
test = matchdata(test)

Memory usage of dataframe is 9.24 MB
Memory usage after optimization is: 3.48 MB
Decreased by 62.4%
Memory usage of dataframe is 3.96 MB
Memory usage after optimization is: 1.49 MB
Decreased by 62.4%


In [0]:
# Columns removed from both frames before modelling.
drop_cols = [
    'boosts', 'heals', 'headshotKills', 'roadKills', 'vehicleDestroys',
    'killStreaks', 'rideDistance', 'swimDistance', 'matchDuration',
    'maxPlace', 'numGroups', 'Id', 'groupId',
]
for frame in (train, test):
    frame.drop(columns=drop_cols, inplace=True)

In [61]:
# Sanity check: train should have exactly one more column (the target) than test.
print(train.shape, test.shape)

(4446965, 130) (1934174, 129)


In [0]:
# Model input columns: everything left in `test` except the match identifier.
final_features = test.columns.drop('matchId')

### Train & Predict

In [0]:
def split_train_val(data, fraction, seed=2):
    """Split ``data`` into train/validation parts by whole matches.

    The unique ``matchId`` values are shuffled with a seeded generator and the
    first ``fraction`` of them form the training part, so all rows of a match
    end up on the same side of the split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``matchId`` column.
    fraction : float
        Share of matches (not rows) assigned to training; must be in [0, 1].
    seed : int, optional
        Seed for the shuffle. The default of 2 reproduces the original split.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(data_train, data_val)`` row subsets of ``data``.

    Raises
    ------
    ValueError
        If ``fraction`` lies outside [0, 1].
    """
    if not 0 <= fraction <= 1:
        raise ValueError("fraction must be between 0 and 1, got {!r}".format(fraction))

    matchIds = np.asarray(data['matchId'].unique())
    train_size = int(len(matchIds) * fraction)

    random_idx = np.random.RandomState(seed=seed).permutation(len(matchIds))
    train_matchIds = matchIds[random_idx[:train_size]]
    val_matchIds = matchIds[random_idx[train_size:]]

    data_train = data.loc[data['matchId'].isin(train_matchIds)]
    data_val = data.loc[data['matchId'].isin(val_matchIds)]
    return data_train, data_val
  
# 91% of the matches go to training, the remainder to validation.
X_train, X_val = split_train_val(train, 0.91)
# The full frame is no longer needed once both parts exist.
del train
gc.collect()
# Separate the target, then drop it and the match identifier from the inputs.
y_train = X_train['winPlacePerc']
X_train = X_train.drop(columns=['matchId', 'winPlacePerc'])
y_val = X_val['winPlacePerc']
X_val = X_val.drop(columns=['matchId', 'winPlacePerc'])

In [64]:
# Convert the validation inputs and target to plain NumPy arrays.
X_val = np.array(X_val)
y_val = np.array(y_val)

# The training rows are cut 90/10 without shuffling, each piece is converted
# to NumPy separately, and the pieces are concatenated back in their original
# order.
# NOTE(review): the end result should equal converting X_train / y_train
# directly; presumably done in pieces to limit peak memory -- confirm.
X_train, X_train2, y_train, y_train2 = train_test_split(X_train, y_train, test_size=0.1, shuffle=False)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_train2 = np.array(X_train2)
y_train2 = np.array(y_train2)
y_train = np.concatenate((y_train, y_train2), axis=0)
del y_train2
gc.collect()
X_train = np.concatenate((X_train, X_train2), axis=0)
del X_train2
gc.collect()

0

In [65]:
# Wrap the arrays in LightGBM datasets, dropping each pair of array names as
# soon as its dataset has been created.
train_set = lgb.Dataset(X_train, label=y_train)
del X_train,y_train
gc.collect()
valid_set = lgb.Dataset(X_val, label=y_val)
del X_val, y_val
gc.collect()

0

In [66]:
# LightGBM regression settings; the evaluation metric is mean absolute error.
# NOTE(review): LightGBM applies bagging_fraction only when bagging_freq is
# non-zero, and bagging_freq is not set here -- confirm whether bagging was
# intended.
params = {
        "objective" : "regression", 
        "metric" : "mae", 
        "num_leaves" : 149, 
        "learning_rate" : 0.03, 
        "bagging_fraction" : 0.9,
        "bagging_seed" : 0, 
        "num_threads" : 4,
        "colsample_bytree" : 0.5,
        'min_data_in_leaf':1900, 
        'min_split_gain':0.00011,
        'lambda_l2':9
}

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` and `verbose_eval` keyword arguments of lgb.train
# were removed in LightGBM 4.0.
model = lgb.train(  params, 
                    train_set = train_set,
                    num_boost_round=9000,
                    valid_sets=[train_set,valid_set],
                    callbacks=[
                        lgb.early_stopping(stopping_rounds=200),
                        lgb.log_evaluation(period=100),
                    ]
                  )
  
del train_set,valid_set
gc.collect()

Training until validation scores don't improve for 200 rounds.
[100]	training's l1: 0.0428769	valid_1's l1: 0.0427829
[200]	training's l1: 0.0348968	valid_1's l1: 0.034997
[300]	training's l1: 0.0332657	valid_1's l1: 0.0335061
[400]	training's l1: 0.0321958	valid_1's l1: 0.0325481
[500]	training's l1: 0.0314616	valid_1's l1: 0.0319104
[600]	training's l1: 0.0308841	valid_1's l1: 0.0314321
[700]	training's l1: 0.0304251	valid_1's l1: 0.0310735
[800]	training's l1: 0.030031	valid_1's l1: 0.0307827
[900]	training's l1: 0.029702	valid_1's l1: 0.0305571
[1000]	training's l1: 0.0294084	valid_1's l1: 0.0303672
[1100]	training's l1: 0.0291466	valid_1's l1: 0.0301971
[1200]	training's l1: 0.0289103	valid_1's l1: 0.0300548
[1300]	training's l1: 0.028695	valid_1's l1: 0.0299334
[1400]	training's l1: 0.0284932	valid_1's l1: 0.029827
[1500]	training's l1: 0.0283092	valid_1's l1: 0.0297336
[1600]	training's l1: 0.0281407	valid_1's l1: 0.0296583
[1700]	training's l1: 0.0279822	valid_1's l1: 0.0295901

156

In [67]:
# Pair each importance with its feature name and sort the pairs together
# (ascending by importance, ties broken by name).
featureImp = list(model.feature_importance())
featureImp, features_label = zip(*sorted(zip(featureImp, final_features)))
with open("FeatureImportance.txt", "w") as text_file:
    # Write the name that was sorted alongside each importance. Indexing the
    # unsorted `final_features` here would attach the values to the wrong
    # features.
    for name, importance in zip(features_label, featureImp):
        print(f"{name} =  {importance}", file=text_file)
del featureImp,final_features
gc.collect()

47

In [68]:
# Model inputs for the test rows: every remaining column except matchId.
X_test = test.drop('matchId', axis=1)
del test
gc.collect()

7

In [70]:
# The column count here must match the number of features the model was trained on.
X_test.shape

(1934174, 128)

In [71]:
# Predict with the iteration recorded as best during training.
X_test = np.array(X_test)
y_pred=model.predict(X_test, num_iteration=model.best_iteration)
# One prediction per test row.
len(y_pred)

1934174

In [72]:
# The test matrix is no longer needed once predictions exist.
del X_test
gc.collect()

435

In [73]:
# Insert ID and Predictions into dataframe
df_sub = pd.DataFrame()
df_test = pd.read_csv('test_V2.csv')
df_test = reduce_mem_usage(df_test)
df_sub['Id'] = df_test['Id']
df_sub['winPlacePerc'] = y_pred
print(df_sub['winPlacePerc'].describe())

Memory usage of dataframe is 413.18 MB
Memory usage after optimization is: 121.74 MB
Decreased by 70.5%
count    1.934174e+06
mean     4.731685e-01
std      3.041001e-01
min     -1.077412e-01
25%      2.024466e-01
50%      4.593228e-01
75%      7.431606e-01
max      1.126404e+00
Name: winPlacePerc, dtype: float64


In [0]:
# Replace each raw prediction by the group's rank within its match, rescaled
# to [0, 1].
df_sub = df_sub.merge(df_test[["Id", "matchId", "groupId", "maxPlace", "numGroups"]], on="Id", how="left")
# One row per group, taking the first row's values for that group.
df_sub_group = df_sub.groupby(["matchId", "groupId"]).first().reset_index()
df_sub_group["rank"] = df_sub_group.groupby(["matchId"])["winPlacePerc"].rank()
# (The former per-match `max_rank` merge was removed: its result was never read.)
# For a match with a single group this is 0/0 = NaN; such rows need separate
# handling downstream.
df_sub_group["adjusted_perc"] = (df_sub_group["rank"] - 1) / (df_sub_group["numGroups"] - 1)
# Broadcast the group-level value back to every player row of the group.
df_sub = df_sub.merge(df_sub_group[["adjusted_perc", "matchId", "groupId"]], on=["matchId", "groupId"], how="left")
df_sub["winPlacePerc"] = df_sub["adjusted_perc"]

In [0]:
# Fixed values for the degenerate maxPlace cases.
df_sub.loc[df_sub.maxPlace == 0, "winPlacePerc"] = 0
df_sub.loc[df_sub.maxPlace == 1, "winPlacePerc"] = 1

# For every other row, snap the value to the nearest multiple of
# 1 / (maxPlace - 1).
multi_place = df_sub.maxPlace > 1
step = 1.0 / (df_sub.loc[multi_place, "maxPlace"].values - 1)
current = df_sub.loc[multi_place, "winPlacePerc"].values
df_sub.loc[multi_place, "winPlacePerc"] = np.around(current / step) * step

# Edge case: a match with several placement slots but only one group.
df_sub.loc[multi_place & (df_sub.numGroups == 1), "winPlacePerc"] = 0
assert df_sub["winPlacePerc"].isnull().sum() == 0

In [80]:
# The row count should equal the number of test rows.
df_sub.shape

(1934174, 7)

In [0]:
# Only the Id and the final prediction are written to the submission file.
df_sub.to_csv("submission.csv", columns=["Id", "winPlacePerc"], index=False)

제출 결과 : 0.02337