# Data Load

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error

In [2]:

# Directory that holds the competition CSV files. Keeping it in one constant
# means the notebook can be pointed at another machine's data folder by
# editing a single line instead of every read_csv call.
DATA_DIR = '/Users/krc/TIL/Mini_PJT/data'

# Load the training set, the test set and the sample submission file.
train = pd.read_csv(f'{DATA_DIR}/train_V2.csv')
test = pd.read_csv(f'{DATA_DIR}/test_V2.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission_V2.csv')

# Data Preprocessing

In [3]:
# Remove the columns judged unnecessary for modelling.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' skips labels that are already gone, so re-running this cell
# no longer raises KeyError on the previously dropped columns.
train = train.drop(columns=['Id', 'numGroups', 'killPlace', 'matchId'], errors='ignore')
train

Unnamed: 0,groupId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPoints,kills,killStreaks,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,4d4b580de459be,0,0,0.00,0,0,0,1241,0,0,...,0,0.0000,0,0.000,0,0,244.80,1,1466,0.4444
1,684d5656442f9e,0,0,91.47,0,0,0,0,0,0,...,0,0.0045,0,11.040,0,0,1434.00,5,0,0.6400
2,6a4a42c3245a74,1,0,68.00,0,0,0,0,0,0,...,0,0.0000,0,0.000,0,0,161.80,2,0,0.7755
3,a930a9c79cd721,0,0,32.90,0,0,0,0,0,0,...,0,0.0000,0,0.000,0,0,202.70,3,0,0.1667
4,de04010b3458dd,0,0,100.00,0,0,0,0,1,1,...,0,0.0000,0,0.000,0,0,49.75,2,0,0.1875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4446961,d238e426f50de7,0,0,0.00,0,0,0,1029,0,0,...,0,1292.0000,0,0.000,0,0,1019.00,3,1507,0.1786
4446962,408cdb5c46b2ac,0,1,44.15,0,0,0,0,0,0,...,0,0.0000,0,0.000,0,0,81.70,6,0,0.2935
4446963,e26ac84bdf7cef,0,0,59.06,0,0,0,0,0,0,...,0,0.0000,0,2.184,0,0,788.70,4,0,0.4815
4446964,c2223f35411394,0,4,180.40,1,1,2,0,2,1,...,2,0.0000,0,0.000,0,0,2748.00,8,0,0.8000


In [4]:
# Per the original note there is only a single missing value, so the affected
# row is simply dropped rather than imputed.
train.dropna(axis='index', how='any', inplace=True)

# Per-column count of remaining NaNs (displayed as the cell output).
train.isna().sum()

groupId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
dtype: int64

# Feature Engineering


In [5]:
# Replace groupId and matchType with their integer category codes.
for col in ('groupId', 'matchType'):
    train[col] = train[col].astype('category').cat.codes


In [6]:

from sklearn.impute import SimpleImputer

# Mean-imputer for NaN entries. Both historical names (`fill_NaN`, `imputer`)
# refer to the same object so code using either name keeps working.
fill_NaN = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = fill_NaN

# Wrapping the transformed values in a new DataFrame gives default integer
# labels unless the column names and row index are passed explicitly. The
# previous code assigned train.columns / train.index back onto `train` itself
# (a no-op), which left `df_mu` unlabelled.
df_mu = pd.DataFrame(
    fill_NaN.fit_transform(train),
    columns=train.columns,
    index=train.index,
)

# `train` is deliberately left unchanged here; `df_mu` is the imputed copy.
# NOTE(review): rows with NaNs were already dropped earlier, so this imputation
# is presumably a safeguard only — confirm whether it is still needed.
train.columns

Index(['groupId', 'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills',
       'heals', 'killPoints', 'kills', 'killStreaks', 'longestKill',
       'matchDuration', 'matchType', 'maxPlace', 'rankPoints', 'revives',
       'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
       'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
       'winPlacePerc'],
      dtype='object')

In [7]:
from sklearn.model_selection import train_test_split

# Separate the target from the features so the label cannot leak into X, then
# hold out 20% of the rows for validation with a fixed seed.
# NOTE(review): groupId remains a feature and rows are split at random, so rows
# sharing a groupId can end up on both sides of the split — confirm intended.
target_col = 'winPlacePerc'
X = train.drop(columns=[target_col])
y = train[target_col]
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=1511)

In [8]:

# Memory housekeeping: force a garbage-collection pass to release objects that
# are no longer referenced. The returned number is shown as the cell output.
import gc
gc.collect()

0

In [9]:

# 학습을 위한 라이브러리 세팅
from sklearn.linear_model import LinearRegression   # 1. Linear Regression
from sklearn.linear_model import Lasso              # 2. Lasso
from sklearn.linear_model import Ridge              # 3. Ridge
from xgboost.sklearn import XGBRegressor            # 4. XGBoost
from lightgbm.sklearn import LGBMRegressor          # 5. LightGBM

from sklearn.metrics import mean_absolute_error

In [10]:
## training
# Instantiate the five regressors with their default hyper-parameters.
# The numbered names (reg, reg2, ...) are kept so they stay available afterwards.
reg, reg2, reg3, reg4, reg5 = (
    LinearRegression(),
    Lasso(),
    Ridge(),
    XGBRegressor(),
    LGBMRegressor(),
)
regressors = (reg, reg2, reg3, reg4, reg5)

# Fit every model on the training split.
for estimator in regressors:
    estimator.fit(train_X, train_y)

# Predict on the training split first, then on the validation split.
pred_train, pred_train2, pred_train3, pred_train4, pred_train5 = [
    estimator.predict(train_X) for estimator in regressors
]
pred_val, pred_val2, pred_val3, pred_val4, pred_val5 = [
    estimator.predict(val_X) for estimator in regressors
]

# Mean absolute error of each model on the training split ...
mae_train, mae_train2, mae_train3, mae_train4, mae_train5 = [
    mean_absolute_error(train_y, predicted)
    for predicted in (pred_train, pred_train2, pred_train3, pred_train4, pred_train5)
]
# ... and on the validation split.
mae_val, mae_val2, mae_val3, mae_val4, mae_val5 = [
    mean_absolute_error(val_y, predicted)
    for predicted in (pred_val, pred_val2, pred_val3, pred_val4, pred_val5)
]