In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from ngboost import NGBRegressor


In [4]:
train=pd.read_csv('FIFA_train.csv')
test =pd.read_csv('FIFA_test.csv')
sub=pd.read_csv('submission.csv')

In [5]:
train.head()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value
0,0,L. Messi,31,south america,2021,ST,left,5.0,94,94,4.0,110500000.0
1,3,De Gea,27,europe,2020,GK,right,4.0,91,93,1.0,72000000.0
2,7,L. Suárez,31,south america,2021,ST,right,5.0,91,91,3.0,80000000.0
3,8,Sergio Ramos,32,europe,2020,DF,right,4.0,91,91,3.0,51000000.0
4,9,J. Oblak,25,europe,2021,GK,right,3.0,90,93,1.0,68000000.0


In [6]:
test.head()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves
0,1,Cristiano Ronaldo,33,europe,2022,ST,right,5.0,94,94,5.0
1,2,Neymar Jr,26,south america,2022,ST,right,5.0,92,93,5.0
2,4,K. De Bruyne,27,europe,2023,MF,right,4.0,91,92,4.0
3,5,E. Hazard,27,europe,2020,ST,right,4.0,91,91,4.0
4,6,L. Modrić,32,europe,2020,MF,right,4.0,91,91,4.0


In [13]:
def con_period(x):
    if x =='Dec 31, 2018':
        return '2019'
    elif x=='Jun 30, 2020':
        return '2020.5'
    elif x=='Jun 30, 2019':
        return '2019.5'
    elif x=='May 31, 2020':
        return '2020.3333'
    elif x=='May 31, 2019':
        return '2019.3333'
    elif x=='Jan 31, 2019':
        return '2019.0833'
    elif x=='Jan 1, 2019':
        return '2019'
    elif x=='Jan 12, 2019':
        return '2019.034'
    elif x=='Dec 31, 2020':
        return '2020'
    elif x=='Jun 1, 2019':
        return '2019.416'
    else:
        return x

In [14]:
train.contract_until = train.contract_until.apply(con_period).astype('float64') - 2018
test.contract_until = train.contract_until.apply(con_period).astype('float64') - 2018


In [15]:
train[['age', 'stat_potential']] = np.log1p(train[['age','stat_potential']])
test[['age', 'stat_potential']] = np.log1p(test[['age','stat_potential']])

In [16]:
X= train[['age', 'continent', 'contract_until', 'position', 'reputation', 'stat_overall', 'stat_potential', 'stat_skill_moves']]
y= np.log1p(train['value'])

In [17]:
#train set
X = pd.get_dummies(columns =['continent', 'position'], data=X)

In [19]:
#test set
target= test[['age', 'continent', 'contract_until', 'position', 'reputation', 'stat_overall', 'stat_potential', 'stat_skill_moves']]
target= pd.get_dummies(columns =['continent', 'position'],data=target)

In [20]:
#10-Fold Crssvalidation
kf= KFold(n_splits =10, random_state= 521, shuffle=True)

In [23]:
#ngbregressor
ngb= NGBRegressor(random_state=521, verbose =500, n_estimators =500)



In [25]:
ngb_pred = np.zeros((target.shape[0]))

In [29]:
rmse_list=[]
for tr_idx, val_idx in kf.split(X,y):
    tr_x, tr_y =X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    ngb.fit(tr_x, tr_y)
    pred = np.expm1([0 if x < 0 else x for x in ngb.predict(val_x)])

    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)

    sub_pred = np.expm1([0 if x < 0 else x for x in ngb.predict(target)]) /10
    ngb_pred += sub_pred
print(f'{ngb.__class__.__name__}의 10fold 평균RMSE는 {np.mean(rmse_list)}')



[iter 0] loss=-1.1775 val_loss=0.0000 scale=2.0000 norm=0.9135
[iter 100] loss=-1.2371 val_loss=0.0000 scale=1.0000 norm=0.4654
[iter 200] loss=-1.2772 val_loss=0.0000 scale=1.0000 norm=0.4726
[iter 300] loss=-1.3095 val_loss=0.0000 scale=2.0000 norm=0.9571
[iter 400] loss=-1.3356 val_loss=0.0000 scale=1.0000 norm=0.4834
[iter 0] loss=-1.3507 val_loss=0.0000 scale=1.0000 norm=0.4882
[iter 100] loss=-1.3725 val_loss=0.0000 scale=1.0000 norm=0.4864
[iter 200] loss=-1.3943 val_loss=0.0000 scale=2.0000 norm=0.9715
[iter 300] loss=-1.4158 val_loss=0.0000 scale=1.0000 norm=0.4858
[iter 400] loss=-1.4319 val_loss=0.0000 scale=1.0000 norm=0.4878
[iter 0] loss=-1.4051 val_loss=0.0000 scale=1.0000 norm=0.5126
[iter 100] loss=-1.4550 val_loss=0.0000 scale=1.0000 norm=0.4885
[iter 200] loss=-1.4714 val_loss=0.0000 scale=1.0000 norm=0.4905
[iter 300] loss=-1.4823 val_loss=0.0000 scale=1.0000 norm=0.4919
[iter 400] loss=-1.4918 val_loss=0.0000 scale=1.0000 norm=0.4918
[iter 0] loss=-1.4751 val_loss=

In [31]:
#randomforestregressor
rf =RandomForestRegressor(random_state= 521, n_estimators =150)


In [33]:
rf_pred = np.zeros((target.shape[0]))
for tr_idx, val_idx in kf.split(X,y):
    tr_x, tr_y =X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    rf.fit(tr_x, tr_y)
    pred = np.expm1([0 if x < 0 else x for x in rf.predict(val_x)])

    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)

    sub_pred = np.expm1([0 if x < 0 else x for x in rf.predict(target)]) /10
    rf_pred += sub_pred
print(f'{rf.__class__.__name__}의 10fold 평균RMSE는 {np.mean(rmse_list)}')


RandomForestRegressor의 10fold 평균RMSE는 576490.9265314413


In [35]:
#ExtraTreesRegressor
etc =ExtraTreesRegressor(random_state= 521, n_estimators =500)


In [36]:
etc_pred = np.zeros((target.shape[0]))
rmse_list = []
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    etc.fit(tr_x, tr_y)
    pred = np.expm1([0 if x < 0 else x for x in etc.predict(val_x)])

    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    
    sub_pred = np.expm1([0 if x < 0 else x for x in etc.predict(target)]) / 10
    etc_pred += sub_pred
print(f'{etc.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')

ExtraTreesRegressor의 10fold 평균 RMSE는 748777.7449780542


In [42]:
gb = GradientBoostingRegressor(random_state = 521, max_depth = 5)

In [44]:
gb_pred= np.zeros((target.shape[0]))

rmse_list=[]
for tr_idx, val_idx in kf.split(X,y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    gb.fit(tr_x, tr_y)
    pred =np.expm1([0 if x < 0 else x for x in gb.predict(val_x)])

    rmse=np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    
    sub_pred = np.expm1([0 if x < 0 else x for x in gb.predict(target)]) / 10
    gb_pred += sub_pred
print(f'{gb.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')

GradientBoostingRegressor의 10fold 평균 RMSE는 590051.6390746763


In [51]:
#LgbmRegressor
lgbm = LGBMRegressor(random_state = 521, max_depth = 4, n_estimators = 1000)

In [52]:
lgbm_pred= np.zeros((target.shape[0]))

rmse_list = []
for tr_idx, val_idx in kf.split(X,y):
    tr_x,tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    lgbm.fit(tr_x, tr_y)
    pred = np.expm1([0 if x< 0 else x for x in lgbm.predict(val_x)])

    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)

    sub_pred = np.expm1([0 if x < 0 else x for x in lgbm.predict(target)]) / 10
    lgbm_pred += sub_pred
print(f'{lgbm.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')

LGBMRegressor의 10fold 평균 RMSE는 600903.6346210752


In [53]:
#XgbRegressor
xgb = XGBRegressor(random_state = 521, max_depth = 5)

In [54]:
xgb_pred = np.zeros((target.shape[0]))
rmse_list = []
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    xgb.fit(tr_x, tr_y)
    pred = np.expm1([0 if x < 0 else x for x in xgb.predict(val_x)])

    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    
    sub_pred = np.expm1([0 if x < 0 else x for x in xgb.predict(target)]) / 10
    xgb_pred += sub_pred
print(f'{xgb.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')

XGBRegressor의 10fold 평균 RMSE는 581307.2716821915


In [55]:
sub['value']=(gb_pred + rf_pred + etc_pred + lgbm_pred + ngb_pred + xgb_pred) / 6

In [56]:
q1 = sub['value'].quantile(0.004)
q2 = sub['value'].quantile(0.99)

sub['value'] = sub['value'].apply(lambda x: x if x > q1 else x * 0.9)
sub['value'] = sub['value'].apply(lambda x: x if x < q2 else x * 1.1)

In [57]:
sub

Unnamed: 0,id,value
0,1,6.348730e+07
1,2,9.987667e+07
2,4,8.420342e+07
3,5,8.797257e+07
4,6,6.808628e+07
...,...,...
3823,16924,5.859953e+04
3824,16929,5.021124e+04
3825,16932,6.061620e+04
3826,16937,4.907546e+04
