In [29]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import os

from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import warnings; warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold

In [30]:
fifa_train=pd.read_csv("../FIFA_train.csv")
fifa_test=pd.read_csv("../FIFA_test.csv")

In [31]:
fifa_train['contract_until']=fifa_train['contract_until'].str.slice(-4,).astype(float)
fifa_test['contract_until']=fifa_test['contract_until'].str.slice(-4,).astype(float)

fifa_train['prefer_foot']=fifa_train['prefer_foot'].map({'left':0,'right':1})
fifa_test['prefer_foot']=fifa_test['prefer_foot'].map({'left':0,'right':1})

fifa_train['position']=fifa_train['position'].map({'ST':0,'GK':1,'DF':2,'MF':4})
fifa_test['position']=fifa_test['position'].map({'ST':0,'GK':1,'DF':2,'MF':4})

fifa_train['continent']=fifa_train['continent'].map({'south america':0,
                                                     'europe':1,
                                                     'africa':2,
                                                     'asia':3,
                                                     'oceania':4})
fifa_test['continent']=fifa_test['continent'].map({'south america':0,
                                                     'europe':1,
                                                     'africa':2,
                                                     'asia':3,
                                                     'oceania':4})

In [32]:
fifa_train['total_rep_skill']=fifa_train['reputation']+fifa_train['stat_skill_moves']
fifa_test['total_rep_skill']=fifa_test['reputation']+fifa_test['stat_skill_moves']

fifa_train['diff_rep_skill']=fifa_train['reputation']-fifa_train['stat_skill_moves']
fifa_test['diff_rep_skill']=fifa_test['reputation']-fifa_test['stat_skill_moves']

fifa_train['total_stat']=fifa_train['stat_potential']+fifa_train['stat_overall']
fifa_test['total_stat']=fifa_test['stat_potential']+fifa_test['stat_overall']

fifa_train['diff_stat']=fifa_train['stat_potential']-fifa_train['stat_overall']
fifa_test['diff_stat']=fifa_test['stat_potential']-fifa_test['stat_overall']

fifa_train['total_stat']=np.log(fifa_train['total_stat'])
fifa_test['total_stat']=np.log(fifa_test['total_stat'])

In [33]:
fifa_train['continent_ratio']=[0]*len(fifa_train)

for i,n in zip((fifa_train['continent'].value_counts()/len(fifa_train)).index,(fifa_train['continent'].value_counts()/len(fifa_train)).values):
    fifa_train.loc[fifa_train['continent']==i,'continent_ratio']=n
    
    
fifa_train['position_ratio']=[0]*len(fifa_train)

for i,n in zip((fifa_train['position'].value_counts()/len(fifa_train)).index,(fifa_train['position'].value_counts()/len(fifa_train)).values):
    fifa_train.loc[fifa_train['position']==i,'position_ratio']=n
    

fifa_train['foot_ratio']=[0]*len(fifa_train)

for i,n in zip((fifa_train['prefer_foot'].value_counts()/len(fifa_train)).index,(fifa_train['prefer_foot'].value_counts()/len(fifa_train)).values):
    fifa_train.loc[fifa_train['prefer_foot']==i,'foot_ratio']=n

In [34]:
fifa_test['continent_ratio']=[0]*len(fifa_test)

for i,n in  zip((fifa_train['continent'].value_counts()/len(fifa_train)).index,(fifa_train['continent'].value_counts()/len(fifa_train)).values):
    fifa_test.loc[fifa_test['continent']==i,'continent_ratio']=n
    
    
fifa_test['position_ratio']=[0]*len(fifa_test)

for i,n in zip((fifa_train['position'].value_counts()/len(fifa_train)).index,(fifa_train['position'].value_counts()/len(fifa_train)).values):
    fifa_test.loc[fifa_test['position']==i,'position_ratio']=n
    

fifa_test['foot_ratio']=[0]*len(fifa_test)

for i,n in zip((fifa_train['prefer_foot'].value_counts()/len(fifa_train)).index,(fifa_train['prefer_foot'].value_counts()/len(fifa_train)).values):
    fifa_test.loc[fifa_test['prefer_foot']==i,'foot_ratio']=n

In [38]:
new_train=fifa_train.copy()
new_test=fifa_test.copy()

In [36]:
target=np.log1p(new_train['value'])
new_train.drop(['id','name','value'],axis=1,inplace=True)
new_test.drop(['id','name'],axis=1,inplace=True)


lgbm = LGBMRegressor(boosting_type="dart",random_state = 0)
lgbm.fit(new_train,target)

prediction=np.expm1(lgbm.predict(new_test))

sub_example=pd.read_csv("../submission.csv")
sub_example['value']=prediction
sub_example.to_csv("../submission.csv",index=False)

In [39]:
target=np.log1p(new_train['value'])
new_train.drop(['id','name','value'],axis=1,inplace=True)
new_test.drop(['id','name'],axis=1,inplace=True)

In [40]:

import lightgbm as lgb

In [41]:
def get_oof_lgbm(params, train_data, test_data, target_data, num_round, early_round, verbose_round, N_SPLITS=5, random_state=0):

    FOLDs=KFold(n_splits=N_SPLITS, shuffle=True,random_state=0)

    oof = np.zeros(len(train_data))
    predictions = np.zeros(len(test_data))

    features_lgb = list(train_data.columns)
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(FOLDs.split(train_data)):
        trn_data = lgb.Dataset(train_data.iloc[trn_idx], label=target_data.iloc[trn_idx])
        val_data = lgb.Dataset(train_data.iloc[val_idx], label=target_data.iloc[val_idx])

        print("LGB " + str(fold_) + "-" * 50)
        num_round = num_round
        clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=verbose_round, 
                        early_stopping_rounds = early_round)
        oof[val_idx] = clf.predict(train_data.iloc[val_idx], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features_lgb
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        predictions += clf.predict(test_data, num_iteration=clf.best_iteration) / FOLDs.n_splits
    return oof, predictions, feature_importance_df

In [47]:
import xgboost as xgb
def get_oof_xgb(params, train_data, test_data, target_data, num_round, early_round, verbose_round, N_SPLITS=5, random_state=0):

    FOLDs=KFold(n_splits=N_SPLITS, shuffle=True,random_state=0)

    oof = np.zeros(len(train_data))
    predictions = np.zeros(len(test_data))

    feature_importance_df = pd.DataFrame()

    best_iters = []
    for fold_, (trn_idx, val_idx) in enumerate(FOLDs.split(train_data)):
        trn_data = xgb.DMatrix(train_data.iloc[trn_idx], label=target_data.iloc[trn_idx])
        val_data = xgb.DMatrix(train_data.iloc[val_idx], label=target_data.iloc[val_idx])

#         print("xgb " + str(fold_) + "-" * 50)

        watchlist = [(trn_data, 'train'), (val_data, 'valid')]
#         print("xgb " + str(fold_) + "-" * 50)
        num_round = num_round
        xgb_model = xgb.train(params, trn_data, num_round, watchlist, 
                              early_stopping_rounds=early_round, verbose_eval=verbose_round)
        oof[val_idx] = xgb_model.predict(xgb.DMatrix(train_data.iloc[val_idx]), 
                                             ntree_limit=xgb_model.best_ntree_limit)

        predictions += xgb_model.predict(xgb.DMatrix(test_data), 
                                             ntree_limit=xgb_model.best_ntree_limit) / FOLDs.n_splits
        
        fold_importance_df = pd.DataFrame([xgb_model.get_score()]).T.reset_index()
        fold_importance_df.columns = ['feature', 'importance']
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        best_iters.append(xgb_model.best_ntree_limit)
    return oof, predictions, feature_importance_df, np.mean(best_iters)

In [48]:
xgb_params={"objective":"reg:linear",
           "metric":"rmse",
           "max_depth":6,
           "min_child_samples":2,
           "alpha":0.08,
           "gamma":0.06,
           "eta":0.04,
           "subsample":0.08,
           "colsample_bytree":0.97,
           "silent":True}
a,pre_xgb,feature_impo_xgb,d=get_oof_xgb(xgb_params, new_train, new_test, target, num_round=100000, early_round=400, verbose_round=500, N_SPLITS=5, random_state=0)

Parameters: { metric, min_child_samples, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:12.83666	valid-rmse:12.84015
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 400 rounds.
[500]	train-rmse:0.07003	valid-rmse:0.07695
[1000]	train-rmse:0.06572	valid-rmse:0.07374
[1500]	train-rmse:0.06369	valid-rmse:0.07227
[2000]	train-rmse:0.06229	valid-rmse:0.07161
[2500]	train-rmse:0.06136	valid-rmse:0.07132
[3000]	train-rmse:0.06051	valid-rmse:0.07057
[3500]	train-rmse:0.05998	valid-rmse:0.07046
Stopping. Best iteration:
[3289]	train-rmse:0.06009	valid-rmse:0.07035

Parameters: { metric, min_child_samples, silent } might not be used.

  This may not be accurate due to some parame

In [49]:
prediction=np.expm1(pre_xgb)

sub_example=pd.read_csv("../submission.csv")
sub_example['value']=prediction
sub_example.to_csv("../submission.csv",index=False)