In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from tqdm import tqdm
import warnings

# Suppress every warning for the rest of the session (this includes pandas and library deprecation warnings).
warnings.filterwarnings('ignore')
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# List the contents of the current working directory.
!ls

In [None]:
!rm -rf -r *.xlsx
!ls

# Data Import

In [None]:
# Read the Pokedex CSV, then preview both ends of the table and its dimensions.
df = pd.read_csv(
    './Complete Pokedex V1.1.csv',
    engine='python',
    low_memory=True,
)
for preview in (df.head(), df.tail()):
    display(preview)
print(df.shape)

* 데이터를 살펴본 결과
    - 범주형 변수가 매우 많음
    - 서로 겹치는 값들, 즉 variability가 그다지 높지 않은 변수들이 매우 많음
    - 따라서 logistic regression, LDA 등의 linear model보다는 Tree 계열의 모델이 적절할 것으로 사료됨

In [None]:
# Null count per column, restricted to the columns that contain at least one null.
null_counts = df.isnull().sum()
null_counts[null_counts > 0]

# First task - Type Prediction

In [None]:
# Generations before 8 form the training set; generation 8 is held out as the test set.
# reset_index() returns a new frame, so the later in-place feature engineering works on
# independent objects with a clean 0..n-1 index rather than on filtered slices of `df`.
train = df[df['generation'] < 8].reset_index(drop=True)
test = df[df['generation'] == 8].reset_index(drop=True)

In [None]:
# (rows, columns) of the training and test splits.
print(train.shape, test.shape)

# EDA + Feature Engineering

In [None]:
## Number of Pokemon per generation
gen_count = (
    train[['generation', 'pokemon_name']]
    .groupby(by=['generation'])
    .count()
    .reset_index(drop=False)
)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='generation', y='pokemon_name', data=gen_count, ax=ax)
ax.grid(True)
plt.show()

In [None]:
## Number of Pokemon per primary type, one panel per generation
gen_type = (
    train[['generation', 'type_1', 'pokemon_name']]
    .groupby(by=['generation', 'type_1'])
    .count()
    .reset_index(drop=False)
)

generations = sorted(gen_type['generation'].unique())
n_cols = 4
n_rows = max(1, (len(generations) + n_cols - 1) // n_cols)  # ceiling division

# 3 inches of height per row reproduces the 36x6 canvas for two rows of panels.
plt.figure(figsize=(36, 3 * n_rows))
for position, gen in enumerate(generations, start=1):
    subdf = gen_type[gen_type['generation'] == gen]
    # Explicit (rows, cols, index) arguments: the three-digit shorthand only
    # works while every component is a single-digit integer.
    plt.subplot(n_rows, n_cols, position)
    sns.barplot(x='type_1', y='pokemon_name', data=subdf)
    plt.title(f"Generation {gen}")
    plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
## Number of Pokemon per primary type
type_count = (
    train[['type_1', 'pokemon_name']]
    .groupby(by=['type_1'])
    .count()
    .reset_index(drop=False)
)

fig, ax = plt.subplots(figsize=(11.5, 4))
sns.barplot(x='type_1', y='pokemon_name', data=type_count, ax=ax)
ax.grid(True)
plt.show()

* type별 분포가 상당히 불균형 - 만약에 어떤 새로운 포켓몬이 들어올 경우 물타입으로 분류할 가능성이 높아짐 - sampling 필요할듯?

In [None]:
## Do mean height and weight differ between primary types?
type_compare = (
    train[['type_1', 'height', 'weight']]
    .groupby(by=['type_1'])
    .mean(numeric_only=False)
    .reset_index(drop=False)
)

# Two stacked panels: mean height on top, mean weight below.
fig, axes = plt.subplots(2, 1, figsize=(11, 8))
for ax, measure in zip(axes, ('height', 'weight')):
    sns.barplot(x='type_1', y=measure, data=type_compare, ax=ax)
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
## Mean hit points per primary type
type_compare = (
    train[['type_1', 'hit_points']]
    .groupby(by=['type_1'])
    .mean(numeric_only=False)
    .reset_index(drop=False)
)

fig, ax = plt.subplots(figsize=(11.5, 5))
sns.barplot(x='type_1', y='hit_points', data=type_compare, ax=ax)
ax.grid(True)
plt.show()

* Type 관련 결측치 문제
    - type2는 포켓몬 도감에 이미 기분류되어 있음 <- 즉 결측치를 제거하는 것이 의미가 없음; 오히려 데이터 왜곡할 수 있는 가능성
    - 따라서 type과 ability의 경우 결측치가 있는 column은 아예 삭제하는 편이 나음 - 그냥 type의 개수, ability의 개수로 대체

In [None]:
def missing_transformer(df):
    """Replace sparse secondary columns with count / stage features, in place.

    Adds ``num_types``, ``num_abilities``, ``num_egg_groups`` and
    ``evolution_type`` to ``df``, drops the columns they were derived from,
    and displays the first rows of the result. Nothing is returned.

    All features are computed column-wise from null masks, so the result does
    not depend on ``df`` carrying a 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``type_2``, ``ability_2``, ``ability_3``, ``egg_group_2``,
        ``evolves_from`` and ``final_evolution``. Modified in place.
    """
    type2_null = df['type_2'].isnull().to_numpy()
    abil2_null = df['ability_2'].isnull().to_numpy()
    abil3_null = df['ability_3'].isnull().to_numpy()
    egg2_null = df['egg_group_2'].isnull().to_numpy()

    # One primary value always exists; each non-null secondary column adds one.
    df['num_types'] = np.where(type2_null, 1, 2).astype('int')
    df['num_abilities'] = 3 - abil2_null.astype('int') - abil3_null.astype('int')
    df['num_egg_groups'] = np.where(egg2_null, 1, 2).astype('int')

    ## A missing evolves_from is treated as "no previous form" (0 is accepted as the same marker).
    no_previous_form = (df['evolves_from'].isnull() | df['evolves_from'].eq(0)).to_numpy()
    not_final = df['final_evolution'].eq(False).to_numpy()

    ## evolution type - 0: basic / 1: intermediate / 2: final
    # np.select takes the first matching condition, mirroring an if / elif / else chain.
    df['evolution_type'] = np.select(
        [no_previous_form & not_final, not_final],
        [0, 1],
        default=2,
    ).astype('int')

    df.drop(['type_2', 'ability_2', 'ability_3', 'egg_group_2', 'evolves_from', 'final_evolution'],
            axis=1, inplace=True)

    display(df.head())

In [None]:
# Engineer the count / evolution-stage features on both splits (each frame is edited in place).
for frame in (train, test):
    missing_transformer(frame)

In [None]:
# total, mean and std are unnecessary given that the other stat columns are present
aggregate_stats = ['total_stats', 'mean', 'standard_deviation']
for frame in (train, test):
    frame.drop(columns=aggregate_stats, inplace=True)

In [None]:
## height, weight, bmi distribution
# One row per measure: histogram with KDE on the left, horizontal box plot on the right.
plt.figure(figsize=(10, 10))

measures = [('height', 'Height'), ('weight', 'Weight'), ('bmi', 'BMI')]
for row, (column, label) in enumerate(measures):
    plt.subplot(3, 2, 2 * row + 1)
    sns.histplot(train[column], kde=True)
    plt.title(label)
    plt.grid(True)

    plt.subplot(3, 2, 2 * row + 2)
    sns.boxplot(train[column], orient='h')
    plt.title(label)
    plt.grid(True)

plt.tight_layout()
plt.show()

~~포켓몬 도감을 찾아보니 Gmax(거다이맥스) 또는 Eternamax(무한다이맥스)는 몸무게가 ???로 찍혀있어 10000으로 퉁친듯...~~
* measurement error나 data entry error가 아니므로 굳이 제거할 필요가 있을까...?
* 그리고 이렇게 치우친 분포를 가진 이상 더더욱 tree 계열 모델로 가야될 듯

In [None]:
## Categorical variable handling
from sklearn.preprocessing import LabelEncoder

# Stack both splits and remove the two identifier columns.
concat = pd.concat([train, test], axis=0).drop(columns=['pokedex_number', 'pokemon_name'])

# object / bool columns are treated as categorical, everything else as continuous.
cats, cons = [], []
for col in concat.columns.values:
    bucket = cats if concat[col].dtype in [object, bool] else cons
    bucket.append(col)

print(f"Categorical features: {cats}")
print(f"Continuous features: {cons}")

In [None]:
# Object columns are label-encoded to integers; the remaining categorical
# columns (the non-object ones) are cast to uint8.
for col in cats:
    if concat[col].dtype != object:
        concat[col] = concat[col].astype(np.uint8)
        continue
    le = LabelEncoder()
    concat[col] = le.fit_transform(concat[col])

display(concat.head())

In [None]:
# Split the encoded table back into train (generation < 8) and test (generation 8).
# Explicit copies: new columns are assigned to these frames later, and that must not
# happen on a filtered slice of `concat`.
train = concat[concat['generation'] < 8].copy()
test = concat[concat['generation'] == 8].copy()

In [None]:
# The stacked frame is no longer needed once train / test have been re-derived from it.
del concat

## Height - Weight and Stats

In [None]:
## Overall (whole training set)
corr_columns = ["bmi", "height", "weight", "hit_points", "attack", "defense"]
imp_corr = train[corr_columns].corr()

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    data=imp_corr,
    annot=True,
    linewidths=.5,
    cmap='RdYlBu_r',
    vmin=-1,
    vmax=1,
    ax=ax,
)
plt.show()

In [None]:
## by generation
gens = sorted(train['generation'].unique())
corr_columns = ["bmi", "height", "weight", "hit_points", "attack", "defense"]

n_cols = 2
n_rows = max(1, (len(gens) + n_cols - 1) // n_cols)  # ceiling division

# 5 inches of height per row reproduces the 14x20 canvas for four rows of panels.
plt.figure(figsize=(14, 5 * n_rows))
for position, gen in enumerate(gens, start=1):
    subdf = train[train['generation'] == gen]
    subcorr = subdf[corr_columns].corr()
    # Explicit (rows, cols, index) arguments: the three-digit shorthand only
    # works while every component is a single-digit integer.
    plt.subplot(n_rows, n_cols, position)
    sns.heatmap(
        data=subcorr,
        annot=True,
        linewidths=.5,
        cmap='RdYlBu_r',
        vmin=-1, vmax=1,
    )
    plt.title(f"Generation {gen}")

# Same finishing calls as the other multi-panel cells in this notebook.
plt.tight_layout()
plt.show()

In [None]:
## scatterplot - height
# One row of panels per generation, one panel per stat column.
heights = train[["height", "generation", "hit_points", "attack", "defense"]]
stat_names = list(heights.columns.values[2:])

plt.figure(figsize=(13, 30))
i = 1
for gen in sorted(heights['generation'].unique()):
    subdf = heights.loc[heights['generation'] == gen]
    for col in stat_names:
        ax = plt.subplot(8, 3, i)
        sns.scatterplot(data=subdf, x='height', y=col, ax=ax)
        ax.grid(True)
        ax.set_title(f"Generation {gen} - height & {col}")
        i += 1
plt.tight_layout()
plt.show()

In [None]:
## scatterplot - weight
# One row of panels per generation, one panel per stat column.
weights = train[["weight", "generation", "hit_points", "attack", "defense"]]
stat_names = list(weights.columns.values[2:])

plt.figure(figsize=(13, 30))
i = 1
for gen in sorted(weights['generation'].unique()):
    subdf = weights.loc[weights['generation'] == gen]
    for col in stat_names:
        ax = plt.subplot(8, 3, i)
        sns.scatterplot(data=subdf, x='weight', y=col, ax=ax)
        ax.grid(True)
        ax.set_title(f"Generation {gen} - weight & {col}")
        i += 1
plt.tight_layout()
plt.show()

In [None]:
# Free the intermediate correlation / scatter-plot frames.
del imp_corr, heights, weights

~~Gmax는 generation 8에만 존재... <- training을 어떻게 시켜야 하나..~~

In [None]:
## How the mean and distribution of height and weight change across generations
groupby = (
    train[['generation', 'height', 'weight']]
    .groupby(by='generation')
    .mean()
    .sort_values(by='generation')
    .reset_index(drop=False)
)

# Row 1: height, row 2: weight. Left column: mean bar chart, right column: box plot.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
for (bar_ax, box_ax), (column, label) in zip(axes, [('height', 'Height'), ('weight', 'Weight')]):
    sns.barplot(x='generation', y=column, data=groupby, ax=bar_ax)
    bar_ax.grid(True)
    bar_ax.set_title(f"Mean {label} over Generation")

    sns.boxplot(x='generation', y=column, data=train, ax=box_ax)
    box_ax.grid(True)
    box_ax.set_title(f"Boxplot of {label} over Generation")

plt.tight_layout()
plt.show()

## Regression

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
## Scaling for regression
regression_columns = ["generation", "height", "weight", "hit_points", "attack", "defense"]
imp_train = train[regression_columns]
imp_test = test[regression_columns]

# Min-Max scaling: each scaler is fitted on the training column only,
# then applied to both the training and the test column.
imp_train_ = imp_train.drop(columns='generation')
imp_test_ = imp_test.drop(columns='generation')

minmax_height = MinMaxScaler().fit(imp_train_[['height']])
imp_train_['height'] = minmax_height.transform(imp_train_[['height']])
imp_test_['height'] = minmax_height.transform(imp_test_[['height']])

minmax_weight = MinMaxScaler().fit(imp_train_[['weight']])
imp_train_['weight'] = minmax_weight.transform(imp_train_[['weight']])
imp_test_['weight'] = minmax_weight.transform(imp_test_[['weight']])

In [None]:
# Standard scaling: each scaler is fitted on the training column only,
# then applied to both the training and the test column.
imp_train__ = imp_train.drop(columns='generation')
imp_test__ = imp_test.drop(columns='generation')

std_height = StandardScaler().fit(imp_train__[['height']])
imp_train__['height'] = std_height.transform(imp_train__[['height']])
imp_test__['height'] = std_height.transform(imp_test__[['height']])

std_weight = StandardScaler().fit(imp_train__[['weight']])
imp_train__['weight'] = std_weight.transform(imp_train__[['weight']])
imp_test__['weight'] = std_weight.transform(imp_test__[['weight']])

In [None]:
## for problems
import statsmodels.api as sm

In [None]:
def regression(X, y, constant:bool=True, logit:bool=False):
    """Build an unfitted statsmodels regression model.

    Parameters
    ----------
    X : predictors; cast to float before being handed to statsmodels.
    y : response; cast to float before being handed to statsmodels.
    constant : when True, ``sm.add_constant`` is applied to ``X`` first.
    logit : when True an ``sm.Logit`` model is built, otherwise ``sm.OLS``.

    Returns
    -------
    The model object. The caller is responsible for calling ``.fit()`` on it.
    """
    design = sm.add_constant(X) if constant else X
    model_class = sm.Logit if logit else sm.OLS
    return model_class(y.astype(float), design.astype(float))

### Minmax Scaling

In [None]:
# OLS: hit_points on min-max scaled height and weight
X, y = imp_train_[['height', 'weight']], imp_train_['hit_points']
fit = regression(X=X, y=y)
results = fit.fit()
print(results.summary())

In [None]:
# OLS: attack on min-max scaled height and weight
X, y = imp_train_[['height', 'weight']], imp_train_['attack']
fit = regression(X=X, y=y)
results = fit.fit()
print(results.summary())

In [None]:
# OLS: defense on min-max scaled height and weight
X, y = imp_train_[['height', 'weight']], imp_train_['defense']
fit = regression(X=X, y=y)
results = fit.fit()
print(results.summary())

### Standard Scaling

In [None]:
# OLS: hit_points on standardised height and weight
X, y = imp_train__[['height', 'weight']], imp_train__['hit_points']
fit = regression(X=X, y=y)
results = fit.fit()
print(results.summary())

In [None]:
# OLS: attack on standardised height and weight
X, y = imp_train__[['height', 'weight']], imp_train__['attack']
fit = regression(X=X, y=y)
results = fit.fit()
print(results.summary())

In [None]:
# OLS: defense on standardised height and weight
X, y = imp_train__[['height', 'weight']], imp_train__['defense']
fit = regression(X=X, y=y)
results = fit.fit()
print(results.summary())

### Log transformation

In [None]:
# Log transformation: height is multiplied by 100 and weight by 1000 before the natural log is taken
imp_train___ = imp_train.drop(columns='generation')
imp_test___ = imp_test.drop(columns='generation')

for frame in (imp_train___, imp_test___):
    frame['height'] = np.log(frame['height'] * 100)
    frame['weight'] = np.log(frame['weight'] * 1000)

imp_train___.head()

In [None]:
# OLS: hit_points on log-transformed height and weight
X, y = imp_train___[['height', 'weight']], imp_train___['hit_points']
fit = regression(X=X, y=y)
results = fit.fit()
print(results.summary())

In [None]:
# OLS: attack on log-transformed height and weight
X, y = imp_train___[['height', 'weight']], imp_train___['attack']
fit = regression(X=X, y=y)
results = fit.fit()
print(results.summary())

In [None]:
# OLS: defense on log-transformed height and weight
X, y = imp_train___[['height', 'weight']], imp_train___['defense']
fit = regression(X=X, y=y)
results = fit.fit()
print(results.summary())

# Over Generation

In [None]:
# Per generation: regress each stat on log-transformed height and weight and
# report the predictor p-values, R^2 and how many predictors are significant at 0.05.
for gen in sorted(train['generation'].unique()):
    # Work on a copy so the column assignments below never target a filtered view of imp_train.
    subdf = imp_train[imp_train['generation'] == gen].copy()
    subdf['height'] = np.log(subdf['height']*100)
    subdf['weight'] = np.log(subdf['weight']*1000)
    print(f"Generation {gen}")
    # Columns after (generation, height, weight) are the regression targets.
    for target in subdf.columns.values[3:]:
        X = subdf[['height', 'weight']]
        y = subdf[target]
        fit = regression(X=X, y=y)
        # Fit once and read both statistics from the same results object.
        results = fit.fit()
        pvals = results.pvalues[['height', 'weight']]
        rsquared = results.rsquared
        print(f"\t{target}  \t{pvals.index.values}: {np.around(pvals, decimals=5).values}  \tr2: {rsquared}")
        print(f"\tNumber of significant variables: {(pvals <= 0.05).sum()}")

In [None]:
# Free every regression working frame (raw, min-max, standardised and log variants).
del imp_train, imp_train_, imp_train__, imp_train___, imp_test, imp_test_, imp_test__, imp_test___

# Modeling

In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Single seed shared by every model, CV splitter and Optuna sampler in this notebook.
RANDOM_STATE = 24790
# Baseline classifiers with library-default hyperparameters.
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
et = ExtraTreesClassifier(random_state=RANDOM_STATE)
gbr = GradientBoostingClassifier(random_state=RANDOM_STATE)
xgb = XGBClassifier(random_state=RANDOM_STATE)
lgb = LGBMClassifier(random_state=RANDOM_STATE)
# Disabled. NOTE(review): as written, list.remove() returns None, so cat_features would be None.
# cb = CatBoostClassifier(random_state=RANDOM_STATE, cat_features=cats.remove('type_1'))

## Task one: Type classification

In [None]:
# Preview the encoded training frame before building the feature / target split.
display(train.head())

In [None]:
# Task one target: type_1. Every other column is used as a feature.
x_train = train.drop(columns='type_1')
y_train = train['type_1']
x_test = test.drop(columns='type_1')
y_test = test['type_1']

In [None]:
# Baseline 10-fold CV for every default model.
# cross_validate scores both metrics from a single set of fits, instead of
# re-running the whole cross-validation once per metric.
from sklearn.model_selection import cross_validate

models = [dt, rf, et, gbr, xgb, lgb]
for i, model in enumerate(models):
    cv_result = cross_validate(
        model, X=x_train, y=y_train,
        scoring=['accuracy', 'f1_weighted'], cv=10, n_jobs=-1,
    )
    accuracy = cv_result['test_accuracy']
    f1 = cv_result['test_f1_weighted']
    # The first four class names are long enough to need one tab fewer for alignment.
    if i < 4:
        print(f"{model.__class__.__name__}    \tAccuracy: {np.mean(accuracy):.4f}\tf1: {np.mean(f1):.4f}")
    else:
        print(f"{model.__class__.__name__}    \t\tAccuracy: {np.mean(accuracy):.4f}\tf1: {np.mean(f1):.4f}")

In [None]:
## Model tuning
import optuna

# Number of K-fold splits used inside every Optuna objective.
N_SPLITS = 5
# Number of Optuna trials run per model.
N_TRIALS = 30

In [None]:
# Candidate model classes keyed by short name.
# Key order matters: the model-selection cells pick a class by position in this dict,
# using the index of the best entry in the task score lists.
final_models = {
    "et": ExtraTreesClassifier,
    "gbr": GradientBoostingClassifier,
    "xgb": XGBClassifier,
    "lgb": LGBMClassifier,
    "rf": RandomForestClassifier
}

In [None]:
# Best CV score and best hyperparameters per tuned model; each tuning cell appends one entry to both lists.
task1_best_scores = []
task1_best_params = []

In [None]:
## ET
def et_objective(trial):
    """Optuna objective for ExtraTreesClassifier.

    Samples hyperparameters from ``trial`` and returns the mean weighted F1
    over ``N_SPLITS`` shuffled K-fold splits of ``x_train`` / ``y_train``.
    """
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 40),
        'n_estimators': trial.suggest_int('n_estimators', 500, 3500),
        # suggest_float is the non-deprecated replacement for suggest_uniform
        'max_features': trial.suggest_float('max_features', 0.1, 1.0)
    }

    fold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    f1s = np.zeros(shape=fold.get_n_splits())
    # The fold counter has its own name so the Optuna `trial` argument is never shadowed.
    for fold_no, (tr_idx, val_idx) in enumerate(fold.split(x_train, y_train)):
        X_tr, X_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        model = ExtraTreesClassifier(**param, random_state=RANDOM_STATE)
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        f1s[fold_no] = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')

    return np.mean(f1s)

et_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
et_study.optimize(et_objective, n_trials=N_TRIALS)

et_best = et_study.best_trial
et_best_params = et_best.params
print(f'\nscore: {et_best.value:.10f}\nparams: {et_best_params}')

# Record the result for the task-one model selection.
task1_best_scores.append(et_best.value)
task1_best_params.append(et_best_params)

In [None]:
def gbr_objective(trial):
    """Optuna objective for GradientBoostingClassifier.

    Samples hyperparameters from ``trial`` and returns the mean weighted F1
    over ``N_SPLITS`` shuffled K-fold splits of ``x_train`` / ``y_train``.
    """
    param = {
        # suggest_float is the non-deprecated replacement for suggest_uniform
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2500),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0)
    }

    fold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    scores = np.zeros(shape=fold.get_n_splits())
    # The fold counter has its own name so the Optuna `trial` argument is never shadowed.
    for fold_no, (tr_idx, val_idx) in enumerate(fold.split(x_train, y_train)):
        X_tr, X_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        model = GradientBoostingClassifier(**param, verbose=0, random_state=RANDOM_STATE)
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        scores[fold_no] = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')

    return np.mean(scores)

gbr_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
gbr_study.optimize(gbr_objective, n_trials=N_TRIALS)

gbr_best = gbr_study.best_trial
gbr_best_params = gbr_best.params
print(f'\nscore: {gbr_best.value:.10f}\nparams: {gbr_best_params}')

# Record the result for the task-one model selection.
task1_best_scores.append(gbr_best.value)
task1_best_params.append(gbr_best_params)

In [None]:
def xgb_objective(trial):
    """Optuna objective for XGBClassifier.

    Samples hyperparameters from ``trial`` and returns the mean weighted F1
    over ``N_SPLITS`` shuffled K-fold splits of ``x_train`` / ``y_train``.
    """
    param = {
        # suggest_float is the non-deprecated replacement for suggest_uniform
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3500),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0)
    }

    fold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    scores = np.zeros(shape=fold.get_n_splits())
    # The fold counter has its own name so the Optuna `trial` argument is never shadowed.
    for fold_no, (tr_idx, val_idx) in enumerate(fold.split(x_train, y_train)):
        X_tr, X_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        model = XGBClassifier(**param, n_jobs=-1, random_state=RANDOM_STATE, verbosity=0)
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        scores[fold_no] = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')

    return np.mean(scores)

xgb_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
xgb_study.optimize(xgb_objective, n_trials=N_TRIALS)

xgb_best = xgb_study.best_trial
xgb_best_params = xgb_best.params
print(f'\nscore: {xgb_best.value:.10f}\nparams: {xgb_best_params}')

# Record the result for the task-one model selection.
task1_best_scores.append(xgb_best.value)
task1_best_params.append(xgb_best_params)

In [None]:
def lgb_objective(trial):
    """Optuna objective for LGBMClassifier.

    Samples hyperparameters from ``trial`` and returns the mean weighted F1
    over ``N_SPLITS`` shuffled K-fold splits of ``x_train`` / ``y_train``.
    """
    param = {
        'num_leaves': trial.suggest_int('num_leaves', 100, 3500),
        # suggest_float is the non-deprecated replacement for suggest_uniform
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3500),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0)
    }

    fold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    scores = np.zeros(shape=fold.get_n_splits())
    # The fold counter has its own name so the Optuna `trial` argument is never shadowed.
    for fold_no, (tr_idx, val_idx) in enumerate(fold.split(x_train, y_train)):
        X_tr, X_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        model = LGBMClassifier(**param, n_jobs=-1, random_state=RANDOM_STATE, verbose=-1)
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        scores[fold_no] = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')

    return np.mean(scores)

lgb_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
lgb_study.optimize(lgb_objective, n_trials=N_TRIALS)

lgb_best = lgb_study.best_trial
lgb_best_params = lgb_best.params
print(f'\nscore: {lgb_best.value:.10f}\nparams: {lgb_best_params}')

# Record the result for the task-one model selection.
task1_best_scores.append(lgb_best.value)
task1_best_params.append(lgb_best_params)

In [None]:
def rf_objective(trial):
    """Optuna objective for RandomForestClassifier.

    Samples hyperparameters from ``trial`` and returns the mean weighted F1
    over ``N_SPLITS`` shuffled K-fold splits of ``x_train`` / ``y_train``.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 3500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        # suggest_float is the non-deprecated replacement for suggest_uniform
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
    }

    fold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    scores = np.zeros(shape=fold.get_n_splits())
    # The fold counter has its own name so the Optuna `trial` argument is never shadowed.
    for fold_no, (tr_idx, val_idx) in enumerate(fold.split(x_train, y_train)):
        X_tr, X_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        model = RandomForestClassifier(**param, random_state=RANDOM_STATE)
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        scores[fold_no] = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')

    return np.mean(scores)

rf_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
rf_study.optimize(rf_objective, n_trials=N_TRIALS)

rf_best = rf_study.best_trial
rf_best_params = rf_best.params
print(f'\nscore: {rf_best.value:.10f}\nparams: {rf_best_params}')

# Record the result for the task-one model selection.
task1_best_scores.append(rf_best.value)
task1_best_params.append(rf_best_params)

In [None]:
# Select the tuned model with the highest CV score. Position i of the score / param
# lists is looked up against the i-th key of final_models, so the lists are assumed
# to have been filled in that same order.
best_index = np.argmax(task1_best_scores)
best_param = task1_best_params[best_index]
best_model_key = list(final_models)[best_index]
model_class = final_models[best_model_key]
best_model = model_class(random_state=RANDOM_STATE, **best_param)
print(f"Best model: {best_model.__class__.__name__}")

# Refit on the full training set and score on the held-out generation.
best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average="weighted")
print(f"Test accuracy: {test_accuracy:.4f}\tTest F1: {test_f1:.4f}")

In [None]:
# Best CV score of each tuned model, in the order the tuning cells appended them.
task1_best_scores

## Task two: Legendary / Mythical classification

In [None]:
def _lm_label(frame):
    """Combine the `legendary` and `mythical` flags into one integer class label.

    0: neither flag set / 1: mythical only / 2: legendary only / 3: anything else
    (including both flags set). Computed column-wise; the labels are integers
    because they are class ids, not measurements.
    """
    legendary = frame['legendary'].to_numpy()
    mythical = frame['mythical'].to_numpy()
    # np.select takes the first matching condition, mirroring an if / elif / else chain.
    return np.select(
        [
            (legendary == 0) & (mythical == 0),
            (legendary == 0) & (mythical == 1),
            (legendary == 1) & (mythical == 0),
        ],
        [0, 1, 2],
        default=3,
    ).astype(int)

lm_train = _lm_label(train)
lm_test = _lm_label(test)

train['lm'] = lm_train
test['lm'] = lm_test

In [None]:
# Task two target: lm. The two source flags and the target itself are removed from the features.
target_related = ['legendary', 'mythical', 'lm']
x_train = train.drop(columns=target_related)
y_train = train['lm']
x_test = test.drop(columns=target_related)
y_test = test['lm']

In [None]:
# Baseline 10-fold CV for every default model.
# cross_validate scores both metrics from a single set of fits, instead of
# re-running the whole cross-validation once per metric.
from sklearn.model_selection import cross_validate

models = [dt, rf, et, gbr, xgb, lgb]
for i, model in enumerate(models):
    cv_result = cross_validate(
        model, X=x_train, y=y_train,
        scoring=['accuracy', 'f1_weighted'], cv=10, n_jobs=-1,
    )
    accuracy = cv_result['test_accuracy']
    f1 = cv_result['test_f1_weighted']
    # The first four class names are long enough to need one tab fewer for alignment.
    if i < 4:
        print(f"{model.__class__.__name__}    \tAccuracy: {np.mean(accuracy):.4f}\tf1: {np.mean(f1):.4f}")
    else:
        print(f"{model.__class__.__name__}    \t\tAccuracy: {np.mean(accuracy):.4f}\tf1: {np.mean(f1):.4f}")

In [None]:
# Best CV score and best hyperparameters per tuned model; each tuning cell appends one entry to both lists.
task2_best_scores = []
task2_best_params = []

In [None]:
## ET
def et_objective(trial):
    """Optuna objective for ExtraTreesClassifier.

    Samples hyperparameters from ``trial`` and returns the mean weighted F1
    over ``N_SPLITS`` shuffled K-fold splits of ``x_train`` / ``y_train``.
    """
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 40),
        'n_estimators': trial.suggest_int('n_estimators', 500, 3500),
        # suggest_float is the non-deprecated replacement for suggest_uniform
        'max_features': trial.suggest_float('max_features', 0.1, 1.0)
    }

    fold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    f1s = np.zeros(shape=fold.get_n_splits())
    # The fold counter has its own name so the Optuna `trial` argument is never shadowed.
    for fold_no, (tr_idx, val_idx) in enumerate(fold.split(x_train, y_train)):
        X_tr, X_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        model = ExtraTreesClassifier(**param, random_state=RANDOM_STATE)
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        f1s[fold_no] = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')

    return np.mean(f1s)

et_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
et_study.optimize(et_objective, n_trials=N_TRIALS)

et_best = et_study.best_trial
et_best_params = et_best.params
print(f'\nscore: {et_best.value:.10f}\nparams: {et_best_params}')

# Record the result for the task-two model selection.
task2_best_scores.append(et_best.value)
task2_best_params.append(et_best_params)

In [None]:
def gbr_objective(trial):
    """Optuna objective for GradientBoostingClassifier.

    Samples hyperparameters from ``trial`` and returns the mean weighted F1
    over ``N_SPLITS`` shuffled K-fold splits of ``x_train`` / ``y_train``.
    """
    param = {
        # suggest_float is the non-deprecated replacement for suggest_uniform
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2500),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0)
    }

    fold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    scores = np.zeros(shape=fold.get_n_splits())
    # The fold counter has its own name so the Optuna `trial` argument is never shadowed.
    for fold_no, (tr_idx, val_idx) in enumerate(fold.split(x_train, y_train)):
        X_tr, X_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        model = GradientBoostingClassifier(**param, verbose=0, random_state=RANDOM_STATE)
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        scores[fold_no] = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')

    return np.mean(scores)

gbr_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
gbr_study.optimize(gbr_objective, n_trials=N_TRIALS)

gbr_best = gbr_study.best_trial
gbr_best_params = gbr_best.params
print(f'\nscore: {gbr_best.value:.10f}\nparams: {gbr_best_params}')

# Record the result for the task-two model selection.
task2_best_scores.append(gbr_best.value)
task2_best_params.append(gbr_best_params)

In [None]:
def xgb_objective(trial):
    """Optuna objective for XGBClassifier.

    Samples hyperparameters from ``trial`` and returns the mean weighted F1
    over ``N_SPLITS`` shuffled K-fold splits of ``x_train`` / ``y_train``.
    """
    param = {
        # suggest_float is the non-deprecated replacement for suggest_uniform
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3500),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0)
    }

    fold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    scores = np.zeros(shape=fold.get_n_splits())
    # The fold counter has its own name so the Optuna `trial` argument is never shadowed.
    for fold_no, (tr_idx, val_idx) in enumerate(fold.split(x_train, y_train)):
        X_tr, X_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        model = XGBClassifier(**param, n_jobs=-1, random_state=RANDOM_STATE, verbosity=0)
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        scores[fold_no] = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')

    return np.mean(scores)

xgb_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
xgb_study.optimize(xgb_objective, n_trials=N_TRIALS)

xgb_best = xgb_study.best_trial
xgb_best_params = xgb_best.params
print(f'\nscore: {xgb_best.value:.10f}\nparams: {xgb_best_params}')

# Record the result for the task-two model selection.
task2_best_scores.append(xgb_best.value)
task2_best_params.append(xgb_best_params)

In [None]:
def lgb_objective(trial):
    """Optuna objective for LGBMClassifier.

    Samples hyperparameters from ``trial`` and returns the mean weighted F1
    over ``N_SPLITS`` shuffled K-fold splits of ``x_train`` / ``y_train``.
    """
    param = {
        'num_leaves': trial.suggest_int('num_leaves', 100, 3500),
        # suggest_float is the non-deprecated replacement for suggest_uniform
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3500),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0)
    }

    fold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    scores = np.zeros(shape=fold.get_n_splits())
    # The fold counter has its own name so the Optuna `trial` argument is never shadowed.
    for fold_no, (tr_idx, val_idx) in enumerate(fold.split(x_train, y_train)):
        X_tr, X_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        model = LGBMClassifier(**param, n_jobs=-1, random_state=RANDOM_STATE, verbose=-1)
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        scores[fold_no] = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')

    return np.mean(scores)

lgb_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
lgb_study.optimize(lgb_objective, n_trials=N_TRIALS)

lgb_best = lgb_study.best_trial
lgb_best_params = lgb_best.params
print(f'\nscore: {lgb_best.value:.10f}\nparams: {lgb_best_params}')

# Record the result for the task-two model selection.
task2_best_scores.append(lgb_best.value)
task2_best_params.append(lgb_best_params)

In [None]:
def rf_objective(trial):
    """Optuna objective for RandomForestClassifier.

    Samples hyperparameters from ``trial`` and returns the mean weighted F1
    over ``N_SPLITS`` shuffled K-fold splits of ``x_train`` / ``y_train``.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 3500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        # suggest_float is the non-deprecated replacement for suggest_uniform
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
    }

    fold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    scores = np.zeros(shape=fold.get_n_splits())
    # The fold counter has its own name so the Optuna `trial` argument is never shadowed.
    for fold_no, (tr_idx, val_idx) in enumerate(fold.split(x_train, y_train)):
        X_tr, X_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        model = RandomForestClassifier(**param, random_state=RANDOM_STATE)
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        scores[fold_no] = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')

    return np.mean(scores)

rf_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
rf_study.optimize(rf_objective, n_trials=N_TRIALS)

rf_best = rf_study.best_trial
rf_best_params = rf_best.params
print(f'\nscore: {rf_best.value:.10f}\nparams: {rf_best_params}')

# Record the result for the task-two model selection.
task2_best_scores.append(rf_best.value)
task2_best_params.append(rf_best_params)

In [None]:
# Select the tuned model with the highest CV score. Position i of the score / param
# lists is looked up against the i-th key of final_models, so the lists are assumed
# to have been filled in that same order.
best_index = np.argmax(task2_best_scores)
best_param = task2_best_params[best_index]
best_model_key = list(final_models)[best_index]
model_class = final_models[best_model_key]
best_model = model_class(random_state=RANDOM_STATE, **best_param)
print(f"Best model: {best_model.__class__.__name__}")

# Refit on the full training set and score on the held-out generation.
best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average="weighted")
print(f"Test accuracy: {test_accuracy:.4f}\tTest F1: {test_f1:.4f}")