In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import make_scorer
from sklearn.decomposition import PCA

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from pycaret.classification import setup, evaluate_model, compare_models, plot_model, add_metric

import optuna

# Load data

In [2]:
train = pd.read_csv('train.csv', index_col='id')
original = pd.read_csv('original.csv')
test = pd.read_csv('test.csv', index_col='id')

# Combine Train with Original

In [3]:
original.prognosis = original.prognosis.str.replace(' ', '_')
train_final = pd.concat([train, original])

# Split Features And Target

In [4]:
X = train_final.drop('prognosis', axis=1)
y = train_final.prognosis

# Feature Engineering

In [5]:
n_components = 7
pca_column_names = [f"PCA{i+1}" for i in range(n_components)]
pca = PCA(n_components=n_components)
X = pd.DataFrame(pca.fit_transform(X), columns=pca_column_names)

# Target transformation

In [6]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [7]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if not actual:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

# Model Selection

In [8]:
_ = setup(data=train, target='prognosis')

Unnamed: 0,Description,Value
0,Session id,4893
1,Target,prognosis
2,Target type,Multiclass
3,Target mapping,"Chikungunya: 0, Dengue: 1, Japanese_encephalitis: 2, Lyme_disease: 3, Malaria: 4, Plague: 5, Rift_Valley_fever: 6, Tungiasis: 7, West_Nile_fever: 8, Yellow_Fever: 9, Zika: 10"
4,Original data shape,"(707, 65)"
5,Transformed data shape,"(707, 65)"
6,Transformed train set shape,"(494, 65)"
7,Transformed test set shape,"(213, 65)"
8,Numeric features,64
9,Preprocess,True


In [9]:
add_metric(id='mapk', name='MAPK', score_func=mapk)

Name                                                  MAPK
Display Name                                          MAPK
Score Function       <function mapk at 0x0000016B1A62BEE8>
Scorer                                   make_scorer(mapk)
Target                                                pred
Args                                                    {}
Greater is Better                                     True
Multiclass                                            True
Custom                                                True
Name: mapk, dtype: object

In [10]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,MAPK,TT (Sec)
et,Extra Trees Classifier,0.31,0.7361,0.31,0.3157,0.2944,0.2392,0.2418,0.4121,0.309
lda,Linear Discriminant Analysis,0.3079,0.7221,0.3079,0.3035,0.2894,0.2371,0.2399,0.4149,0.249
ridge,Ridge Classifier,0.3017,0.0,0.3017,0.2884,0.278,0.2294,0.2336,0.4035,0.247
xgboost,Extreme Gradient Boosting,0.3017,0.7315,0.3017,0.2961,0.2847,0.2307,0.2334,0.4023,0.48
gbc,Gradient Boosting Classifier,0.2957,0.7194,0.2957,0.2967,0.2787,0.2227,0.225,0.4046,0.518
rf,Random Forest Classifier,0.2956,0.7428,0.2956,0.2747,0.274,0.2222,0.2245,0.3996,0.3
lr,Logistic Regression,0.2856,0.7192,0.2856,0.292,0.2747,0.2125,0.215,0.3886,0.642
lightgbm,Light Gradient Boosting Machine,0.2734,0.7263,0.2734,0.2504,0.2517,0.1993,0.2016,0.3785,0.518
knn,K Neighbors Classifier,0.2733,0.658,0.2733,0.2319,0.2265,0.1996,0.2076,0.3795,0.272
svm,SVM - Linear Kernel,0.2451,0.0,0.2451,0.2568,0.2304,0.1668,0.1706,0.3513,0.261


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

# Hyperparameters Tuning

In [11]:
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)

In [12]:
def lda(trial):
    solver = trial.suggest_categorical('solver', ['lsqr', 'eigen'])
    tol = trial.suggest_loguniform('tol', 1e-8, 10.0)
          
    model = LinearDiscriminantAnalysis(
        solver=solver,
        tol=tol
    )
    
    results = []
    
    for i, (train_index, test_index) in enumerate(rskf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)

        sorted_pred_idx = np.argsort(-y_pred, axis=1)[:,:3]
        original_shape = sorted_pred_idx.shape
        top3_pred = encoder.inverse_transform(sorted_pred_idx.reshape(-1,1))
        top3_pred = top3_pred.reshape(original_shape)
        results.append(mapk(y_test.reshape(-1, 1), sorted_pred_idx, k=3))
    
    return np.mean(results)

In [13]:
def ridge(trial):
    alpha = trial.suggest_int('alpha', 0, 1000)
    tol = trial.suggest_loguniform('tol', 1e-8, 10.0)
        
    model = RidgeClassifier(
        alpha=alpha,
        tol=tol
    )
    calibrated_model = CalibratedClassifierCV(model, method='sigmoid')
    
    results = []
    
    for i, (train_index, test_index) in enumerate(rskf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        calibrated_model.fit(X_train, y_train)
        y_pred = calibrated_model.predict_proba(X_test)

        sorted_pred_idx = np.argsort(-y_pred, axis=1)[:,:3]
        original_shape = sorted_pred_idx.shape
        top3_pred = encoder.inverse_transform(sorted_pred_idx.reshape(-1,1))
        top3_pred = top3_pred.reshape(original_shape)
        results.append(mapk(y_test.reshape(-1, 1), sorted_pred_idx, k=3))
    
    return np.mean(results)

In [14]:
def lr(trial):
    C = trial.suggest_loguniform('C', 0.001, 1000)
    solver = trial.suggest_categorical('solver', ['newton-cg', 'liblinear', 'sag', 'saga'])
    
    if solver=='sag' or solver=='newton-cg':
        penalty = 'l2'
        multi_class = trial.suggest_categorical('multi_class', ['ovr', 'multinomial'])
    elif solver=='liblinear':
        multi_class = 'ovr'
        penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
        if penalty==1:
            l1_ratio = trial.suggest_float(0, 1)
    elif solver=='saga':
        multi_class = trial.suggest_categorical('multi_class', ['ovr', 'multinomial'])
        penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
        if penalty==1:
            l1_ratio = trial.suggest_float(0, 1)
          
    model = LogisticRegression(
        C=C,
        penalty = penalty,
        solver=solver,
        multi_class=multi_class
    )
    
    results = []
    
    for i, (train_index, test_index) in enumerate(rskf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)

        sorted_pred_idx = np.argsort(-y_pred, axis=1)[:,:3]
        original_shape = sorted_pred_idx.shape
        top3_pred = encoder.inverse_transform(sorted_pred_idx.reshape(-1,1))
        top3_pred = top3_pred.reshape(original_shape)
        results.append(mapk(y_test.reshape(-1, 1), sorted_pred_idx, k=3))
    
    return np.mean(results)

In [15]:
def rf(trial):
    max_depth = trial.suggest_int('max_depth', 1, 100)
    n_estimators = trial.suggest_int('n_estimators', 10, 500)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 100)
    random_state = 0
          
    model = RandomForestClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        random_state=random_state
    )
    
    results = []
    
    for i, (train_index, test_index) in enumerate(rskf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)

        sorted_pred_idx = np.argsort(-y_pred, axis=1)[:,:3]
        original_shape = sorted_pred_idx.shape
        top3_pred = encoder.inverse_transform(sorted_pred_idx.reshape(-1,1))
        top3_pred = top3_pred.reshape(original_shape)
        results.append(mapk(y_test.reshape(-1, 1), sorted_pred_idx, k=3))
    
    return np.mean(results)

In [16]:
def xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-10, 1),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-10, 1),
        'gamma': trial.suggest_loguniform('gamma', 1e-10, 1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'eval_metric': 'mlogloss',
        'random_state': 0,
        'n_jobs': -1
    }
    
    model = XGBClassifier(**params)

    results = []
    
    for i, (train_index, test_index) in enumerate(rskf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)

        sorted_pred_idx = np.argsort(-y_pred, axis=1)[:,:3]
        original_shape = sorted_pred_idx.shape
        top3_pred = encoder.inverse_transform(sorted_pred_idx.reshape(-1,1))
        top3_pred = top3_pred.reshape(original_shape)
        results.append(mapk(y_test.reshape(-1, 1), sorted_pred_idx, k=3))
    
    return np.mean(results)

In [17]:
def et(trial):
    max_depth = trial.suggest_int('max_depth', 1, 100)
    n_estimators = trial.suggest_int('n_estimators', 10, 500)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 100)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None])
    random_state = 0

          
    model = ExtraTreesClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        max_features=max_features,
        random_state=random_state
    )
    
    results = []
    
    for i, (train_index, test_index) in enumerate(rskf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)

        sorted_pred_idx = np.argsort(-y_pred, axis=1)[:,:3]
        original_shape = sorted_pred_idx.shape
        top3_pred = encoder.inverse_transform(sorted_pred_idx.reshape(-1,1))
        top3_pred = top3_pred.reshape(original_shape)
        results.append(mapk(y_test.reshape(-1, 1), sorted_pred_idx, k=3))
    
    return np.mean(results)

In [18]:
def gbc(trial):
    #tol = trial.suggest_loguniform('tol', 1e-8, 10.0)
    max_depth = trial.suggest_int('max_depth', 1, 50)
    learning_rate = trial.suggest_loguniform('learning_rate', .001, 1)
    n_estimators = trial.suggest_int('n_estimators', 10, 500)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 100)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 100)
    random_state = 0
          
    model = GradientBoostingClassifier(
        #tol=tol,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        random_state=random_state
    )
    
    results = []
    
    for i, (train_index, test_index) in enumerate(rskf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)

        sorted_pred_idx = np.argsort(-y_pred, axis=1)[:,:3]
        original_shape = sorted_pred_idx.shape
        top3_pred = encoder.inverse_transform(sorted_pred_idx.reshape(-1,1))
        top3_pred = top3_pred.reshape(original_shape)
        results.append(mapk(y_test.reshape(-1, 1), sorted_pred_idx, k=3))
    
    return np.mean(results)

In [19]:
def lightgbm(trial):
    params = {
        'objective': 'multiclass',
        'boosting_type': 'gbdt',
        'metric': 'multi_logloss',
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-9, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-9, 10.0),
        'random_state': 0
    }

    model = LGBMClassifier(**params)

    results = []
    
    for i, (train_index, test_index) in enumerate(rskf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)

        sorted_pred_idx = np.argsort(-y_pred, axis=1)[:,:3]
        original_shape = sorted_pred_idx.shape
        top3_pred = encoder.inverse_transform(sorted_pred_idx.reshape(-1,1))
        top3_pred = top3_pred.reshape(original_shape)
        results.append(mapk(y_test.reshape(-1, 1), sorted_pred_idx, k=3))
    
    return np.mean(results)

In [20]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(lda, n_trials=100)
# study.best_params

[32m[I 2023-04-28 23:26:03,608][0m A new study created in memory with name: no-name-7fa36a06-e35b-4ec4-88c1-18f9294e41d4[0m


{'solver': 'lsqr', 'tol': 2.5966562393908653e-05}

In [21]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(ridge, n_trials=100)
# study.best_params

{'alpha': 486, 'tol': 0.0014398635972594188}

In [22]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(lr, n_trials=100)
# study.best_params

{'C': 215.45060185134096,
 'solver': 'saga',
 'multi_class': 'multinomial',
 'penalty': 'l2'}

In [23]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(rf, n_trials=100)
# study.best_params

{'max_depth': 18,
 'n_estimators': 446,
 'min_samples_leaf': 3,
 'min_samples_split': 41}

In [24]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(xgb, n_trials=100)
# study.best_params

{'n_estimators': 512,
 'max_depth': 7,
 'learning_rate': 0.005243083046118347,
 'subsample': 0.4982751538216017,
 'colsample_bytree': 0.7694953809888724,
 'reg_alpha': 0.027504512835287074,
 'reg_lambda': 0.012682982755744632,
 'gamma': 1.6811767926664402e-10,
 'min_child_weight': 9}

In [25]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(et, n_trials=100)
# study.best_params

{'max_depth': 48,
 'n_estimators': 44,
 'min_samples_leaf': 5,
 'min_samples_split': 74,
 'max_features': None}

In [26]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(gbc, n_trials=100)
# study.best_params

{'max_depth': 2,
 'learning_rate': 0.009155436031388185,
 'n_estimators': 415,
 'min_samples_leaf': 74,
 'max_leaf_nodes': 10}

In [27]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(lightgbm, n_trials=100)
# study.best_params

{'n_estimators': 209,
 'max_depth': 47,
 'num_leaves': 46,
 'learning_rate': 0.015615796678285741,
 'min_child_samples': 97,
 'subsample': 0.5839657165946931,
 'colsample_bytree': 0.7748930946250282,
 'reg_alpha': 2.309901295457889,
 'reg_lambda': 1.6941870192099517e-07}

In [29]:
lda_params = {
    'solver': 'lsqr', 
    'tol': 2.5966562393908653e-05
}

ridge_params = {
    'alpha': 486, 
    'tol': 0.0014398635972594188
}

lr_params = {
    'C': 215.45060185134096,
    'solver': 'saga',
    'multi_class': 'multinomial',
    'penalty': 'l2'
}

rf_params = {
    'max_depth': 18,
    'n_estimators': 446,
    'min_samples_leaf': 3,
    'min_samples_split': 41
}

xgb_params = {
    'n_estimators': 512,
    'max_depth': 7,
    'learning_rate': 0.005243083046118347,
    'subsample': 0.4982751538216017,
    'colsample_bytree': 0.7694953809888724,
    'reg_alpha': 0.027504512835287074,
    'reg_lambda': 0.012682982755744632,
    'gamma': 1.6811767926664402e-10,
    'min_child_weight': 9,
    'eval_metric': 'mlogloss'
}

et_params = {
    'max_depth': 48,
    'n_estimators': 44,
    'min_samples_leaf': 5,
    'min_samples_split': 74,
    'max_features': None
}

gbc_params = {
    'max_depth': 2,
    'learning_rate': 0.009155436031388185,
    'n_estimators': 415,
    'min_samples_leaf': 74,
    'max_leaf_nodes': 10
}

lightgbm_params = {
    'n_estimators': 209,
    'max_depth': 47,
    'num_leaves': 46,
    'learning_rate': 0.015615796678285741,
    'min_child_samples': 97,
    'subsample': 0.5839657165946931,
    'colsample_bytree': 0.7748930946250282,
    'reg_alpha': 2.309901295457889,
    'reg_lambda': 1.6941870192099517e-07
}

In [30]:
# lda_params = {
#     'solver': 'eigen', 
#     'tol': 2.7614720614865225e-06
# }

# ridge_params = {
#     'alpha': 365, 
#     'tol': 0.42705446738501834
# }

# rf_params = {
#     'max_depth': 27,
#     'n_estimators': 430,
#     'min_samples_leaf': 1,
#     'min_samples_split': 28
# }

# xgb_params = {
#     'n_estimators': 993,
#     'max_depth': 5,
#     'learning_rate': 0.005433316626408222,
#     'subsample': 0.7946220685545012,
#     'colsample_bytree': 0.23406190765836787,
#     'reg_alpha': 5.571859953931e-07,
#     'reg_lambda': 4.8441369828589075e-08,
#     'gamma': 1.0396193273787708e-08,
#     'min_child_weight': 1,
#     'eval_metric': 'mlogloss'
# }

# et_params = {
#     'max_depth': 100,
#     'n_estimators': 376,
#     'min_samples_leaf': 1,
#     'min_samples_split': 29,
#     'max_features': 'log2'
# }

# gbc_params = {
#     'max_depth': 24,
#     'learning_rate': 0.007593638696418762,
#     'n_estimators': 335,
#     'min_samples_leaf': 74,
#     'max_leaf_nodes': 71
# }

In [31]:
lda_model = LinearDiscriminantAnalysis(**lda_params)
ridge_model = CalibratedClassifierCV(RidgeClassifier(**ridge_params), method='sigmoid')
lr_model = LogisticRegression(**lr_params)
rf_model = RandomForestClassifier(**rf_params)
xgb_model = XGBClassifier(**xgb_params)
et_model = ExtraTreesClassifier(**et_params)
gbc_model = GradientBoostingClassifier(**gbc_params)
lightgbm_model = LGBMClassifier(**lightgbm_params)

In [32]:
models = {
    'lda': lda_model, 
    'ridge': ridge_model, 
    'lr': lr_model,
    'rf': rf_model,
    'xgb': xgb_model,
    'et': et_model,
    'gbc': gbc_model,
    'lightgbm': lightgbm_model,
}

# Models Evaluation

In [33]:
results_ensemble_models = {}

for name, model in models.items():
    res=[]
    for i, (train_index, test_index) in enumerate(rskf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)

        sorted_pred_idx = np.argsort(-y_pred, axis=1)[:,:3]
        original_shape = sorted_pred_idx.shape
        top3_pred = encoder.inverse_transform(sorted_pred_idx.reshape(-1,1))
        top3_pred = top3_pred.reshape(original_shape)
        res.append(mapk(y_test.reshape(-1, 1), sorted_pred_idx, k=3))
    results_ensemble_models[name] = res

In [34]:
for name, result in results_ensemble_models.items():
    print("----------\n" + name)
    print(np.mean(result))
    print(np.std(result))

----------
lda
0.45621710526315795
0.034843343941689364
----------
ridge
0.45376949317738796
0.03394711707311028
----------
lr
0.46122563352826507
0.03968868161192514
----------
rf
0.46362694931773873
0.04262901111868093
----------
xgb
0.46893031189083817
0.03644729155288192
----------
et
0.4709923245614035
0.03802588560019045
----------
gbc
0.4605488547758284
0.030948522081471686
----------
lightgbm
0.45349597953216375
0.025902998573274823


# Bagging Ensemble

In [33]:
# final_model = VotingClassifier(estimators=[('lda', lda_model), 
#                                            ('ridge', ridge_model), 
#                                            ('lr', lr_model),
#                                            ('rf', rf_model),
#                                            ('xgb', xgb_model),
#                                            ('et', et_model),
#                                            ('gbc', gbc_model), 
#                                            ('lightgbm', lightgbm_model)], 
#                                voting='soft')

In [35]:
final_model = VotingClassifier(estimators=[('xgb', xgb_model),
                                           ('et', et_model)], 
                               voting='soft')

results_ensemble = []
    
for i, (train_index, test_index) in enumerate(rskf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    final_model.fit(X_train, y_train)
    y_pred = final_model.predict_proba(X_test)

    sorted_pred_idx = np.argsort(-y_pred, axis=1)[:,:3]
    original_shape = sorted_pred_idx.shape
    top3_pred = encoder.inverse_transform(sorted_pred_idx.reshape(-1,1))
    top3_pred = top3_pred.reshape(original_shape)
    results_ensemble.append(mapk(y_test.reshape(-1, 1), sorted_pred_idx, k=3))

print(np.mean(results_ensemble))

0.47


# Model Training

In [36]:
final_model.fit(X, y)

VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=0.7694953809888724,
                                            enable_categorical=False,
                                            eval_metric='mlogloss',
                                            gamma=1.6811767926664402e-10,
                                            gpu_id=-1, importance_type=None,
                                            interaction_constraints='',
                                            learning_rate=0.005243083046118347,
                                            max_delta_step=0, max_de...
                                                   class_weight=None,
                                                   criterion='gini',
                       

# Prediction

In [37]:
final_predictions = final_model.predict_proba(pd.DataFrame(pca.fit_transform(test), columns=pca_column_names))

sorted_pred_idx = np.argsort(-final_predictions, axis=1)[:,:3]
original_shape = sorted_pred_idx.shape
top3_pred = encoder.inverse_transform(sorted_pred_idx.reshape(-1,1))
top3_pred = top3_pred.reshape(original_shape)

# Submission

In [38]:
submission = pd.read_csv('sample_submission.csv')
submission['prognosis'] = np.apply_along_axis(lambda x: np.array(' '.join(x), dtype="object"), 1, top3_pred)
submission.to_csv('submission_bagging_model_with_original_pca.csv', columns=['id', 'prognosis'], index=False)