In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
# from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from lightgbm import LGBMClassifier

from pycaret.classification import setup, evaluate_model, compare_models, plot_model, add_metric

import optuna

# Load data

In [2]:
# Read both competition files the same way, using the 'id' column as the index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('train.csv', 'test.csv')
)

# Brief EDA

In [3]:
# (rows, columns) of the training frame, label column included.
train.shape

(707, 65)

In [4]:
# (rows, columns) of the test frame.
test.shape

(303, 64)

In [5]:
# Summary statistics for the numeric columns of the training frame.
train.describe()

Unnamed: 0,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,hypotension,...,lymph_swells,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash
count,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0,...,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0
mean,0.503536,0.449788,0.459689,0.487977,0.51768,0.449788,0.441301,0.487977,0.390382,0.393211,...,0.148515,0.072136,0.097595,0.079208,0.084866,0.154173,0.144272,0.137199,0.032532,0.031117
std,0.500341,0.497825,0.498725,0.500209,0.500041,0.497825,0.496894,0.500209,0.488181,0.488809,...,0.355861,0.258896,0.296977,0.270254,0.278879,0.36137,0.351614,0.344301,0.177533,0.173758
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Peek at the first rows, including the 'prognosis' label column.
train.head()

Unnamed: 0_level_0,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,hypotension,...,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash,prognosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lyme_disease
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tungiasis
2,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,Lyme_disease
3,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zika
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,Rift_Valley_fever


In [7]:
# Separate the label column from the feature columns, then show the class balance.
target = train['prognosis']
X = train.drop(columns='prognosis')
target.value_counts()

West_Nile_fever          85
Japanese_encephalitis    81
Tungiasis                70
Rift_Valley_fever        70
Chikungunya              66
Dengue                   63
Yellow_Fever             61
Zika                     58
Plague                   53
Lyme_disease             52
Malaria                  48
Name: prognosis, dtype: int64

# Target transformation

In [8]:
# Fit the encoder on the disease names, then map them to integer codes.
# The fitted encoder is reused further down to turn predicted codes back into names.
encoder = LabelEncoder().fit(target)
y = encoder.transform(target)

In [9]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two sequences
    of items.
    Parameters
    ----------
    actual : list or array
             The elements that are to be predicted (order doesn't matter)
    predicted : list or array
                The predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            `actual` is None or empty
    """
    # Explicit None/len() check instead of `not actual`: truth-testing a numpy
    # array with more than one element raises ValueError.
    if actual is None or len(actual) == 0:
        return 0.0

    # Only the first k predictions count towards the score.
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # first condition checks whether it is a valid prediction
        # second condition checks that the prediction is not a repeat, so a
        # duplicated guess is only rewarded at its first position
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            # precision at this rank = hits so far / predictions so far
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """
    Computes the mean average precision at k.
    Each element of `actual` is paired with the element of `predicted` at the
    same position, scored with `apk`, and the scores are averaged.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    per_sample_scores = []
    for truth, ranking in zip(actual, predicted):
        per_sample_scores.append(apk(truth, ranking, k))
    return np.mean(per_sample_scores)

In [10]:
# Initialise the PyCaret experiment on the training frame with 'prognosis' as the label.
# session_id pins the random seed so the split and model comparison are
# repeatable; 5462 is the id the recorded run below drew at random.
_ = setup(data=train, target='prognosis', session_id=5462)

Unnamed: 0,Description,Value
0,Session id,5462
1,Target,prognosis
2,Target type,Multiclass
3,Target mapping,"Chikungunya: 0, Dengue: 1, Japanese_encephalitis: 2, Lyme_disease: 3, Malaria: 4, Plague: 5, Rift_Valley_fever: 6, Tungiasis: 7, West_Nile_fever: 8, Yellow_Fever: 9, Zika: 10"
4,Original data shape,"(707, 65)"
5,Transformed data shape,"(707, 65)"
6,Transformed train set shape,"(494, 65)"
7,Transformed test set shape,"(213, 65)"
8,Numeric features,64
9,Preprocess,True


In [13]:
# Register the custom `mapk` scorer so it appears as a 'MAPK' column in PyCaret's tables.
# NOTE(review): the recorded output lists Target 'pred', so `mapk` presumably
# receives one predicted label per row rather than a ranked top-k list — the
# MAPK column may not be a true MAP@3; confirm before relying on it.
add_metric(
    id='mapk',
    name='MAPK',
    score_func=mapk,
)

Name                                                  MAPK
Display Name                                          MAPK
Score Function       <function mapk at 0x000001EDF0D03048>
Scorer                                   make_scorer(mapk)
Target                                                pred
Args                                                    {}
Greater is Better                                     True
Multiclass                                            True
Custom                                                True
Name: mapk, dtype: object

In [15]:
# Run PyCaret's model comparison and keep its return value as `best`.
# NOTE(review): no `sort=` argument is passed, so the ranking presumably uses
# PyCaret's default metric rather than MAPK — confirm against the PyCaret docs.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,MAPK,TT (Sec)
rf,Random Forest Classifier,0.3157,0.7453,0.3157,0.3226,0.2974,0.2429,0.246,0.4204,0.15
et,Extra Trees Classifier,0.3136,0.7358,0.3136,0.3151,0.2956,0.2415,0.2441,0.4177,0.143
gbc,Gradient Boosting Classifier,0.2977,0.7267,0.2977,0.2827,0.2819,0.2239,0.2254,0.4078,0.476
lightgbm,Light Gradient Boosting Machine,0.2955,0.7303,0.2955,0.2933,0.2784,0.2221,0.2242,0.4037,0.477
lda,Linear Discriminant Analysis,0.2914,0.7098,0.2914,0.2797,0.2677,0.2192,0.2222,0.405,0.02
lr,Logistic Regression,0.2835,0.7102,0.2835,0.274,0.2647,0.2094,0.2117,0.3983,0.412
ridge,Ridge Classifier,0.2773,0.0,0.2773,0.2538,0.2435,0.2017,0.2054,0.3846,0.013
xgboost,Extreme Gradient Boosting,0.2734,0.73,0.2734,0.2541,0.251,0.1984,0.2004,0.3854,0.356
knn,K Neighbors Classifier,0.2671,0.6539,0.2671,0.2351,0.2214,0.1937,0.2009,0.3752,0.074
dt,Decision Tree Classifier,0.2611,0.5917,0.2611,0.2704,0.2534,0.1838,0.1858,0.3736,0.016


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [15]:
# Score the test set with the model kept from compare_models(); the original
# referenced `rf`, which is never defined in this notebook.
# Drop 'prognosis' if a previous run of the submission cell already added it to
# `test`, so this cell can be re-run safely.
test_features = test.drop(columns='prognosis', errors='ignore')
y_pred = best.predict_proba(test_features)

# Column indices of the three most probable classes per row, most probable first.
# NOTE(review): assumes predict_proba's columns follow the same class order as
# `encoder` (the setup output lists the same alphabetical mapping) — confirm.
sorted_pred_idx = np.argsort(-y_pred, axis=1)[:, :3]
original_shape = sorted_pred_idx.shape
# inverse_transform takes a 1-D array of codes: flatten, decode, restore the (n_rows, 3) shape.
top3_pred = encoder.inverse_transform(sorted_pred_idx.ravel())
top3_pred = top3_pred.reshape(original_shape)

In [None]:
# Join each row's three predicted labels into one space-separated string.
test['prognosis'] = [' '.join(row_labels) for row_labels in top3_pred]

# Submission

In [None]:
# Write only the index and the 'prognosis' column to the submission file.
test[['prognosis']].to_csv('submission_base_model.csv')