<p style="text-align: center"><img src="https://gitlab.aicrowd.com/aicrowd/assets/-/raw/master/challenges/clock-decomposition/notebook-banner.jpg?inline=false" alt="Drawing" style="height: 400px;"/></p>

# Install packages 🗃

Please add all package installations in this section.

In [None]:
!pip install numpy pandas lightgbm optuna scikit-Optimize imblearn

# Define preprocessing code 💻

The code that is common between the training and the prediction sections should be defined here. During evaluation, we completely skip the training section. Please make sure to add any common logic between the training and prediction sections here.

## Import common packages

Import packages that are common for training and prediction phases here.

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import joblib
import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import log_loss
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

## Fill in missing values
We define a function to fill in the missing values with different approaches.

In [None]:
# Define the function that fills in missing values
def fillin_na(df):
    """Impute missing values, add missing-value indicators and encode the
    categorical column.

    The input frame is left untouched; a processed copy is returned.

    Steps, in order:
      * ``time_diff_ind`` / ``centre_dot_detect_ind`` flag rows where the raw
        ``time_diff`` / ``centre_dot_detect`` value was missing (1) or not (0).
      * columns matching ``missing_digit`` are filled with 1.
      * ``<k> dist from cen`` and ``euc_dist_digit_<k>`` (k = 1..12) are filled
        with the column mean of the frame being processed.
      * columns matching ``area_digit`` / ``height_digit`` / ``width_digit``
        are filled with 0.
      * three fixed groups of columns are filled with -999, 0 and -1.
      * ``intersection_pos_rel_centre`` is filled with ``'Others'`` and one-hot
        encoded with prefix ``intersec_dire`` (first level dropped).
      * six unused columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame containing every column referenced above.

    Returns
    -------
    pandas.DataFrame
        New frame with the imputed, encoded and reduced feature set.

    Raises
    ------
    KeyError
        If one of the explicitly named columns is absent from ``df``.
    """
    # Work on a copy so the caller's frame is never partially modified.
    df = df.copy()

    # Capture missingness BEFORE the -999 sentinel overwrites the NaNs below;
    # computed afterwards these indicators would be all zeros.
    time_diff_missing = df['time_diff'].isna().astype(int)
    centre_dot_missing = df['centre_dot_detect'].isna().astype(int)

    # fill NA in columns matching 'missing_digit' with 1
    missing_digit_cols = df.filter(regex='missing_digit', axis=1).columns
    if len(missing_digit_cols):
        df[missing_digit_cols] = df[missing_digit_cols].fillna(1)

    # fill NA in 'dist from cen' & 'euc_dist_digit' columns with the column mean.
    # Plain assignment is used instead of a chained `fillna(inplace=True)`,
    # which is not guaranteed to write back to the frame.
    for digit in range(1, 13):
        for col in (f'{digit} dist from cen', f'euc_dist_digit_{digit}'):
            df[col] = df[col].fillna(df[col].mean())

    # fill NA in area_digit / height_digit / width_digit columns with 0
    size_cols = df.filter(
        regex=r'(area_digit|height_digit|width_digit)', axis=1).columns
    if len(size_cols):
        df[size_cols] = df[size_cols].fillna(0)

    # fill value with -999
    ex_neg_col = ['variance_width', 'variance_height', 'variance_area', 'deviation_dist_from_mid_axis',
                  'between_axis_digits_angle_sum', 'between_axis_digits_angle_var', 'time_diff', 'centre_dot_detect']
    df[ex_neg_col] = df[ex_neg_col].fillna(-999)

    # fill value with 0
    fill_with_zero = ['number_of_digits', 'number_of_hands', 'hand_count_dummy',
                      'hour_hand_length', 'minute_hand_length', 'single_hand_length', 'clockhand_ratio', 'clockhand_diff',
                      'angle_between_hands', 'double_major', 'double_minor', 'vertical_dist', 'horizontal_dist']
    df[fill_with_zero] = df[fill_with_zero].fillna(0)

    # fill value with -1
    fill_with_minus_one = ['deviation_from_centre', 'hour_proximity_from_11', 'minute_proximity_from_2',
                           'hour_pointing_digit', 'minute_pointing_digit', 'final_rotation_angle',
                           'top_area_perc', 'bottom_area_perc', 'left_area_perc', 'right_area_perc',
                           'between_digits_angle_cw_var', 'ellipse_circle_ratio', 'sequence_flag_cw', 'percentage_inside_ellipse']
    df[fill_with_minus_one] = df[fill_with_minus_one].fillna(-1)

    # Add the missing-value indicators (appended here so the column order of
    # the returned frame is the same as before).
    df['time_diff_ind'] = time_diff_missing
    df['centre_dot_detect_ind'] = centre_dot_missing

    # fill missing categorical variables
    df['intersection_pos_rel_centre'] = df['intersection_pos_rel_centre'].fillna(
        'Others')
    # One-hot encode the categorical variable.
    # NOTE(review): the dummy columns depend on the categories present in the
    # frame being processed; confirm train and test share the same levels.
    df = pd.get_dummies(data=df, prefix=['intersec_dire'], columns=[
                        'intersection_pos_rel_centre'], drop_first=True)

    # drop unnecessary variables (no imputation needed since they are removed)
    drop_col = ['between_digits_angle_ccw_sum', 'between_digits_angle_cw_sum',
                'between_digits_angle_ccw_var', 'sequence_flag_ccw', 'actual_hour_digit', 'actual_minute_digit']
    df = df.drop(drop_col, axis=1)

    return df

# Training phase ⚙️


## Load training data

In [None]:
# Read the training split (its path is derived from the validation path by
# swapping "validation" for "train") and impute its missing values in one step
train = fillin_na(
    pd.read_csv(AICROWD_DATASET_PATH.replace("validation", "train")))

In [None]:
# Number of rows per diagnosis class before any resampling
train.diagnosis.value_counts()

## Preprocessing - balance the data

### Simple undersample with random drop

In [None]:
# Pre-process the training data

# Simple undersampling: keep every pre-/post-Alzheimer row and a random 1/6 of
# the 'normal' rows. The sample is seeded (same seed as the StratifiedKFold
# below) so that re-running the notebook trains on the same subset.
train_us = pd.concat([
    train.loc[train.diagnosis == 'pre_alzheimer'],
    train.loc[train.diagnosis == 'post_alzheimer'],
    train.loc[train.diagnosis == 'normal'].sample(frac=1/6, random_state=2021),
]).reset_index(drop=True)

# Separate the target variable
train_y = train_us['diagnosis'].values.ravel()

# Label encode the target variable
label_encoder = LabelEncoder()
train_y = label_encoder.fit_transform(train_y)

# Drop the identifier and the target so only model inputs remain
cat_col = ['row_id', 'diagnosis']
train_no_cat = train_us.drop(cat_col, axis=1)

### Oversample with SMOTE and undersample with random drop

In [None]:
# Disabled experiment: the whole cell is one string literal, so none of the
# code inside it runs. It sketches a SMOTE over-sampling step followed by
# random under-sampling as an alternative to the simple under-sampling cell.
''' Not good as Random Drop
# SMOTE
cat_col = ['row_id','diagnosis']
train_no_cat = train.drop(cat_col,axis=1)
train_y = train['diagnosis']

over = SMOTE(sampling_strategy={
    'normal':31208,
    'post_alzheimer':1149*5,
    'pre_alzheimer':420*5
})
under = RandomUnderSampler(sampling_strategy={
    'normal':int(31208/4), # keeping 25%
    'post_alzheimer':1149*5, # Keeping all of the samples we generated in the previous step
    'pre_alzheimer':420*5
})
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
train_no_cat, train_y = pipeline.fit_resample(train_no_cat, train_y)

train_y = train_y.values.ravel()

# Label encode the target variable
label_encoder = LabelEncoder()
train_y = label_encoder.fit_transform(np.array(train_y))
'''

## Feature Importance Test

In [None]:
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

# Fit the selector on the under-sampled features/labels prepared above.
# (X_train / y_train are only defined further down in the LightGBM section,
# so referring to them here fails on a fresh kernel.)
model = XGBClassifier()
fs = SelectFromModel(model)
fs.fit(train_no_cat, train_y)
# train_no_cat_fs = fs.transform(train_no_cat)

# Boolean mask of the features kept by the selector -> their column names
feature_idx = fs.get_support()
feature_name = train_no_cat.columns[feature_idx]

print(feature_name)

## Hyper-parameter Tuning - LGB

In [None]:
# Disabled experiment: the whole cell is one string literal, so none of the
# code inside it runs. It sketches an Optuna search over LightGBM
# hyper-parameters that minimises multi-class log loss on a 20% hold-out split.
'''
import optuna


def objective(trial):
   
    X_train, X_valid, y_train, y_valid = train_test_split(train_no_cat,train_y, test_size=0.2)
    
    
    param = {
        "objective": "multiclass",
        'num_class': 3, 
        "metric": "multi_logloss",
        "boosting_type": "gbdt",
        "num_leaves": trial.suggest_int("num_leaves", 64, 256),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf", 20, 100),  
        'learning_rate': trial.suggest_float("learning_rate", 1e-8, 1.0, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        'is_unbalance': True,
        'verbosity': -1
    }

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval  = lgb.Dataset(X_valid, y_valid, reference = lgb_train)
    model_lgb = lgb.train(param, lgb_train, 500, valid_sets=[lgb_eval], 
                    early_stopping_rounds=100)
    
    pred = model_lgb.predict(X_valid,num_iteration = model_lgb.best_iteration)
        
    logloss_lgb = log_loss(y_valid,pred)

    return logloss_lgb

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

'''

## LightGBM

In [None]:
# Cross-validation inputs: the feature frame as-is and the encoded labels
# wrapped in a DataFrame so they can be sliced with .iloc per fold
X_train, y_train = train_no_cat, pd.DataFrame(train_y)

# Five shuffled, stratified folds with a fixed seed
skf = StratifiedKFold(shuffle=True, random_state=2021, n_splits=5)

In [None]:
# One LightGBM booster per stratified fold; each booster is early-stopped on
# its own held-out fold.
model_lgbs = []

params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'num_leaves': 128,
    'min_data_in_leaf': 100,
    'learning_rate': 0.04,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_gain_to_split': 0.2,
    # NOTE(review): LightGBM documents is_unbalance for the binary and
    # multiclassova objectives; it may have no effect with 'multiclass' - confirm.
    'is_unbalance': True,
    'verbosity': -1
}

for fold, (itrain, ivalid) in enumerate(skf.split(X_train, y_train)):
    print("-"*40)
    print(f"Running for fold {fold}")
    lgb_train = lgb.Dataset(X_train.iloc[itrain], y_train.iloc[itrain])
    lgb_eval = lgb.Dataset(
        X_train.iloc[ivalid], y_train.iloc[ivalid], reference=lgb_train)
    # Early stopping is requested through the callback API: the
    # `early_stopping_rounds` keyword of lgb.train() was removed in LightGBM 4,
    # while the callback is accepted by older releases as well.
    model_lgb = lgb.train(params, lgb_train, num_boost_round=500,
                          valid_sets=[lgb_eval],
                          callbacks=[lgb.early_stopping(stopping_rounds=100)])

    model_lgbs.append(model_lgb)

## CatBoost

In [None]:
from catboost import CatBoostClassifier

# Train one default-configured CatBoost classifier per stratified fold; only
# the training indices of each split are used for fitting.
model_cats = []

for fold, (itrain, ivalid) in enumerate(skf.split(X_train, y_train)):
    print("-" * 40)
    print(f"Running for fold {fold}")
    fold_features = X_train.iloc[itrain]
    fold_labels = y_train.iloc[itrain].values.ravel()  # flatten to 1-D labels
    model_cat = CatBoostClassifier()
    model_cat.fit(fold_features, fold_labels)
    model_cats.append(model_cat)

## Model selection

In [None]:
# Validation features: read, impute with the shared preprocessing function,
# then remove the identifier column so only model inputs remain
valid_data = fillin_na(pd.read_csv(AICROWD_DATASET_PATH))
valid_no_cat = valid_data.drop(columns=['row_id'])

In [None]:
# Ground-truth diagnoses for the validation split; the file path is derived
# from the validation path by substituting "validation_ground_truth"
valid_y = pd.read_csv(
    AICROWD_DATASET_PATH.replace("validation", "validation_ground_truth")
)['diagnosis']

# Number of fold models being averaged
nb_folds = 5  # skf.n_splits

In [None]:
# LightGBM: accumulate each fold model's validation probabilities, weighted
# by 1/nb_folds, using the best iteration found by early stopping
lgb_preds = 0.0

for fold, model_lgb in enumerate(model_lgbs):
    print("-" * 40)
    print(f"Running for fold {fold}")
    pred = model_lgb.predict(valid_no_cat,
                             num_iteration=model_lgb.best_iteration)
    lgb_preds = lgb_preds + pred / nb_folds

In [None]:
# CatBoost: accumulate each fold model's validation class probabilities,
# weighted by 1/nb_folds
cat_preds = 0.0

for fold, model_cat in enumerate(model_cats):
    print("-" * 40)
    print(f"Running for fold {fold}")
    pred = model_cat.predict_proba(valid_no_cat)
    cat_preds = cat_preds + pred / nb_folds

In [None]:
# Multi-class log loss of the fold-averaged probabilities on the validation set
logloss_lgb = log_loss(valid_y, lgb_preds)
logloss_cat = log_loss(valid_y, cat_preds)

print(f"LGB logloss: {logloss_lgb!s}")
print(f"Catboost logloss: {logloss_cat!s}")

## Save the trained model
We decided to proceed with LightGBM.

In [None]:
# Persist every fold's LightGBM model to the assets directory, one pickle per
# fold, so the prediction phase can reload them by fold index
for i, model_lgb in enumerate(model_lgbs):
    model_filename = f'{AICROWD_ASSETS_DIR}/model_lgb_fold_{i}.pkl'
    joblib.dump(value=model_lgb, filename=model_filename)

# Prediction phase 🔎

Please make sure to save the weights from the training section in your assets directory and load them in this section.

In [None]:
# Reload the per-fold LightGBM models written by the training phase.
# nb_folds is set again here because this cell does not rely on the
# training-phase cells having been run.
nb_folds = 5  # skf.n_splits
clfs = []

for fold in range(nb_folds):
    print("-" * 40)
    print(f"Running for fold {fold}")
    model_filename = f'{AICROWD_ASSETS_DIR}/model_lgb_fold_{fold}.pkl'
    clfs.append(joblib.load(model_filename))

## Load test data

In [None]:
# Test features: read, impute with the shared preprocessing function, then
# remove the identifier column so only model inputs remain
test_data = fillin_na(pd.read_csv(AICROWD_DATASET_PATH))
test_no_cat = test_data.drop(columns=['row_id'])

## Generate predictions

In [None]:
# Accumulate each fold model's class probabilities, weighted by 1/nb_folds
preds = 0.0

for fold, clf in enumerate(clfs):
    print("-" * 40)
    print(f"Running for fold {fold}")
    pred = clf.predict(test_no_cat, num_iteration=clf.best_iteration)
    preds = preds + pred / nb_folds

In [None]:
# Assemble the submission table: one row per test sample with the three class
# probabilities taken from prediction columns 0, 1 and 2.
# NOTE(review): this assumes the column order normal / post_alzheimer /
# pre_alzheimer used when the labels were encoded for training - confirm.
predictions = dict(
    row_id=test_data["row_id"].values,
    normal_diagnosis_probability=preds[:, 0],
    post_alzheimer_diagnosis_probability=preds[:, 1],
    pre_alzheimer_diagnosis_probability=preds[:, 2],
)

predictions_df = pd.DataFrame(predictions)

## Save predictions 📨

In [None]:
# Write the submission file without the DataFrame index column
predictions_df.to_csv(path_or_buf=AICROWD_PREDICTIONS_PATH, index=False)

# Submit to AIcrowd 🚀