# Table of contents
1. [Data Preparation](#dprep)
2. [Exploratory Data Analysis](#expda)
3. [Feature Engineering](#fe)
4. [Data Cleansing](#dclean)
5. [Modeling](#model)
6. [Evaluation](#eval)

## 1. Data Preparation <a name="dprep"></a>

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from datetime import datetime
from imblearn.over_sampling import RandomOverSampler 
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt

In [None]:
# Data Preparation
def load_data(main_path, diag_path, proc_path):
    """Read the three input CSV files into DataFrames.

    Parameters
    ----------
    main_path, diag_path, proc_path : str or path-like
        Locations of the main, diagnosis and procedure CSV files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(main, diag, proc)``, in the same order as the arguments.
    """
    sources = (main_path, diag_path, proc_path)
    frames = tuple(pd.read_csv(source) for source in sources)
    return frames

In [None]:
# Load the main, diagnosis and procedure training tables.
# NOTE(review): the first path is spelled 'healtkathon' while the other two use
# 'healthkathon' — confirm this matches the actual directory/file name on disk.
df_main, df_diag, df_proc = load_data('sampling_healtkathon2022/sampling_healtkathon2022.csv', 
                                    'sampling_healthkathon2022_diagnosa/sampling_healthkathon2022_diagnosa.csv', 
                                    'sampling_healthkathon2022_procedure/sampling_healthkathon2022_procedure.csv')

## 2. Exploratory Data Analysis <a name="expda"></a>

#### Mencari persebaran 0 dan 1

In [None]:
# Number of rows per distinct value of the 'label' column.
label_dist = df_main.groupby('label').size()

In [None]:
# Bar chart of the label counts.
# NOTE(review): the tick labels are hard-coded, so this assumes label_dist has
# exactly two entries, ordered 0 then 1 — verify against the data.
plt.bar(['0','1'],label_dist, align='center', alpha=0.5)
plt.show()

#### Mencari jumlah NA di setiap kolom

In [None]:
# Count of missing values in each column of the main table.
na_main = df_main.isna().sum()

In [None]:
# Display the per-column missing-value counts.
na_main

## 3. Feature Engineering <a name="fe"></a>

In [None]:
# Count how many diagnosis / procedure rows occur for each id

def merge_main_diag_proc(main, diag, proc):
    """Attach per-id row counts from the diagnosis and procedure tables to `main`.

    Parameters
    ----------
    main : pandas.DataFrame
        Main table; must be mergeable on 'id'.
    diag, proc : pandas.DataFrame
        Diagnosis and procedure tables; each must have an 'id' column.

    Returns
    -------
    pandas.DataFrame
        `main` left-merged with two extra columns, 'occur_diagnosis' and
        'occur_procedure', holding the number of rows each id has in `diag`
        and `proc`. Ids with no rows in a table get NaN in that column.
    """
    def _occurrences(table, column_name):
        # One-column frame holding the number of rows per 'id' in `table`.
        per_id = table.groupby('id').size()
        return pd.DataFrame().assign(**{column_name: per_id})

    with_diagnosis = main.merge(
        _occurrences(diag, 'occur_diagnosis'), on='id', how='left'
    )
    with_both = with_diagnosis.merge(
        _occurrences(proc, 'occur_procedure'), on='id', how='left'
    )
    return with_both

In [None]:
# Add the per-id diagnosis and procedure occurrence counts to the main table.
df_merged = merge_main_diag_proc(df_main,df_diag,df_proc)

In [None]:
# Display the merged table for inspection.
df_merged

In [None]:
def drop_columns (merged):
    """Remove the identifier columns and rescale the cost column.

    Drops 'id' and 'id_peserta', adds 'biaya_bagi100' (the 'biaya' column
    divided by 100) and then drops the original 'biaya' column.

    Parameters
    ----------
    merged : pandas.DataFrame
        Frame containing 'id', 'id_peserta' and 'biaya'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unchanged.
    """
    trimmed = merged.drop(columns=['id', 'id_peserta'])
    trimmed['biaya_bagi100'] = trimmed['biaya'] / 100
    return trimmed.drop(columns=['biaya'])

In [None]:
# Drop the identifier columns and replace 'biaya' with 'biaya_bagi100'.
merged_dropped = drop_columns(df_merged)

In [None]:
# Display the frame after the column drops.
merged_dropped

In [None]:
# Compute the difference between two dates

def days_between(d1, d2):
    """Return the absolute number of days between two 'YYYY-MM-DD' strings.

    The order of the arguments does not matter. Raises ValueError (from
    ``datetime.strptime``) if either string does not match '%Y-%m-%d'.
    """
    date_format = "%Y-%m-%d"
    first, second = (datetime.strptime(text, date_format) for text in (d1, d2))
    gap = second - first
    return abs(gap.days)

def process_difference (merged_dropped):
    """Replace the arrival/departure date columns with their gap in days.

    Adds a 'Selisih' column holding the absolute number of days between
    'tgldatang' and 'tglpulang', then drops both date columns.

    Only the first 10 characters of each date string ('YYYY-MM-DD') are used,
    so a trailing time component is ignored.

    Parameters
    ----------
    merged_dropped : pandas.DataFrame
        Frame with string columns 'tgldatang' and 'tglpulang'. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as the input. 'Selisih' is an integer
        column when every row has both dates; rows with a missing date get NaN
        (which makes the column float).

    Raises
    ------
    ValueError
        If a non-missing date does not start with a valid 'YYYY-MM-DD' date.
    """
    date_format = "%Y-%m-%d"

    # Vectorised parsing: aligns on the frame's own index, so it is correct for
    # any index, not only the default 0..n-1 one.
    arrival = pd.to_datetime(merged_dropped['tgldatang'].str[:10], format=date_format)
    departure = pd.to_datetime(merged_dropped['tglpulang'].str[:10], format=date_format)

    # drop() returns a new frame, so the caller's frame is left untouched.
    result = merged_dropped.drop(columns=['tgldatang', 'tglpulang'])
    result['Selisih'] = (departure - arrival).dt.days.abs()

    return result


In [None]:
# Replace the two date columns with the 'Selisih' day-difference feature.
with_selisih = process_difference(merged_dropped)

## 4. Data Cleansing <a name="dclean"></a>

In [None]:
def process_na(no_na):
    """Fill missing values in place and return the same frame.

    - 'jenkel', 'pisat', 'diagfktp', 'jenispulang': filled with the column's
      most frequent value (left untouched if the column has no non-missing
      value to take a mode from).
    - 'occur_diagnosis', 'occur_procedure': filled with 0.
    - 'politujuan', 'kdsa', 'kdsp', 'kdsr', 'kdsi', 'kdsd': filled with a
      per-column 'Z…' placeholder string.

    Parameters
    ----------
    no_na : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Impute with the most frequent value.
    for column in ('jenkel', 'pisat', 'diagfktp', 'jenispulang'):
        modes = no_na[column].mode()
        # mode() is empty when every value is missing; nothing to impute with.
        if not modes.empty:
            no_na[column] = no_na[column].fillna(modes.iloc[0])

    # A missing occurrence count is filled with 0 for both count columns.
    no_na['occur_procedure'] = no_na['occur_procedure'].fillna(0)
    # 'occur_diagnosis' is treated as optional so frames without it are still
    # accepted.
    if 'occur_diagnosis' in no_na.columns:
        no_na['occur_diagnosis'] = no_na['occur_diagnosis'].fillna(0)

    # Placeholder strings for missing codes (one distinct value per column).
    placeholders = {
        'politujuan': 'ZZZ',
        'kdsa': 'ZZZ',
        'kdsp': 'ZZZZ',
        'kdsr': 'ZZZZZ',
        'kdsi': 'ZZZZZZ',
        'kdsd': 'ZZZZZZZ',
    }
    for column, placeholder in placeholders.items():
        no_na[column] = no_na[column].fillna(placeholder)

    return no_na

In [None]:
# Fill missing values. process_na modifies and returns the frame it is given,
# so `cleanril` and `with_selisih` refer to the same object afterwards.
cleanril = process_na(with_selisih)

# Display the cleaned frame.
cleanril

In [None]:
# Persist the cleaned data (without the index) so the later checkpoint cell can
# reload it from 'clean_final.csv' instead of redoing the preprocessing.
cleanril.to_csv('clean_final.csv', index=False)

In [None]:
# ==================== CHECKPOINT ====================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from datetime import datetime
from imblearn.over_sampling import RandomOverSampler 
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error as mse

# Reload the cleaned data written earlier to 'clean_final.csv'.
# Column dtypes are re-inferred by read_csv on load.
clean = pd.read_csv('clean_final.csv')

In [None]:
# `Pred` is a second name for the same DataFrame object as `clean` (no copy is
# made), so it still contains the 'label' column.
Pred = clean

In [None]:
# Print column names, non-null counts and dtypes of the reloaded data.
clean.info()

In [None]:
# Build the training data
def convert_to_train(clean, random_state=None):
    """Build a down-sampled, re-balanced train/test split from the cleaned data.

    Steps:
    1. Keep every row with label 1 and a random third of the rows with label 0.
    2. Shuffle, then split off 1% as a stratified test set (fixed seed 42).
    3. Randomly over-sample the training part with
       ``RandomOverSampler(sampling_strategy=0.08)``.

    Parameters
    ----------
    clean : pandas.DataFrame
        Cleaned data containing a binary 'label' column (values 0 and 1).
    random_state : int or None, optional
        Seed for the row sampling, the shuffle and the over-sampler. The default
        ``None`` keeps these steps non-deterministic; pass an int for
        reproducible output. The train/test split always uses seed 42.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Features and labels; only the training part is over-sampled.
    """
    positives = clean.loc[clean['label'] == 1]  # every row labelled 1
    negatives = clean.loc[clean['label'] == 0]  # every row labelled 0

    # Down-sample the 0 class to one third of its size (not to the size of the
    # 1 class).
    negatives_sampled = negatives.sample(n=len(negatives) // 3,
                                         random_state=random_state)

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    # sample(frac=1) shuffles the combined rows.
    Train = pd.concat([negatives_sampled, positives]).sample(
        frac=1, random_state=random_state)

    X = Train.drop(columns=['label'])
    y = Train.label

    X_train_pre, X_test, y_train_pre, y_test = train_test_split(
        X, y, stratify=y, test_size=0.01, random_state=42)

    # Over-sample after the split so duplicated rows never reach the test set.
    # NOTE(review): imbalanced-learn raises if the classes are already more
    # balanced than the requested 0.08 ratio — confirm this holds for the data.
    oversample = RandomOverSampler(sampling_strategy=0.08,
                                   random_state=random_state)

    X_train, y_train = oversample.fit_resample(X_train_pre, y_train_pre)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the over-sampled training set and the small held-out test set.
X_train, X_test, y_train, y_test = convert_to_train(clean)

## 5. Modeling <a name="model"></a>

In [None]:
def model_fitting(X_train, X_test, y_train, y_test, task_type='GPU'):
    """Fit a CatBoost classifier and print its RMSE on the test set.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, used only for the printed score.
    task_type : str, optional
        Passed to ``CatBoostClassifier``. Defaults to 'GPU' (the value this
        function has always used); pass 'CPU' on machines without a GPU.

    Returns
    -------
    CatBoostClassifier
        The fitted model.
    """
    # Columns handed to CatBoost as categorical; shared by both pools so the
    # two lists cannot drift apart.
    cat_features = ['typefaskes', 'jenkel', 'politujuan', 'diagfktp', 'cbg',
                    'kdsa', 'kdsp', 'kdsr', 'kdsi', 'kdsd']

    catboost_model = CatBoostClassifier(n_estimators=700,
                        loss_function='CrossEntropy',
                        learning_rate=0.4375,
                        depth=4, task_type=task_type,
                        random_state=1,
                        verbose=False)

    pool_train = Pool(X_train, y_train, cat_features=cat_features)
    pool_test = Pool(X_test, cat_features=cat_features)

    catboost_model.fit(pool_train)

    # predict() is called with its defaults, so the RMSE is computed on
    # predicted classes rather than probabilities.
    y_pred = catboost_model.predict(pool_test)
    cb_rmse = np.sqrt(mse(y_test, y_pred))
    print("RMSE:", cb_rmse)

    return catboost_model

In [None]:
# Train the CatBoost model and print its RMSE on the held-out test set.
catboost_model = model_fitting(X_train, X_test, y_train, y_test)

In [None]:
# Predict on the full cleaned dataset.
# The model was trained on features only (convert_to_train drops 'label'), and
# `Pred` still contains that column, so remove the target before predicting.
Hasilpred = catboost_model.predict(Pred.drop(columns=['label']))

## 6. Evaluation <a name="eval"></a>

In [None]:
# Evaluation
def evaluate(clean, Hasilpred):
    """Print classification metrics for predictions against ``clean.label``.

    Prints accuracy, precision, recall and specificity derived from the
    confusion matrix, followed by the ROC AUC score.

    Parameters
    ----------
    clean : pandas.DataFrame
        Frame whose 'label' column holds the true labels.
    Hasilpred : array-like
        Predicted labels, one per row of `clean`.

    Notes
    -----
    ``roc_auc_score`` is given the same `Hasilpred` values as the confusion
    matrix, not separate probability scores.
    """
    actual = clean.label
    predicted = Hasilpred

    # For binary labels ravel() yields the counts in the order tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(actual, predicted).ravel()

    accuracy = (tn+tp) / (tn+fp+tp+fn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    specificity = tn/(tn+fp)

    print("Accuracy    :", accuracy, "\nPrecision   :", precision, "\nRecall      :", recall, "\nSpecifity   :", specificity)

    auc = roc_auc_score(actual, predicted)
    print("ROC AUC     :", auc)

In [None]:
# Score the predictions against the labels in `clean`.
# NOTE(review): `clean` is also the frame the training data was drawn from
# (convert_to_train(clean)), so these metrics include rows the model was fit on.
evaluate(clean, Hasilpred)

In [None]:
# =================== START NEW CSV ===================

def pipeline_pred(main_path, diag_path, proc_path):
    """Run the full preprocessing chain on a new set of CSV files.

    Applies, in order: load_data, merge_main_diag_proc, drop_columns,
    process_difference and process_na.

    Parameters
    ----------
    main_path, diag_path, proc_path : str or path-like
        Locations of the main, diagnosis and procedure CSV files.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame returned by process_na.
    """
    main, diag, proc = load_data(main_path, diag_path, proc_path)
    features = drop_columns(merge_main_diag_proc(main, diag, proc))
    return process_na(process_difference(features))

In [None]:
# Preprocess the prediction files with the same steps used for the training data.
# NOTE(review): the procedure file is named 'healthkathon_2022' (extra
# underscore) unlike the other two paths — confirm it matches the file on disk.
pred_real = pipeline_pred('Pred/sampling2_healthkathon2022_sep.csv', 
                            'Pred/sampling2_healthkathon2022_diagnosa.csv',
                            'Pred/sampling2_healthkathon_2022_procedure.csv')
# Display the preprocessed prediction frame.
pred_real

In [None]:
# Save the preprocessed prediction data (without the index) to 'pred_clean.csv'.
pred_real.to_csv('pred_clean.csv', index=False)

In [None]:
# Predict labels for the preprocessed prediction data.
# NOTE(review): assumes pred_real has exactly the training feature columns
# (in particular no 'label' column) — confirm against the prediction CSV.
# NOTE(review): unlike the training data, pred_real is not reloaded from CSV,
# so its column dtypes may differ from those seen in training — verify.
hasil_pred = catboost_model.predict(pred_real)

In [None]:
# Reload the raw prediction file to recover the 'id' column, which
# drop_columns removed during preprocessing.
original = pd.read_csv('Pred/sampling2_healthkathon2022_sep.csv')

In [None]:
# Assemble the output table: one row per id with its predicted label.
# Assumes hasil_pred is in the same row order as `original` — the pipeline
# shown here does not reorder rows, but confirm if it changes.
answer = pd.DataFrame()
answer['id']= original['id']
answer['label'] = hasil_pred

print(answer)

# Save to CSV
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra first column — confirm this is the expected format.
answer.to_csv('answer.csv')