## IMPORT LIBRARIES

In [1]:
# data wrangling 

import pandas as pd
import numpy as np
import re

# feature engineering

import scipy.stats as ss
import statsmodels.api as sm
from scipy.stats import anderson, mannwhitneyu, chi2_contingency
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score

# models

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.neighbors import KernelDensity
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

# model evaluation

from yellowbrick.model_selection import learning_curve, ValidationCurve
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, auc, accuracy_score, f1_score, log_loss, precision_score, recall_score
from sklearn.datasets import make_classification

# filter warning

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=FutureWarning)

## OVERVIEW

In [2]:
# load data

df = pd.read_csv('data_hujan_full.csv')

In [3]:
# show top 5

df.head()

Unnamed: 0,convective_available_potential_energy,k_index,cross_totals_index,vertical_totals_index,lifted_index,showalter_index,temp_1000_hpa,dewpoint_1000_hpa,temp_850_hpa,dewpoint_850_hpa,temp_700_hpa,dewpoint_700_hpa,temp_500_hpa,dewpoint_500_hpa,rain
0,508.22,35.4,20.1,22.9,-1.48,0.16,27.0,22.2,18.6,15.8,10.2,6.9,-4.3,11.3,0
1,508.22,35.4,20.1,22.9,-1.48,0.16,27.0,22.2,18.6,15.8,10.2,6.9,-4.3,11.3,0
2,952.96,32.2,19.2,22.1,-2.35,1.33,26.2,23.5,18.0,15.1,11.0,6.0,-4.1,26.1,0
3,952.96,32.2,19.2,22.1,-2.35,1.33,26.2,23.5,18.0,15.1,11.0,6.0,-4.1,26.1,0
4,276.15,33.7,17.1,23.1,-0.87,2.82,26.2,22.1,19.0,13.0,9.0,6.6,-4.1,34.1,0


In [4]:
# check info on initial data

pd.DataFrame({'Features' : df.columns, 'dataType' : df.dtypes.values, 
              'nullvalues' : [df[i].isna().sum() for i in df.columns],
              'unique' : [df[i].nunique() for i in df.columns],
              'uniqueSample' : [list(pd.Series(df[i].unique()).sample()) for i in df.columns]}).reset_index(drop = True)

Unnamed: 0,Features,dataType,nullvalues,unique,uniqueSample
0,convective_available_potential_energy,float64,0,967,[49.39]
1,k_index,float64,0,303,[-6.5]
2,cross_totals_index,float64,0,135,[22.3]
3,vertical_totals_index,float64,0,42,[21.7]
4,lifted_index,float64,0,644,[0.61]
5,showalter_index,float64,0,645,[-0.77]
6,temp_1000_hpa,float64,0,34,[27.0]
7,dewpoint_1000_hpa,float64,0,95,[23.8]
8,temp_850_hpa,float64,0,33,[20.8]
9,dewpoint_850_hpa,float64,0,135,[5.4]


## FEATURE ENGINEERING

In [5]:
# Split target features

X = df.drop(['rain'], axis=1)
y = df['rain']

In [6]:
# show X & y data

print(X,y)

      convective_available_potential_energy  k_index  cross_totals_index  \
0                                    508.22     35.4                20.1   
1                                    508.22     35.4                20.1   
2                                    952.96     32.2                19.2   
3                                    952.96     32.2                19.2   
4                                    276.15     33.7                17.1   
...                                     ...      ...                 ...   
2539                                 236.40     34.5                21.5   
2540                                 706.46     30.5                19.1   
2541                                 706.46     30.5                19.1   
2542                                 119.45     32.4                18.8   
2543                                 119.45     32.4                18.8   

      vertical_totals_index  lifted_index  showalter_index  temp_1000_hpa  \
0         

## PARAMETER TUNING

In [15]:
# random forest with hyperparameter model build + pipelining

RFC_pipe  = Pipeline([('scale', RobustScaler()),
                     ('clf', RandomForestClassifier())])

RFC_param = {'clf__max_depth': [10,20,40,'None'], 
             'clf__min_samples_leaf' : [2,4,8,10],
             'clf__min_samples_split' : [2,10,100,500],
             'clf__n_estimators' : [100,500,1000]}

RSCV_RFC  = RandomizedSearchCV(RFC_pipe, RFC_param, cv=10, scoring='f1')

In [16]:
# xgboost with hyperparameter model build + pipelining

XGB_pipe  = Pipeline([('scale', RobustScaler()),
                     ('clf', XGBClassifier())])

XGB_param = {'clf__max_depth': [3,5,10], 
             'clf__gamma' : [1,2],
             'clf__reg_alpha' : [40,180],
             'clf__reg_lambda' : [0,1],
             'clf__colsample_bytree' : [0.5,1], 
             'clf__min_child_weight' : [0,10,1], 
             'clf__n_estimators' : [180, 200, 500]}

RSCV_XGB  = RandomizedSearchCV(XGB_pipe, XGB_param, cv=10, scoring='f1')

In [19]:
# split train & test data

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=101)

# oversampled data

X_train_sm, y_train_sm = SMOTE(random_state=42).fit_sample(X_train, y_train)

In [20]:
# check score

models = ['Random Forest', 'XGBoost']
pipes = [RSCV_RFC, RSCV_XGB]
for model, pipe in zip(models, pipes):
    print(model, '\n')
    pipe.fit(X_train_sm, y_train_sm)
    print('Best Score : ', pipe.best_score_)
    print('Best Params : ', pipe.best_params_)
    print('\n')

Random Forest 

Best Score :  0.7796417334032631
Best Params :  {'clf__n_estimators': 1000, 'clf__min_samples_split': 10, 'clf__min_samples_leaf': 8, 'clf__max_depth': 20}


XGBoost 

Best Score :  0.7555789894162791
Best Params :  {'clf__reg_lambda': 1, 'clf__reg_alpha': 40, 'clf__n_estimators': 180, 'clf__min_child_weight': 0, 'clf__max_depth': 5, 'clf__gamma': 2, 'clf__colsample_bytree': 0.5}


