In [1]:
import numpy as np  
import pandas as pd  
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier
import pyarrow.feather as feather
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
# Suppress warnings 
# (sometimes you might want to ignore warnings, that's how you can achieve this)
import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install xgboost



In [3]:
# Load the dataset stored in the Feather file; the path is relative to the
# directory the notebook is run from.
export_df = feather.read_feather("../data/cleaned_data.feather")

In [4]:
# One-hot encode the "cause" and "effect" columns. drop_first=True leaves out
# the first level of each encoded column.
basemodel_df = pd.get_dummies(
    data=export_df,
    columns=["cause", "effect"],
    drop_first=True,
)

In [5]:
# Columns left out of the base model (this includes the description and the
# coordinate columns). The frame is modified in place, so re-running this cell
# without re-running the previous one raises a KeyError for the missing labels.
basemodel_df.drop(
    columns=[
        'event_timestamp',
        'event_name',
        'user_id',
        'document_id',
        'surrogate_id',
        'created_at',
        'published_at',
        'closed_at',
        'notif_viewed_ontime',
        'reaction_time',
        'description',
        'area_of_effect_coordinates_latitude',
        'area_of_effect_coordinates_longitude',
    ],
    inplace=True,
)

In [6]:
# Function to split the dataset
def splitdataset(df, train_frac=0.8, target="opened"):
    """Split ``df`` by row order into train/test features and labels.

    The leading ``train_frac`` share of the rows (in their existing order, no
    shuffling) becomes the training set; the remaining rows become the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the ``target`` column.
    train_frac : float, default 0.8
        Fraction of rows, counted from the top, assigned to the training set.
        Must lie in the closed interval [0, 1].
    target : str, default "opened"
        Name of the label column; it is removed from the feature frames.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames (without ``target``) and label series for each part.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside [0, 1].
    KeyError
        If ``target`` is not a column of ``df``.
    """
    if not 0 <= train_frac <= 1:
        raise ValueError("train_frac must be between 0 and 1, got %r" % (train_frac,))
    if target not in df.columns:
        raise KeyError(target)

    train_size = int(len(df) * train_frac)
    # .iloc makes the slice positional regardless of the index type.
    train, test = df.iloc[:train_size], df.iloc[train_size:]

    X_train = train.drop(columns=[target])
    X_test = test.drop(columns=[target])
    y_train = train[target]
    y_test = test[target]
    return X_train, X_test, y_train, y_test

In [7]:
# Function to tune and fit the XGBoost classifier via a cross-validated grid search.
def train_xgb(X_train, X_test, y_train):
    """Grid-search ``scale_pos_weight`` for an XGBClassifier and return the fitted search.

    Candidates are scored with balanced accuracy under repeated stratified
    10-fold cross-validation (3 repeats, ``random_state=1``). The best score
    and parameters are printed once the search has finished.

    Parameters
    ----------
    X_train
        Training features passed to ``GridSearchCV.fit``.
    X_test
        Not used by this function; kept so existing callers keep working.
    y_train
        Training labels passed to ``GridSearchCV.fit``.

    Returns
    -------
    The fitted ``GridSearchCV`` object (the value returned by ``fit``).
    """
    weight_grid = {'scale_pos_weight': [1, 10, 25, 50, 75, 99]}
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    search = GridSearchCV(
        estimator=XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
        param_grid=weight_grid,
        cv=folds,
        scoring='balanced_accuracy',
        verbose=10,
        n_jobs=-1,
    )
    fitted = search.fit(X_train, y_train)

    # Report the winning configuration.
    print("Best: %f using %s" % (fitted.best_score_, fitted.best_params_))
    # TODO(review): the original author left an open question here
    # ("IS IT ALREADY OK?") -- confirm this setup is the intended one.
    return fitted

In [8]:
# Function to make predictions
def prediction(X_test, model):
    """Predict labels for ``X_test`` with ``model``, print them and return them.

    Parameters
    ----------
    X_test
        Features handed to ``model.predict``.
    model
        Any object exposing a ``predict`` method.

    Returns
    -------
    Whatever ``model.predict(X_test)`` returns.
    """
    predicted = model.predict(X_test)
    print(predicted)
    return predicted

In [9]:
# Function to report classification metrics on the held-out predictions
def class_metrics(y_test, y_pred):
    """Print the predictions, confusion matrix and balanced accuracy.

    The previous version called ``cross_val_score`` with ``xgb_class`` and
    ``cv``, which only exist as locals inside ``train_xgb`` (NameError here),
    and passed the labels as features. The metric is now computed directly
    from the true and predicted labels.

    Parameters
    ----------
    y_test
        True labels of the evaluation set.
    y_pred
        Predicted labels for the same rows, in the same order.

    Returns
    -------
    None
    """
    # Imported locally because the notebook's import cell does not provide it.
    from sklearn.metrics import balanced_accuracy_score

    cm = confusion_matrix(y_test, y_pred)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)

    print("Predicted values:\n", y_pred)
    print("Confusion Matrix: \n", cm)
    print("Balanced accuracy: %.4f" % balanced_acc)

In [10]:
# Driver code
def main():
    """Run the base-model pipeline: split, tune/fit, predict, report.

    Reads the module-level ``basemodel_df`` frame.

    Returns
    -------
    tuple
        ``(fitted_search, predictions)`` -- the object returned by
        ``train_xgb`` and the predictions returned by ``prediction``.
    """
    # Building phase: split the data and fit the grid search on the training part.
    features_train, features_test, labels_train, labels_test = splitdataset(basemodel_df)
    fitted_search = train_xgb(features_train, features_test, labels_train)

    # Operational phase: predict on the held-out part and print the metrics.
    print("-----"*15)
    print("Results:\n")
    predictions = prediction(features_test, fitted_search)
    class_metrics(labels_test, predictions)

    return fitted_search, predictions

In [11]:
# NOTE(review): main() returns a (fitted search, predictions) tuple, so
# `basemodel` is bound to that tuple rather than to the model alone --
# confirm how later cells use it before unpacking.
basemodel=main()

Fitting 30 folds for each of 6 candidates, totalling 180 fits
[CV 2/30; 1/6] START scale_pos_weight=1.........................................
[CV 3/30; 1/6] START scale_pos_weight=1.........................................
[CV 1/30; 1/6] START scale_pos_weight=1.........................................
[CV 5/30; 1/6] START scale_pos_weight=1.........................................
[CV 4/30; 1/6] START scale_pos_weight=1.........................................
[CV 6/30; 1/6] START scale_pos_weight=1.........................................
[CV 7/30; 1/6] START scale_pos_weight=1.........................................
[CV 8/30; 1/6] START scale_pos_weight=1.........................................
[CV 6/30; 1/6] END ..........scale_pos_weight=1;, score=0.500 total time= 7.3min
[CV 5/30; 1/6] END ..........scale_pos_weight=1;, score=0.500 total time= 7.4min
[CV 9/30; 1/6] START scale_pos_weight=1.........................................
[CV 8/30; 1/6] END ..........scale_pos_weight=1