In [None]:
import numpy as np  
import pandas as pd  
from sklearn.model_selection import RepeatedStratifiedKFold
!pip install xgboost
from xgboost import XGBClassifier
import pyarrow.feather as feather
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
import shap
import warnings
import shap
# Suppress all warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')
# Seed passed as random_state to train_test_split inside splitdataset.
RSEED= 12

# Loading the exported dataframe
### Casting the `lenght` column to integers, because its original format was causing problems

In [None]:
# Read the cleaned dataset from its feather file.
export_df = feather.read_feather("../data/cleaned_data.feather")
# Convert every entry of the 'lenght' column with its own .astype(int), one element at a time.
export_df['lenght'] = list(map(lambda entry: entry.astype(int), export_df['lenght']))

# Functions for running XGBoost

In [None]:
# Function to split the dataset
def splitdataset(df):
    '''
    Split a dataframe into stratified train and test sets.

    The "interesting_message" column is the target and every other column is a
    feature. 20% of the rows are held out for testing, the split is stratified
    on the target, and RSEED is used as the random state.

    Parameters
    ----------
    df : pandas.DataFrame
        Data that contains the "interesting_message" column.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The four outputs of train_test_split, in that order.
    '''
    # Split the dataframe that was passed in, not the notebook-level export_df.
    target = df["interesting_message"]
    features = df.drop("interesting_message", axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, stratify=target, test_size=0.2, random_state=RSEED
    )
    # Show the dtype of the training target as a quick sanity check.
    print(y_train.dtypes)
    return X_train, X_test, y_train, y_test

In [None]:
def train_xgb(X_train, X_test, y_train):
    '''
    Fit an XGBoost classifier through a cross-validated grid search.

    The grid holds a single candidate (scale_pos_weight=4) and is scored with
    balanced accuracy over a repeated stratified 5-fold split (3 repeats).
    X_test is accepted but not used here.
    Prints the best score and parameters, and returns the best estimator found.
    '''
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    search = GridSearchCV(
        estimator=XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
        param_grid={'scale_pos_weight': [4]},
        cv=folds,
        scoring='balanced_accuracy',
        verbose=10,
        n_jobs=-1,
    )
    fitted_search = search.fit(X_train, y_train)
    # report the best configuration
    print("Best: %f using %s" % (fitted_search.best_score_, fitted_search.best_params_))
    return fitted_search.best_estimator_

In [None]:
# Function to make predictions
def prediction(X_test, model):
    '''
    Run the given model on X_test.
    Returns whatever model.predict produces for those rows.
    '''
    return model.predict(X_test)

In [None]:
# Function to calculate accuracy
def class_metrics(y_test, y_pred):
    '''
    Score predictions against the true labels.
    Prints the predictions, the confusion matrix, the balanced accuracy (as a
    percentage) and the classification report.
    Returns the confusion matrix and the balanced accuracy (as a fraction).
    '''
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred).round()

    print("Predicted values:\n", y_pred)
    print("Confusion Matrix: \n", conf_matrix)
    print("Balanced Accuracy: %.4f%%" % (balanced_acc * 100.0))
    print("Report : \n", classification_report(y_test, y_pred))

    return conf_matrix, balanced_acc

In [None]:
def plot_importance(model, X_train):
    '''
    Compute SHAP values for X_train with a TreeExplainer built on the model.
    Draws the bar-type summary plot of those values.
    Returns the SHAP values together with the explainer that produced them.
    '''
    # X_train is both the explainer's background data and the data being explained.
    tree_explainer = shap.TreeExplainer(model, data=X_train)
    contributions = tree_explainer.shap_values(X_train)
    shap.summary_plot(contributions, X_train, plot_type="bar")
    return contributions, tree_explainer

In [None]:
def get_importances (shap_values, feature_names=None):
    '''
    Rank features by their mean absolute SHAP value.

    Parameters
    ----------
    shap_values : array-like
        SHAP values; their absolute values are averaged over axis 0.
        (assumes shape (n_samples, n_features) — TODO confirm)
    feature_names : iterable of str, optional
        Names paired positionally with the averaged values. When omitted, the
        columns of the notebook-level X_train are used, as before.

    Returns
    -------
    pandas.DataFrame
        Columns 'col_name' and 'feature_importance_vals', sorted in descending
        order of importance. The head of the frame is printed.
    '''
    if feature_names is None:
        # Backward-compatible fallback: relies on X_train existing in the notebook namespace.
        feature_names = X_train.columns
    vals = np.abs(shap_values).mean(0)
    feature_importance = pd.DataFrame(
        list(zip(feature_names, vals)),
        columns=['col_name', 'feature_importance_vals'],
    )
    feature_importance = feature_importance.sort_values(
        by=['feature_importance_vals'], ascending=False
    )
    # A bare .head() inside a function displays nothing; print it explicitly.
    print(feature_importance.head())
    return feature_importance

In [None]:
# Driver code
def main():
    '''
    Run the whole XGBoost pipeline on export_df: split, train, predict, evaluate.
    Returns the trained model, the test-set predictions, the confusion matrix,
    the balanced accuracy and the training features (needed for the importance analysis).
    '''
    # Building Phase
    train_features, test_features, train_target, test_target = splitdataset(export_df)
    fitted_model = train_xgb(train_features, test_features, train_target)

    # Operational Phase
    separator = "-----" * 15
    print(separator)
    print("Results:\n")

    # Prediction
    predictions = prediction(test_features, fitted_model)
    conf_matrix, balanced_acc = class_metrics(test_target, predictions)
    return fitted_model, predictions, conf_matrix, balanced_acc, train_features

# Running XGBoost model

In [None]:
# Run the full pipeline once; X_train is kept because the SHAP analysis below is computed on it.
basemodel,y_pred,cm,accuracy,X_train=main()

### Getting the importance of the features

In [None]:
# Compute SHAP values on the training features and draw the bar summary plot.
# shap_values and explainer are reused by the force plots further down.
shap_values,explainer=plot_importance (basemodel,X_train)

### Getting the importance of the features in 4 different examples, one for each case (TN, TP, FP, FN)

In [None]:
# Redo the split to get X_test back: main() only returns the training features.
X_train, X_test, y_train, y_test = splitdataset(export_df)
# First test row, kept 2-D, as a float array.
data_for_prediction = X_test.iloc[:1, :].values.astype('float')
# First 100 training rows as a float array.
background = X_train.iloc[:100, :].values.astype('float')

In [None]:
#TRUE NEGATIVE
shap.initjs()
# shap_values was computed on X_train (inside plot_importance), so row 0 of it must be
# displayed next to row 0 of X_train rather than next to a row of X_test.
# NOTE(review): the TRUE NEGATIVE label for this row is assumed — confirm against the model's prediction.
shap.force_plot(explainer.expected_value,shap_values[0],X_train.iloc[0:1,:].values.astype('float'), feature_names=X_train.columns)

In [None]:
#FALSE NEGATIVE
# shap_values was computed on X_train (inside plot_importance), so row 1 of it must be
# displayed next to row 1 of X_train rather than next to the first row of X_test.
# NOTE(review): the FALSE NEGATIVE label for this row is assumed — confirm against the model's prediction.
shap.force_plot(explainer.expected_value,shap_values[1],X_train.iloc[1:2,:].values.astype('float'), feature_names=X_train.columns)

In [None]:
shap.initjs()
#TRUE POSITIVE
# shap_values was computed on X_train (inside plot_importance), so row 5 of it must be
# displayed next to row 5 of X_train rather than next to the first row of X_test.
# NOTE(review): the TRUE POSITIVE label for this row is assumed — confirm against the model's prediction.
shap.force_plot(explainer.expected_value,shap_values[5],X_train.iloc[5:6,:].values.astype('float'), feature_names=X_train.columns)

In [None]:
shap.initjs()
#FALSE POSITIVE
# shap_values was computed on X_train (inside plot_importance), so row 10 of it must be
# displayed next to row 10 of X_train rather than next to the first row of X_test.
# NOTE(review): the FALSE POSITIVE label for this row is assumed — confirm against the model's prediction.
shap.force_plot(explainer.expected_value,shap_values[10],X_train.iloc[10:11,:].values.astype('float'), feature_names=X_train.columns)

### Saving some of our model's results to a txt file

In [None]:
# Build the summary for this run. The trailing newline keeps entries from successive
# runs from running together, since the file is opened in append mode.
log_entry = 'Base model: XGBoost\n Predicted values:\n {}\n Confusion Matrix:\n {}\n Balanced Accuracy:\n {} \n Model: {}\n'.format(y_pred,cm,accuracy,basemodel)
# The context manager closes the file even if the write raises.
with open('ml-log_xgboost.txt', 'a') as f:
    f.write(log_entry)