In [1]:
import numpy as np
import pandas as pd  
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
import pyarrow.feather as feather
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
!pip install shap
import shap
!pip install plotly
import plotly.express as px
# Suppress warnings 
# (sometimes you might want to ignore warnings, that's how you can achieve this)
import warnings
warnings.filterwarnings('ignore')
#! brew install graphviz
RSEED= 42



In [2]:
# Load the dataset from the Feather file; the path is relative to the
# notebook's working directory.
export_df = feather.read_feather("../data/cleaned_data.feather")

In [3]:
# Cast every entry of the 'lenght' column to int, element by element.
# ('lenght' is the column's actual name in the data, so the spelling must stay.)
# NOTE(review): this relies on each element exposing `.astype`, i.e. being a
# numpy scalar/array rather than a plain Python number — confirm the stored
# dtype before replacing this with a vectorized `Series.astype(int)`.
export_df['lenght']=[row.astype(int) for row in export_df['lenght']]

In [4]:
# Function to split the dataset
def splitdataset(df, target="interesting_message", test_size=0.2, random_state=42):
    """Split ``df`` into stratified train and test partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the ``target`` column.
    target : str, default "interesting_message"
        Name of the label column; it is removed from the feature matrix.
    test_size : float, default 0.2
        Share of the rows held out for testing.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Work on the frame that was passed in. The previous version silently
    # ignored ``df`` and always read the notebook-level ``export_df``.
    y = df[target]
    X = df.drop(target, axis=1)
    # Stratify on the label so both partitions keep the same class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [6]:
def train_tree(X_train, y_train, random_state=42):
    """Tune a class-weighted decision tree with a grid search and return the best one.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    random_state : int, default 42
        Seed for the tree itself. Without it the tree's internal randomness is
        unseeded, so repeated runs could return different models even though
        the cross-validation splitter below is seeded.

    Returns
    -------
    DecisionTreeClassifier
        ``best_estimator_`` of the fitted grid search.
    """
    # "balanced" class weights compensate for unequal class frequencies.
    tree = DecisionTreeClassifier(class_weight="balanced", random_state=random_state)

    # 4 * 2 * 4 * 4 = 128 parameter combinations are evaluated.
    param_tree = {
        'max_leaf_nodes': [20, 50, 100, 500],
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 5, 10, 50],
        'min_samples_leaf': [50, 200, 500, 1000],
    }

    # 5 folds repeated 3 times -> 15 fits per parameter combination.
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    grid_tree = GridSearchCV(
        tree,
        param_grid=param_tree,
        cv=cv,
        scoring='balanced_accuracy',
        n_jobs=-1,
        verbose=10,
    )
    grid_tree.fit(X_train, y_train)

    print('Best score:\n{:.2f}'.format(grid_tree.best_score_))
    print("Best parameters:\n{}".format(grid_tree.best_params_))
    print("Best model_tree:\n{}".format(grid_tree.best_estimator_))

    return grid_tree.best_estimator_

In [7]:
# Function to make predictions
def prediction(X_test, reg_tree):
    """Return what the fitted model ``reg_tree`` predicts for ``X_test``."""
    return reg_tree.predict(X_test)

In [8]:
# Function to print and return the classification metrics
def class_metrics(y_test, y_pred):
    """Print evaluation metrics for a set of predictions and return two of them.

    Prints the raw predictions, the confusion matrix, the balanced accuracy
    (as a percentage) and scikit-learn's classification report.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    tuple
        ``(cm, accuracy)``: the confusion matrix and the balanced accuracy
        score as returned by ``balanced_accuracy_score``.
    """
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    conf_mat = confusion_matrix(y_test, y_pred).round()

    print("Predicted values:\n", y_pred)
    print("Confusion Matrix: \n", conf_mat)
    # The score is scaled by 100 only for display; the returned value is unscaled.
    print("Balanced Accuracy: %.4f%%" % (balanced_acc * 100.0))
    print("Report : \n", classification_report(y_test, y_pred))

    return conf_mat, balanced_acc

In [9]:
def plot_importance (model,X_train):
    """Plot SHAP feature importance for a tree model and return it as a table.

    Parameters
    ----------
    model : fitted tree model
        Passed to ``shap.TreeExplainer``.
    X_train : pandas.DataFrame
        Used both as the explainer's background data and as the rows whose
        SHAP values are computed; its columns provide the feature names.

    Returns
    -------
    pandas.DataFrame
        Columns ``col_name`` and ``feature_importance_vals``, sorted by
        importance in descending order.
    """
    explainer = shap.TreeExplainer(model, data=X_train)
    shap_values = explainer.shap_values(X_train)

    # Mean of the absolute SHAP values over the first axis, then summed over
    # the next one.
    # NOTE(review): this pairing with X_train.columns assumes shap_values is a
    # list of per-class (n_samples, n_features) arrays — confirm against the
    # installed shap version, which may return a different layout.
    vals = np.abs(shap_values).mean(0)
    feature_importance = pd.DataFrame(
        list(zip(X_train.columns, sum(vals))),
        columns=['col_name', 'feature_importance_vals'],
    )

    # sort_values(..., inplace=True) returns None, which made this function
    # always return None; take the sorted copy instead.
    ordered_weights = feature_importance.sort_values(
        by=['feature_importance_vals'], ascending=False
    )

    shap.summary_plot(shap_values, X_train, plot_type="bar")

    return ordered_weights


In [10]:
# Driver code
def main():
    """Run the split -> tune -> evaluate pipeline on the notebook-level ``export_df``.

    Returns
    -------
    tuple
        ``(basemodel, y_pred, cm, accuracy, X_train)``: the tuned tree, its
        test-set predictions, the confusion matrix, the balanced accuracy and
        the training features.
    """
    # Building phase: hold out a test set, then tune the tree on the remainder.
    features_train, features_test, labels_train, labels_test = splitdataset(export_df)
    basemodel = train_tree(features_train, labels_train)
    print(f'Decision tree has {basemodel.tree_.node_count} nodes with maximum depth {basemodel.tree_.max_depth}.')

    # Operational phase: score the tuned tree on the held-out rows.
    print("-----"*15)
    print("Results:\n")
    predictions = prediction(features_test, basemodel)
    conf_matrix, balanced_acc = class_metrics(labels_test, predictions)

    return basemodel, predictions, conf_matrix, balanced_acc, features_train

In [11]:
# Run the whole pipeline once and keep its results at notebook level; the
# cells below read basemodel, X_train, y_pred, cm and accuracy.
basemodel,y_pred,cm,accuracy,X_train=main()

Fitting 15 folds for each of 2 candidates, totalling 30 fits
[CV 7/15; 1/2] START criterion=gini, max_depth=50, max_leaf_nodes=500, min_samples_leaf=50
[CV 4/15; 1/2] START criterion=gini, max_depth=50, max_leaf_nodes=500, min_samples_leaf=50
[CV 5/15; 1/2] START criterion=gini, max_depth=50, max_leaf_nodes=500, min_samples_leaf=50
[CV 3/15; 1/2] START criterion=gini, max_depth=50, max_leaf_nodes=500, min_samples_leaf=50
[CV 2/15; 1/2] START criterion=gini, max_depth=50, max_leaf_nodes=500, min_samples_leaf=50
[CV 1/15; 1/2] START criterion=gini, max_depth=50, max_leaf_nodes=500, min_samples_leaf=50
[CV 6/15; 1/2] START criterion=gini, max_depth=50, max_leaf_nodes=500, min_samples_leaf=50
[CV 8/15; 1/2] START criterion=gini, max_depth=50, max_leaf_nodes=500, min_samples_leaf=50
[CV 1/15; 1/2] END criterion=gini, max_depth=50, max_leaf_nodes=500, min_samples_leaf=50;, score=0.930 total time=  44.1s
[CV 2/15; 1/2] END criterion=gini, max_depth=50, max_leaf_nodes=500, min_samples_leaf=50;

In [None]:
# Draw the SHAP bar plot for the tuned tree, computed on the training features.
ordered_weights=plot_importance (basemodel,X_train)


In [None]:
from sklearn.tree import export_graphviz

# Take the feature names from the frame the tree was trained on instead of a
# hand-maintained literal list, so the labels in the exported graph cannot
# drift out of sync with the model's input columns.
features = list(X_train.columns)

# Export the decision tree to a .dot file so it can be rendered anywhere
# Graphviz is available.
export_graphviz(basemodel, out_file='tree_class.dot',
                feature_names=features)

In [None]:
# Render the exported .dot file to a PNG; needs the Graphviz `dot` executable on PATH.
! dot -Tpng tree_class.dot > tree_class.png

In [None]:
# Append this run's results to the shared log file. The context manager closes
# the file even if formatting or writing raises, and the trailing newline keeps
# successive appended entries from running together on one line.
with open('ml-log.txt', 'a') as log_file:
    log_file.write(
        'Base model: Decision Tree\n Predicted values:\n {}\n Confusion Matrix:\n {}\n Balanced Accuracy:\n {} \n Model: {}\n'.format(
            y_pred, cm, accuracy, basemodel
        )
    )