In [1]:
import numpy as np  
import matplotlib.pyplot as plt 
import pandas as pd  
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import fbeta_score, accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
import pyarrow.feather as feather
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# Suppress warnings 
# (sometimes you might want to ignore warnings, that's how you can achieve this)
import warnings
warnings.filterwarnings('ignore')
#! brew install graphviz
RSEED= 42

In [2]:
# Load the cleaned dataset from the Feather file; the path is relative, so it
# resolves against the directory the notebook kernel is running in.
export_df = feather.read_feather("../data/cleaned_data.feather")

In [5]:
# One-hot encode the categorical `cause` and `effect` columns; drop_first=True
# leaves out the first level of each so the dummies are not redundant.
# NOTE(review): `basemodel_df` is not created in any cell visible here (the
# execution counts jump from 2 to 5) — confirm the cell that defines it is still
# present, otherwise this fails on a fresh kernel.
# NOTE(review): the cell rebinds its own input, so running it a second time
# raises KeyError because `cause`/`effect` no longer exist.
basemodel_df = pd.get_dummies(
    basemodel_df,
    columns=["cause", "effect"],
    drop_first=True,
)

In [6]:
# Remove the columns that are not used as features in the base model, among
# them the description and the coordinates.
# The drop is done in place: re-running this cell on the already reduced frame
# raises KeyError because the labels are gone.
basemodel_df.drop(
    columns=[
        'event_timestamp',
        'event_name',
        'user_id',
        'document_id',
        'surrogate_id',
        'created_at',
        'published_at',
        'closed_at',
        'notif_viewed_ontime',
        'reaction_time',
        'description',
        'opened_rate',
        'area_of_effect_coordinates_latitude',
        'area_of_effect_coordinates_longitude',
    ],
    inplace=True,
)

In [7]:
def splitdataset(df):
    """Split ``df`` by row position into train/test features and targets.

    The first 80 % of the rows (in their current order, without shuffling)
    become the training part and the remaining rows the test part. The
    ``opened`` column is the target and is removed from both feature frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and an ``opened`` column.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    target = "opened"
    cut = int(len(df) * 0.8)
    head, tail = df[:cut], df[cut:]
    return (
        head.drop([target], axis=1),
        tail.drop([target], axis=1),
        head[target],
        tail[target],
    )

In [14]:
def train_tree(X_train, X_test, y_train):
    """Tune and fit a Gini decision-tree classifier with a 5-fold grid search.

    The search tries several ``max_leaf_nodes`` values, scores each candidate
    by balanced accuracy, prints the best score, parameters and estimator, and
    returns the best estimator.

    Parameters
    ----------
    X_train : array-like or DataFrame
        Training features.
    X_test : array-like or DataFrame
        Not used by this function; kept so existing callers keep working.
    y_train : array-like
        Training labels.

    Returns
    -------
    DecisionTreeClassifier
        ``best_estimator_`` of the grid search.
    """
    # Seed the tree with the notebook-wide RSEED: scikit-learn permutes the
    # features at every split, so without a seed repeated runs can pick
    # different trees when several splits are equally good.
    tree_clf = DecisionTreeClassifier(criterion="gini", random_state=RSEED)
    # NOTE(review): an earlier note in this cell listed max_depth=3 and
    # min_samples_leaf=5 — presumably other settings that were tried; they are
    # not part of this search.
    param_tree = {'max_leaf_nodes': [20, 50, 100, 200, 500]}
    grid_tree = GridSearchCV(
        tree_clf,
        param_grid=param_tree,
        cv=5,
        scoring='balanced_accuracy',
        verbose=5,
        n_jobs=-1,
    )
    grid_tree.fit(X_train, y_train)
    print('Best score:\n{:.2f}'.format(grid_tree.best_score_))
    print("Best parameters:\n{}".format(grid_tree.best_params_))
    print("Best model_tree:\n{}".format(grid_tree.best_estimator_))
    return grid_tree.best_estimator_

In [15]:
def prediction(X_test, reg_tree):
    """Predict on ``X_test`` with the fitted model ``reg_tree``.

    The predictions are printed to stdout and then returned unchanged.

    Parameters
    ----------
    X_test : features to predict on, passed straight to ``reg_tree.predict``.
    reg_tree : any object exposing a ``predict`` method.

    Returns
    -------
    Whatever ``reg_tree.predict(X_test)`` returns.
    """
    predicted = reg_tree.predict(X_test)
    print(predicted)
    return predicted

In [16]:
def class_metrics(y_test, y_pred):
    """Print classification metrics for predictions against the true labels.

    Prints, in order: the predicted values, the confusion matrix, the accuracy
    as a percentage with four decimals, and scikit-learn's classification
    report. Nothing is returned.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels.
    """
    acc = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred).round()
    report = "Report : \n"

    print("Predicted values:\n", y_pred)
    print("Confusion Matrix: \n", matrix)
    print("Accuracy: %.4f%%" % (acc * 100.0))

    print(report, classification_report(y_test, y_pred))

In [17]:
def main():
    """Run the base-model pipeline end to end on ``basemodel_df``.

    Splits the frame, grid-searches and fits the decision tree on the training
    part, prints the size of the fitted tree, then predicts on the test part
    and prints the evaluation metrics.

    Returns
    -------
    tuple
        ``(basemodel, y_pred)`` — the fitted tree and its test predictions;
        callers need to unpack both values.
    """
    # Building phase: split the data and tune the tree on the training rows.
    feats_train, feats_test, labels_train, labels_test = splitdataset(basemodel_df)
    basemodel = train_tree(feats_train, feats_test, labels_train)
    print(f'Decision tree has {basemodel.tree_.node_count} nodes with maximum depth {basemodel.tree_.max_depth}.')

    # Operational phase: predict on the held-out rows and report the metrics.
    print("-----"*15)
    print("Results:\n")
    test_predictions = prediction(feats_test, basemodel)
    class_metrics(labels_test, test_predictions)
    return basemodel, test_predictions

In [18]:
# main() returns a (model, predictions) tuple; unpack it so `basemodel` is the
# fitted estimator itself (export_graphviz below needs the estimator, not a tuple).
basemodel, y_pred = main()

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END .................max_leaf_nodes=20;, score=0.500 total time=   6.8s
[CV 3/5] END .................max_leaf_nodes=20;, score=0.500 total time=   6.8s
[CV 4/5] END .................max_leaf_nodes=20;, score=0.500 total time=   6.7s
[CV 5/5] END .................max_leaf_nodes=20;, score=0.498 total time=   6.9s
[CV 2/5] END .................max_leaf_nodes=20;, score=0.500 total time=   7.2s
[CV 1/5] END .................max_leaf_nodes=50;, score=0.500 total time=   7.0s
[CV 2/5] END .................max_leaf_nodes=50;, score=0.500 total time=   7.5s
[CV 3/5] END .................max_leaf_nodes=50;, score=0.500 total time=   7.6s
[CV 5/5] END .................max_leaf_nodes=50;, score=0.498 total time=   4.4s
[CV 4/5] END .................max_leaf_nodes=50;, score=0.500 total time=   4.6s
[CV 2/5] END ................max_leaf_nodes=100;, score=0.500 total time=   5.7s
[CV 4/5] END ................max_leaf_nodes=100;,

from sklearn.tree import export_graphviz

# Fallback feature names, used only when the fitted tree did not record its own
# column names. The list has to match the training columns in number and order.
# NOTE(review): the `agency_*` dummies are not produced by any cell visible
# here — presumably they come from an earlier encoding step; confirm.
features=['agency_GewRJAw5tUmC4Ku4AX1-SQ', 'agency_GtvOEQAFZ0GtU6u4AXwvPg',
       'agency_HE59N3RXM0q5vKu4AXlQZg', 'agency_JUR9bFXmVkWDHqu4AXaY0g',
       'agency_JfA8Bw8Zp024Kqu4AXiSpQ', 'agency_MgUq5b9mOEunx6u4AXt_BA',
       'agency_NuuRQ2I1Q0a50Kv-AVKlLA', 'agency_V2AIQQKgmUO3VazvAOA-Cw',
       'agency_jLjibFoim0iwWau4AWoEdQ', 'agency_pky7jovXYkaw-awAAMrQ3g',
       'agency_zCy9zG00HEqGeKu4AWZYNQ', 'cause_BrokenVehicle',
       'cause_COVID19', 'cause_Counterflow', 'cause_CycleRide',
       'cause_Demonstration', 'cause_EmergencyServices', 'cause_Event',
       'cause_Explosion', 'cause_FallenTree', 'cause_Fire', 'cause_Flood',
       'cause_GasLeak', 'cause_HeavyTraffic', 'cause_Incident',
       'cause_Landslide', 'cause_Leak', 'cause_Maintenance', 'cause_March',
       'cause_Overturn', 'cause_Pilgrimage', 'cause_ProtestCamp', 'cause_Rain',
       'cause_Reopening', 'cause_Sinkhole', 'cause_StreetWorks',
       'cause_VehicularAccident', 'cause_Waterlogging',
       'effect_CirculationRestored', 'effect_CirculationShutdown',
       'effect_Delays', 'effect_Evacuation', 'effect_FullCapacity',
       'effect_HighWaitingTime', 'effect_InterimService',
       'effect_LaneReduction', 'effect_RouteDetour', 'effect_SecuritySpeed',
       'effect_SuspensionOfService', 'effect_TrafficImpact']

# Prefer the column names the estimator stored while fitting on a DataFrame
# (attribute `feature_names_in_`, available in newer scikit-learn releases), so
# the node labels cannot drift out of sync with the training frame. Older
# releases lack the attribute and fall back to the list above.
recorded_names = getattr(basemodel, "feature_names_in_", None)
if recorded_names is None:
    tree_feature_names = features
else:
    # Plain list of str: some scikit-learn versions reject an ndarray here.
    tree_feature_names = [str(name) for name in recorded_names]

# export the decision tree to a tree.dot file
# for visualizing the plot easily anywhere
export_graphviz(basemodel, out_file='tree_class.dot',
                feature_names=tree_feature_names)

In [19]:
# Render the exported tree_class.dot file to a PNG image; this shells out to the
# Graphviz `dot` executable, which must be installed and on PATH.
! dot -Tpng tree_class.dot > tree_class.png

In [20]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
export_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 908723 entries, 0 to 908722
Data columns (total 27 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   event_timestamp                       908723 non-null  datetime64[ns]
 1   event_name                            908723 non-null  object        
 2   user_id                               908723 non-null  object        
 3   document_id                           908723 non-null  object        
 4   surrogate_id                          908723 non-null  object        
 5   created_at                            908723 non-null  datetime64[ns]
 6   published_at                          908723 non-null  datetime64[ns]
 7   closed_at                             908060 non-null  datetime64[ns]
 8   cause                                 908723 non-null  object        
 9   effect                                908723 non-null  obje

In [21]:
# Count how many rows have / do not have a latitude value (True = missing).
export_df["area_of_effect_coordinates_latitude"].isna().value_counts()

False    584293
True     324430
Name: area_of_effect_coordinates_latitude, dtype: int64