In [1]:
import numpy as np
import pandas as pd  
from sklearn.tree import DecisionTreeRegressor  
import pyarrow.feather as feather
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
# Suppress warnings 
# (sometimes you might want to ignore warnings, that's how you can achieve this)
import warnings
warnings.filterwarnings('ignore')
#! brew install graphviz
RSEED= 42

In [2]:
export_df = feather.read_feather("../data/cleaned_data.feather")

In [3]:
#Previous step to get the target opening ratio
gpbyincident= export_df.groupby( by= ['document_id'])
gpbyincident2=gpbyincident.sum()
gpbyincident2['count1'] = gpbyincident.size()
gpbyincident2['opened_rate'] = gpbyincident2['opened']/gpbyincident2['count1']
gpbyincident2['ontime_activity_rate'] = gpbyincident2["notif_viewed_ontime"]/gpbyincident2['count1']
merge_column=gpbyincident2['opened_rate'].copy()

In [4]:
basemodel_df= export_df.merge(merge_column, left_on="document_id", right_index=True)

In [5]:
basemodel_df=pd.get_dummies(data=basemodel_df,columns= ["cause","effect"],drop_first=True)

In [6]:
basemodel_df.drop(['event_timestamp', 'event_name', 'user_id', 'document_id','surrogate_id', 'created_at', 'published_at', 'closed_at','notif_viewed_ontime', 'reaction_time','opened','description','area_of_effect_coordinates_latitude','area_of_effect_coordinates_longitude'],axis=1, inplace=True)
#dropping description and coordinates for base model

In [7]:
# Function to split the dataset 
def splitdataset(df): 
    # Separating the target variable 
    #X = balance_data.values[:, 1:5] 
    #Y = df.values[:, 28]
    train_size = int(len(df) * 0.8)
    train, test = df[0:train_size], df[train_size:len(df)]
    X_train= train.drop(["opened_rate"], axis=1)
    X_test= test.drop(["opened_rate"], axis=1)
    y_train= train["opened_rate"]
    y_test= test["opened_rate"]
    print('Observations: %d' % (len(train) + len(test)))
    print('Training Observations: %d' % (len(train)))
    print('Testing Observations: %d' % (len(test)))
    return X_train, X_test, y_train, y_test

# Function to perform training with MSE. 
def train_tree(X_train, X_test, y_train): 
    # Creating the classifier object 
    reg_tree = DecisionTreeRegressor(criterion = "mse", max_leaf_nodes=100) 
    # Performing training 
    reg_tree.fit(X_train, y_train)
    return reg_tree
    #max_depth=3, min_samples_leaf=5

In [8]:
# Function to perform training with MSE. 
def train_tree(X_train, X_test, y_train): 
    # Creating the classifier object 
    reg_tree = DecisionTreeRegressor(criterion = "mse",)
    param_tree = {"splitter":["best","random"],
    'max_leaf_nodes':[20,50,100,200,500]}


    'max_leaf_nodes':[20,50,100,200,500],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 5, 10, 20,50],
    'min_samples_leaf':[50,200,500,1000]

    grid_tree = GridSearchCV(reg_tree, param_grid=param_tree, cv=5,scoring='neg_root_mean_squared_error',verbose=5, n_jobs=-1)
    grid_tree.fit(X_train, y_train)
    print('Best score:\n{:.2f}'.format(grid_tree.best_score_))
    print("Best parameters:\n{}".format(grid_tree.best_params_))
    print("Best model_tree:\n{}".format(grid_tree.best_estimator_))
    best_model_tree = grid_tree.best_estimator_
    return best_model_tree
    #max_depth=3, min_samples_leaf=5

In [9]:
# Function to make predictions 
def prediction(X_test, reg_tree): 

    # Predicton on test with giniIndex 
    y_pred = reg_tree.predict(X_test)
    #print("Predicted values:\n") 
    #print(y_pred) 
    return y_pred 

In [10]:
# Function to calculate accuracy 
def reg_metrics(y_test, y_pred, X_train):
     

    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    r2 = r2_score(y_test,y_pred)

    # Scikit-learn doesn't have adjusted r-square, hence custom code
    n = y_pred.shape[0]
    k = X_train.shape[1]
    adj_r_sq = 1 - (1 - r2)*(n-1)/(n-1-k)

    print(rmse, r2, adj_r_sq)

In [11]:
# Driver code 
def main(): 
    # Building Phase 
    X_train, X_test, y_train, y_test = splitdataset(basemodel_df) 
    basemodel = train_tree(X_train, X_test, y_train)
    print(f'Decision tree has {basemodel.tree_.node_count} nodes with maximum depth {basemodel.tree_.max_depth}.')
    # Operational Phase 
    print("-----"*15)
    print("Results:\n")
    # Prediction
    y_pred = prediction(X_test, basemodel) 
    reg_metrics(y_test, y_pred, X_test)
    return basemodel

In [12]:
#from IPython.display import Image
#Image("Tree_GameProduction.png")

In [13]:
basemodel_df.head()

Unnamed: 0,agency_GewRJAw5tUmC4Ku4AX1-SQ,agency_GtvOEQAFZ0GtU6u4AXwvPg,agency_HE59N3RXM0q5vKu4AXlQZg,agency_JUR9bFXmVkWDHqu4AXaY0g,agency_JfA8Bw8Zp024Kqu4AXiSpQ,agency_MgUq5b9mOEunx6u4AXt_BA,agency_NuuRQ2I1Q0a50Kv-AVKlLA,agency_V2AIQQKgmUO3VazvAOA-Cw,agency_jLjibFoim0iwWau4AWoEdQ,agency_pky7jovXYkaw-awAAMrQ3g,...,effect_Delays,effect_Evacuation,effect_FullCapacity,effect_HighWaitingTime,effect_InterimService,effect_LaneReduction,effect_RouteDetour,effect_SecuritySpeed,effect_SuspensionOfService,effect_TrafficImpact
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
12,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
33,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
68,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
72,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


In [14]:
basemodel=main()

Observations: 908723
Training Observations: 726978
Testing Observations: 181745
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 3/5] END ................max_leaf_nodes=20;, score=-0.018 total time=   5.9s
[CV 5/5] END ................max_leaf_nodes=20;, score=-0.024 total time=   5.9s
[CV 2/5] END ................max_leaf_nodes=20;, score=-0.023 total time=   6.4s
[CV 4/5] END ................max_leaf_nodes=20;, score=-0.025 total time=   6.3s
[CV 1/5] END ................max_leaf_nodes=20;, score=-0.017 total time=   6.4s
[CV 2/5] END ................max_leaf_nodes=50;, score=-0.023 total time=   6.3s
[CV 3/5] END ................max_leaf_nodes=50;, score=-0.018 total time=   6.6s
[CV 1/5] END ................max_leaf_nodes=50;, score=-0.017 total time=   7.1s
[CV 4/5] END ................max_leaf_nodes=50;, score=-0.025 total time=   4.4s
[CV 5/5] END ................max_leaf_nodes=50;, score=-0.025 total time=   4.5s
[CV 2/5] END ...............max_leaf_nodes=100;, s

In [15]:
basemodel.get_params().keys()

dict_keys(['ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'random_state', 'splitter'])

In [16]:
from sklearn.tree import export_graphviz  
features=['agency_GewRJAw5tUmC4Ku4AX1-SQ', 'agency_GtvOEQAFZ0GtU6u4AXwvPg',
       'agency_HE59N3RXM0q5vKu4AXlQZg', 'agency_JUR9bFXmVkWDHqu4AXaY0g',
       'agency_JfA8Bw8Zp024Kqu4AXiSpQ', 'agency_MgUq5b9mOEunx6u4AXt_BA',
       'agency_NuuRQ2I1Q0a50Kv-AVKlLA', 'agency_V2AIQQKgmUO3VazvAOA-Cw',
       'agency_jLjibFoim0iwWau4AWoEdQ', 'agency_pky7jovXYkaw-awAAMrQ3g',
       'agency_zCy9zG00HEqGeKu4AWZYNQ', 'cause_BrokenVehicle',
       'cause_COVID19', 'cause_Counterflow', 'cause_CycleRide',
       'cause_Demonstration', 'cause_EmergencyServices', 'cause_Event',
       'cause_Explosion', 'cause_FallenTree', 'cause_Fire', 'cause_Flood',
       'cause_GasLeak', 'cause_HeavyTraffic', 'cause_Incident',
       'cause_Landslide', 'cause_Leak', 'cause_Maintenance', 'cause_March',
       'cause_Overturn', 'cause_Pilgrimage', 'cause_ProtestCamp', 'cause_Rain',
       'cause_Reopening', 'cause_Sinkhole', 'cause_StreetWorks',
       'cause_VehicularAccident', 'cause_Waterlogging',
       'effect_CirculationRestored', 'effect_CirculationShutdown',
       'effect_Delays', 'effect_Evacuation', 'effect_FullCapacity',
       'effect_HighWaitingTime', 'effect_InterimService',
       'effect_LaneReduction', 'effect_RouteDetour', 'effect_SecuritySpeed',
       'effect_SuspensionOfService', 'effect_TrafficImpact']
# export the decision tree to a tree.dot file 
#for visualizing the plot easily anywhere 
export_graphviz(basemodel, out_file ='tree.dot', 
               feature_names =features) 

In [17]:
! dot -Tpng tree.dot > tree.png