In [3]:
import pandas as pd  
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import pyarrow.feather as feather
from sklearn.model_selection import GridSearchCV
import mlflow
from mlflow.sklearn import save_model
from ../modeling.config import TRACKING_URI, EXPERIMENT_NAME
TRACKING_URI=open("../.mlflow_uri").read().strip()
# Suppress warnings 
# (sometimes you might want to ignore warnings, that's how you can achieve this)
import warnings
warnings.filterwarnings('ignore')
#! brew install graphviz
RSEED= 42

SyntaxError: invalid syntax (3816055627.py, line 10)

In [None]:
export_df = feather.read_feather("../data/cleaned_data.feather")

In [None]:
basemodel_df=pd.get_dummies(data=export_df,columns= ["cause","effect"],drop_first=True)

In [None]:
basemodel_df.drop(['event_timestamp', 'event_name', 'user_id', 'document_id','surrogate_id', 'created_at', 'published_at', 'closed_at','notif_viewed_ontime', 'reaction_time','description','area_of_effect_coordinates_latitude','area_of_effect_coordinates_longitude','opened','opened_rate'],axis=1, inplace=True)
#dropping description and coordinates for base model

In [None]:
# Function to split the dataset 
def splitdataset(df):
    X= basemodel_df.drop(["interesting_message"], axis=1)
    y= basemodel_df['interesting_message']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=True)
    return X_train, X_test, y_train, y_test

In [None]:
# Function to perform training with MSE. 
def train_tree(X_train, y_train): 
    # Starting the ML Flow
    logger.info("Training simple model and tracking with MLFlow")
    mlflow.set_tracking_uri(TRACKING_URI)
    mlflow.set_experiment(EXPERIMENT_NAME)
    # Creating the classifier object
    with mlflow.start_run():
        model = DecisionTreeClassifier(class_weight="balanced")
        params = {'max_leaf_nodes':[10,50,100],
        'criterion': ['gini'],
        'max_depth': [2, 5, 10, 50],
        'min_samples_leaf':[50,200,500]
        }
        mlflow.log_params(params)
        mlflow.set_tag("Base_model", "True")
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
        grid_tree = GridSearchCV(model, param_grid=params, cv=cv,scoring='balanced_accuracy', n_jobs=-1, verbose=10)
        grid_tree.fit(X_train, y_train)
        print('Best score:\n{:.2f}'.format(grid_tree.best_score_))
        print("Best parameters:\n{}".format(grid_tree.best_params_))
        print("Best model_tree:\n{}".format(grid_tree.best_estimator_))
    best_model_tree = grid_tree.best_estimator_
    path = "../models/base_decision_tree"
    save_model(sk_model=best_model_tree, path=path)
    return best_model_tree
    #max_depth=3, min_samples_leaf=5

In [None]:
# Function to make predictions 
def prediction(X_test, model): 
    y_pred = model.predict(X_test)
    return y_pred 

In [None]:
# Function to calculate accuracy 
def class_metrics(y_test, y_pred):
     
    accuracy = balanced_accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred).round()
    logger.info(
        "Predicted values:\n {} \n Confusion Matrix: \n {} \n Balanced Accuracy: \n {} Report: \n {}".format(y_pred,cm,accuracy,classification_report(y_test, y_pred)))

    mlflow.log_metric(prefix + "-" + "Balanced Accuracy", accuracy)

    return cm, accuracy

In [None]:
# Driver code 
def main():
    warnings.filterwarnings("ignore")
    #logger = getLogger(__name__)
    # Building Phase
    #  
    X_train, X_test, y_train, y_test = splitdataset(basemodel_df) 
    basemodel = train_tree(X_train, y_train)
    print(f'Decision tree has {basemodel.tree_.node_count} nodes with maximum depth {basemodel.tree_.max_depth}.')
    # Operational Phase 
    print("-----"*15)
    print("Results:\n")
    # Prediction
    y_pred = prediction(X_test, basemodel) 
    cm,accuracy= class_metrics(y_test, y_pred)
    return basemodel,y_pred,cm,accuracy

In [None]:
basemodel,y_pred,cm,accuracy=main()

from sklearn.tree import export_graphviz  
features=['agency_GewRJAw5tUmC4Ku4AX1-SQ', 'agency_GtvOEQAFZ0GtU6u4AXwvPg',
       'agency_HE59N3RXM0q5vKu4AXlQZg', 'agency_JUR9bFXmVkWDHqu4AXaY0g',
       'agency_JfA8Bw8Zp024Kqu4AXiSpQ', 'agency_MgUq5b9mOEunx6u4AXt_BA',
       'agency_NuuRQ2I1Q0a50Kv-AVKlLA', 'agency_V2AIQQKgmUO3VazvAOA-Cw',
       'agency_jLjibFoim0iwWau4AWoEdQ', 'agency_pky7jovXYkaw-awAAMrQ3g',
       'agency_zCy9zG00HEqGeKu4AWZYNQ', 'cause_BrokenVehicle',
       'cause_COVID19', 'cause_Counterflow', 'cause_CycleRide',
       'cause_Demonstration', 'cause_EmergencyServices', 'cause_Event',
       'cause_Explosion', 'cause_FallenTree', 'cause_Fire', 'cause_Flood',
       'cause_GasLeak', 'cause_HeavyTraffic', 'cause_Incident',
       'cause_Landslide', 'cause_Leak', 'cause_Maintenance', 'cause_March',
       'cause_Overturn', 'cause_Pilgrimage', 'cause_ProtestCamp', 'cause_Rain',
       'cause_Reopening', 'cause_Sinkhole', 'cause_StreetWorks',
       'cause_VehicularAccident', 'cause_Waterlogging',
       'effect_CirculationRestored', 'effect_CirculationShutdown',
       'effect_Delays', 'effect_Evacuation', 'effect_FullCapacity',
       'effect_HighWaitingTime', 'effect_InterimService',
       'effect_LaneReduction', 'effect_RouteDetour', 'effect_SecuritySpeed',
       'effect_SuspensionOfService', 'effect_TrafficImpact']
# export the decision tree to a tree.dot file 
#for visualizing the plot easily anywhere 
export_graphviz(basemodel, out_file ='tree_class.dot', 
               feature_names =features) 

In [None]:
#! dot -Tpng tree_class.dot > tree_class.png

f = open('ml-log.txt', 'a')
f.write('Base model: Decision Tree\n Predicted values:\n {}\n Confusion Matrix:\n {}\n Balanced Accuracy:\n {} \n Model: {}'.format(y_pred,cm,accuracy,basemodel))
f.close()