In [1]:
import pandas as pd

# Load datasets
completed_orders = pd.read_csv('data/completed_orders.csv')
delivery_requests = pd.read_csv('data/driver_locations_during_request.csv')
weather = pd.read_csv('data/weather_data.csv')
calendar = pd.read_csv('data/holiday_data.csv')

In [4]:
# Merge datasets on relevant keys
# Assuming Trip ID corresponds to order_id for merging purposes

data = pd.merge(completed_orders, delivery_requests, left_on='Trip ID', right_on='order_id')
data = pd.merge(data, weather, left_on=data['Trip Start Time'].str[:10], right_on='date')
# data = pd.merge(data, calendar, left_on='Trip Start Time'.str[:10], right_on='date')
data.head()

Unnamed: 0,Trip ID,Trip Origin,Trip Destination,Trip Start Time,Trip End Time,id,order_id,driver_id,driver_action,lat,lng,created_at,updated_at,date,temperature,rain
0,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,1,392001,243828,accepted,6.602207,3.270465,,,2021-07-01,301.8,0
1,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2,392001,243588,rejected,6.592097,3.287445,,,2021-07-01,301.8,0
2,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,3,392001,243830,rejected,6.596133,3.281784,,,2021-07-01,301.8,0
3,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,4,392001,243539,rejected,6.596142,3.280526,,,2021-07-01,301.8,0
4,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,5,392001,171653,rejected,6.609232,3.2888,,,2021-07-01,301.8,0


In [5]:
# Create features like trip duration, distance travelled etc.
data['Trip Start Time'] = pd.to_datetime(data['Trip Start Time'])
data['Trip End Time'] = pd.to_datetime(data['Trip End Time'])
data['trip_duration'] = (data['Trip End Time'] - data['Trip Start Time']).dt.total_seconds() / 60  # Duration in minutes

data.head()

Unnamed: 0,Trip ID,Trip Origin,Trip Destination,Trip Start Time,Trip End Time,id,order_id,driver_id,driver_action,lat,lng,created_at,updated_at,date,temperature,rain,trip_duration
0,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,1,392001,243828,accepted,6.602207,3.270465,,,2021-07-01,301.8,0,3.616667
1,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2,392001,243588,rejected,6.592097,3.287445,,,2021-07-01,301.8,0,3.616667
2,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,3,392001,243830,rejected,6.596133,3.281784,,,2021-07-01,301.8,0,3.616667
3,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,4,392001,243539,rejected,6.596142,3.280526,,,2021-07-01,301.8,0,3.616667
4,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,5,392001,171653,rejected,6.609232,3.2888,,,2021-07-01,301.8,0,3.616667


In [7]:
import numpy as np

# Encode categorical variables
# data = pd.get_dummies(data, columns=['type'], drop_first=True)

# Define unfulfilled requests as target variable
data['unfulfilled_request'] = np.where(data['driver_action'] == 'unfulfilled', 1, 0)

# Inspect the merged dataset
print(data.head())

   Trip ID          Trip Origin     Trip Destination     Trip Start Time  \
0   392001  6.6010417,3.2766339  6.4501069,3.3916154 2021-07-01 09:30:59   
1   392001  6.6010417,3.2766339  6.4501069,3.3916154 2021-07-01 09:30:59   
2   392001  6.6010417,3.2766339  6.4501069,3.3916154 2021-07-01 09:30:59   
3   392001  6.6010417,3.2766339  6.4501069,3.3916154 2021-07-01 09:30:59   
4   392001  6.6010417,3.2766339  6.4501069,3.3916154 2021-07-01 09:30:59   

        Trip End Time  id  order_id  driver_id driver_action       lat  \
0 2021-07-01 09:34:36   1    392001     243828      accepted  6.602207   
1 2021-07-01 09:34:36   2    392001     243588      rejected  6.592097   
2 2021-07-01 09:34:36   3    392001     243830      rejected  6.596133   
3 2021-07-01 09:34:36   4    392001     243539      rejected  6.596142   
4 2021-07-01 09:34:36   5    392001     171653      rejected  6.609232   

        lng  created_at  updated_at        date  temperature  rain  \
0  3.270465         NaN     

In [8]:
from sklearn.model_selection import train_test_split

# Split data into training (70%) and hold-out set (30%)
train_data, holdout_data = train_test_split(data, test_size=0.3, random_state=42)

print("Training Data Shape:", train_data.shape)
print("Holdout Data Shape:", holdout_data.shape)


Training Data Shape: (1088748, 18)
Holdout Data Shape: (466607, 18)


In [35]:
from causalnex.structure import StructureModel
from causalnex.structure.notears import from_pandas
from causalnex.network import BayesianNetwork
import matplotlib.pyplot as plt
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
import codecs

# Select features for causal graph
features = ['trip_duration', 'temperature', 'rain', 'unfulfilled_request']

# Create causal graph
sm = from_pandas(train_data[features], w_threshold=0.8)

# Visualize the causal graph
viz = plot_structure(
    sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)

# Extract the HTML content
html_content = viz.generate_html()

# Save the HTML content with UTF-8 encoding
with codecs.open('causal_graph_full.html', 'w', encoding='utf-8') as file:
    file.write(html_content)

#open the file in the default web browser
import webbrowser
webbrowser.open('causal_graph_full.html')


True

In [34]:

# Convert to Bayesian Network for further analysis
bn = BayesianNetwork(sm)

# Display learned structure
sm.edges()

ValueError: The given structure has 3 separated graph components. Please make sure it has only one.