In [1]:
import pandas as pd

# Location of the cleaned interventions table (Parquet), relative to this notebook
file_path_11 = '../../1_Data/CLEANED/interventions_dataset.parquet'

# Load the whole table into memory as a pandas DataFrame, using pyarrow as the reader
interventions_dataset = pd.read_parquet(path=file_path_11, engine='pyarrow')

In [2]:
import pandas as pd
import plotly.express as px

# Hourly profile of a single event type.
#
# Reads `interventions_dataset` (loaded in an earlier cell) and uses its
# `t0_Hour` and `eventtype_trip` columns.

# Event type to profile. Change this one value to look at another type; the
# plot title below follows it automatically. Other codes explored with this cell:
#   'P022 - Intoxication medication'
#   'P010 - Respiratory problems'
#   'P031 - Psychiatric problem'
#   'P020 - Intoxication alcohol'
#   'P067 - Social problem'
#   'P013 - Non-traumatic back pain'
selected_event_type = 'P002 - Agression - fight - rape'

# NOTE: the variable name is historical -- it holds the rows of whichever event
# type is selected above, not only intoxication events. It is kept unchanged so
# that anything else referring to it keeps working.
intoxication_events = interventions_dataset[interventions_dataset['eventtype_trip'] == selected_event_type]

# Count the matching events for each t0_Hour value
hourly_counts = intoxication_events.groupby('t0_Hour').size().reset_index(name='event_count')

# Build the title from the selected event type so the label can never disagree
# with the filter (it previously always said "Intoxication Medication").
fig = px.line(
    hourly_counts,
    x='t0_Hour',
    y='event_count',
    title=f'Total Events of {selected_event_type} per Hour',
)
fig.update_xaxes(title='Hour of the Day')
fig.update_yaxes(title='Total Number of Events')
fig.show()

In [3]:
import pandas as pd
import plotly.express as px

# Daily event volume, split by event level.

# Make sure `t0` holds timestamps, then derive the calendar day from it.
# Both columns are written back onto `interventions_dataset`.
interventions_dataset['t0'] = pd.to_datetime(interventions_dataset['t0'])
interventions_dataset['t0_date'] = interventions_dataset['t0'].dt.date

# One row per (day, event level) pair, with the number of rows in that pair
event_counts = (
    interventions_dataset
    .groupby(['t0_date', 'eventlevel_trip'])
    .size()
    .reset_index(name='event_count')
)

# Interactive line chart: one coloured line per event level
fig = px.line(
    event_counts,
    x='t0_date',
    y='event_count',
    color='eventlevel_trip',
    title='Total Number of Events by Event Level Over Time',
    labels={
        't0_date': 'Date',
        'event_count': 'Total Number of Events',
        'eventlevel_trip': 'Event Level',
    },
)
fig.show()


In [4]:
import pandas as pd
import plotly.graph_objects as go
from scipy.stats import zscore

# Daily counts per event type, restricted to event types that have at least
# one outlier day (|z-score| > 3).

# Ensure `t0` is a datetime column and (re)derive the calendar day from it
interventions_dataset['t0'] = pd.to_datetime(interventions_dataset['t0'])
interventions_dataset['t0_date'] = interventions_dataset['t0'].dt.date

# Number of rows for every (day, event type) combination
event_counts = (
    interventions_dataset
    .groupby(['t0_date', 'eventtype_trip'])
    .size()
    .reset_index(name='event_count')
)

# Standardise the daily counts separately within each event type
event_counts['z_score'] = (
    event_counts
    .groupby('eventtype_trip')['event_count']
    .transform(lambda counts: zscore(counts))
)

# Event types with at least one day more than 3 standard deviations from their mean
is_outlier_day = event_counts['z_score'].abs() > 3
outlier_event_types = event_counts.loc[is_outlier_day, 'eventtype_trip'].unique()

# Keep every daily count that belongs to one of those event types
event_counts_filtered = event_counts[event_counts['eventtype_trip'].isin(outlier_event_types)]

# Build the figure with plotly graph_objects, one line per event type.
# Each trace is created with visible='legendonly', i.e. switched on from the legend.
fig = go.Figure()
for event_type in event_counts_filtered['eventtype_trip'].unique():
    rows_for_type = event_counts_filtered[event_counts_filtered['eventtype_trip'] == event_type]
    fig.add_trace(
        go.Scatter(
            x=rows_for_type['t0_date'],
            y=rows_for_type['event_count'],
            mode='lines',
            name=event_type,
            visible='legendonly',
        )
    )

# Title and axis labels
fig.update_layout(
    title='Total Number of Events by Event Type Over Time (Outliers Only)',
    xaxis_title='Date',
    yaxis_title='Total Number of Events'
)

fig.show()

In [5]:
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Daily counts for the five most frequent event types.
# Expects `interventions_dataset` to have been loaded by an earlier cell.

# Parse `t0` as a timestamp using an explicit 'YYYY-MM-DD HH:MM:SS.ffffff' format
interventions_dataset['t0'] = pd.to_datetime(interventions_dataset['t0'], format='%Y-%m-%d %H:%M:%S.%f')

# Restrict the data to the 5 event types with the most rows
top_5_event_types = interventions_dataset['eventtype_trip'].value_counts().head(5).index
is_top_type = interventions_dataset['eventtype_trip'].isin(top_5_event_types)
filtered_df = interventions_dataset[is_top_type]

# Table with one row per calendar day and one column per event type;
# combinations that never occur are filled with 0
per_day_and_type = filtered_df.groupby([filtered_df['t0'].dt.date, 'eventtype_trip']).size()
grouped = per_day_and_type.unstack().fillna(0)

# Sum rows that share the same index label.
# NOTE(review): the index is presumably already unique per day after the
# unstack above, which would make this step a no-op -- confirm before removing.
daily_totals = grouped.groupby(grouped.index).sum()

# One line trace per event-type column
traces = [
    go.Scatter(x=daily_totals.index, y=daily_totals[event_type], mode='lines', name=event_type)
    for event_type in daily_totals.columns
]

# Single-panel figure holding all traces
fig = make_subplots(rows=1, cols=1)
for trace in traces:
    fig.add_trace(trace)

# Title, axis labels and legend heading
fig.update_layout(
    title='Total Number of Top 5 Event Types by Day',
    xaxis_title='Date',
    yaxis_title='Total Number of Occurrences',
    legend_title='Event Types'
)

fig.show()

In [6]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Event types whose daily counts swing the most, measured as the spread
# between their highest and lowest daily z-score.
# Expects `interventions_dataset` to have been loaded by an earlier cell.

# Parse `t0` as a timestamp using an explicit 'YYYY-MM-DD HH:MM:SS.ffffff' format
interventions_dataset['t0'] = pd.to_datetime(interventions_dataset['t0'], format='%Y-%m-%d %H:%M:%S.%f')

# Day-by-event-type table of row counts; combinations that never occur become 0
event_day = interventions_dataset['t0'].dt.date
grouped = interventions_dataset.groupby([event_day, 'eventtype_trip']).size().unstack().fillna(0)

# Total number of occurrences of each event type over all days
event_type_totals = grouped.sum()

# Keep only the event types with at least 10 occurrences in total
has_enough_events = event_type_totals >= 10
event_types_to_keep = event_type_totals[has_enough_events].index
grouped_filtered = grouped[event_types_to_keep]

# Column-wise z-scores of the daily counts
# (uses pandas' .std(), whose default is the sample standard deviation, ddof=1)
column_means = grouped_filtered.mean()
column_stds = grouped_filtered.std()
z_scores = (grouped_filtered - column_means) / column_stds

# Range of the z-scores (max minus min) for each event type
z_score_diff = z_scores.max() - z_scores.min()

# The 10 event types with the widest z-score range; change the 10 to plot more or fewer
event_types_to_plot = z_score_diff.nlargest(10).index

# Daily counts of just those event types
filtered_grouped = grouped_filtered[event_types_to_plot]

# One line trace per selected event type
traces = [
    go.Scatter(x=filtered_grouped.index, y=filtered_grouped[event_type], mode='lines', name=event_type)
    for event_type in filtered_grouped.columns
]

# Single-panel figure holding all traces
fig = make_subplots(rows=1, cols=1)
for trace in traces:
    fig.add_trace(trace)

# Title, axis labels and legend heading
fig.update_layout(
    title='Total Number of Event Types with Largest Z-Score Differences by Day',
    xaxis_title='Date',
    yaxis_title='Total Number of Occurrences',
    legend_title='Event Types'
)

fig.show()


In [7]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Event types whose daily counts are most strongly correlated (in either
# direction) with their position in the date-ordered table.
# Expects `interventions_dataset` to have been loaded by an earlier cell.

# Parse `t0` as a timestamp using an explicit 'YYYY-MM-DD HH:MM:SS.ffffff' format
interventions_dataset['t0'] = pd.to_datetime(interventions_dataset['t0'], format='%Y-%m-%d %H:%M:%S.%f')

# Day-by-event-type table of row counts; combinations that never occur become 0
event_day = interventions_dataset['t0'].dt.date
grouped = interventions_dataset.groupby([event_day, 'eventtype_trip']).size().unstack().fillna(0)

# Correlate each event type's daily counts with the row position of the day
# (0, 1, 2, ...), which serves here as the time axis
day_position = pd.Series(range(len(grouped)), index=grouped.index)
correlations = grouped.apply(lambda daily_counts: daily_counts.corr(day_position), axis=0)

# Rank the event types by absolute correlation, strongest first
abs_correlations = correlations.abs()
sorted_event_types = abs_correlations.sort_values(ascending=False).index

# Take the 10 event types with the strongest correlation (positive or negative)
top_10_event_types = sorted_event_types[:10]

# Daily counts of just those event types
filtered_grouped = grouped[top_10_event_types]

# Single-panel figure with one line per selected event type
fig = make_subplots(rows=1, cols=1)

for event_type in filtered_grouped.columns:
    daily_line = go.Scatter(
        x=filtered_grouped.index,
        y=filtered_grouped[event_type],
        mode='lines',
        name=event_type,
    )
    fig.add_trace(daily_line)

# Title, axis labels and legend heading
fig.update_layout(
    title='Trending Event Types Over Time',
    xaxis_title='Date',
    yaxis_title='Number of Events',
    legend_title='Event Types'
)

fig.show()