Notebook to create "top 5" or "top 10" lists of correlations between specified variables

In [1]:
import pandas as pd

# Path to your Parquet gzip file
file_path_11 = '../../1_Data/CLEANED/interventions_dataset.parquet'

# Read the Parquet file into a pandas DataFrame
interventions_dataset = pd.read_parquet(file_path_11, engine='pyarrow')

In [2]:
import pandas as pd

# Create dummy variables for eventtype_trip
eventtype_trip_dummies = pd.get_dummies(interventions_dataset['eventtype_trip'], prefix='eventtype')

# Create dummy variables for eventlevel_trip
eventlevel_trip_dummies = pd.get_dummies(interventions_dataset['eventlevel_trip'], prefix='eventlevel')

# Concatenate eventtype_trip and eventlevel_trip dummy variables
dummies = pd.concat([eventtype_trip_dummies, eventlevel_trip_dummies], axis=1)

# Calculate correlations between eventtype_trip dummy variables and eventlevel_trip dummy variables
correlations = {}
for column in eventlevel_trip_dummies.columns:
    correlations[column] = {}
    for dummy_column in eventtype_trip_dummies.columns:
        correlation = dummies[column].corr(dummies[dummy_column])
        correlations[column][dummy_column] = correlation

# Print top 5 correlations for each eventlevel_trip
for event_level, corr in correlations.items():
    sorted_corr = sorted(corr.items(), key=lambda x: abs(x[1]), reverse=True)[:5]
    print(f"Top 5 correlations for {event_level}:")
    for dummy, correlation in sorted_corr:
        print(f"{dummy}: {correlation}")
    print()


Top 5 correlations for eventlevel_5Z:
eventtype_P092 - MIP: 0.5835241120026031
eventtype_P033 - Trauma: -0.0033438337656477492
eventtype_P010 - Respiratory problems: -0.0024081923999367365
eventtype_P026 - Unclear problem: -0.002297668454922378
eventtype_P019 - Unconscious - syncope: -0.002053643402738778

Top 5 correlations for eventlevel_AG:
eventtype_Y_BR: 0.025027720755608187
eventtype_P033 - Trauma: -0.00042127083488380636
eventtype_P010 - Respiratory problems: -0.00030339463441767965
eventtype_P026 - Unclear problem: -0.0002894703433631489
eventtype_P019 - Unconscious - syncope: -0.0002587269976495963

Top 5 correlations for eventlevel_B:
eventtype_Y_BR: 0.9993716795657169
eventtype_Z_BR: 0.02502772075560805
eventtype_P033 - Trauma: -0.01683216937720532
eventtype_P010 - Respiratory problems: -0.012122343755560854
eventtype_P026 - Unclear problem: -0.011565989016329931

Top 5 correlations for eventlevel_BI:
eventtype_P096 - Out of service: 0.012421479198008017
eventtype_P033 - Tra

In [5]:
import pandas as pd

# Create dummy variables for eventtype_trip
eventtype_trip_dummies = pd.get_dummies(interventions_dataset['eventtype_trip'], prefix='eventtype')

# Create dummy variables for eventlevel_trip
eventlevel_trip_dummies = pd.get_dummies(interventions_dataset['vector_type'], prefix='vector')

# Concatenate eventtype_trip and eventlevel_trip dummy variables
dummies = pd.concat([eventtype_trip_dummies, eventlevel_trip_dummies], axis=1)

# Calculate correlations between eventtype_trip dummy variables and eventlevel_trip dummy variables
correlations = {}
for column in eventlevel_trip_dummies.columns:
    correlations[column] = {}
    for dummy_column in eventtype_trip_dummies.columns:
        correlation = dummies[column].corr(dummies[dummy_column])
        correlations[column][dummy_column] = correlation

# Print top 5 correlations for each eventlevel_trip
for event_level, corr in correlations.items():
    sorted_corr = sorted(corr.items(), key=lambda x: abs(x[1]), reverse=True)[:5]
    print(f"Top 5 correlations for {event_level}:")
    for dummy, correlation in sorted_corr:
        print(f"{dummy}: {correlation}")
    print()

Top 5 correlations for vector_Ambulance:
eventtype_P011 - Chest pain: -0.12970625473470945
eventtype_P019 - Unconscious - syncope: -0.10366833558599367
eventtype_P026 - Unclear problem: 0.09345632161146063
eventtype_P015 - Epilepsy - convulsions: -0.08428175683624625
eventtype_P099 - Interhospital transport: -0.07805497899706929

Top 5 correlations for vector_Ambulance Disaster:
eventtype_P090 - Test MED: 0.03462675847454622
eventtype_P020 - Intoxication alcohol: 0.00555827632482904
eventtype_P006 - Burns: 0.0038305271241210097
eventtype_P033 - Trauma: 0.0031843605702535525
eventtype_P063 - Eye problems: 0.0031591898493447917

Top 5 correlations for vector_Ambulance Event:
eventtype_P096 - Out of service: 0.02670418886830067
eventtype_P090 - Test MED: 0.025913875271851947
eventtype_P020 - Intoxication alcohol: 0.023022407031443915
eventtype_P021 - Intoxication drugs: 0.01363529089761092
eventtype_P033 - Trauma: 0.011169711421737588

Top 5 correlations for vector_Ambulance Exceptional:


In [7]:
import numpy as np

# 1. Create t0_NameDay_Hour from t0_NameDay and t0_Hour
interventions_dataset['t0_DayName_Hour'] = interventions_dataset['t0_DayName'] + '_' + interventions_dataset['t0_Hour'].astype(str)

# 2. Create t0_Month_Day from t0_Month and t0_Day
interventions_dataset['t0_Month_Day'] = interventions_dataset['t0_Month'].astype(str) + '_' + interventions_dataset['t0_Day'].astype(str)

# 3. Convert eventtype_trip to dummy variables
eventtype_trip_dummies = pd.get_dummies(interventions_dataset['eventtype_trip'], prefix='eventtype')

# 4. Calculate correlations
correlations = {}
for column in ['t0_Hour', 't0_Day', 't0_Month', 't0_DayName', 't0_DayName_Hour', 't0_Month_Day']:
    correlations[column] = {}
    for dummy_column in eventtype_trip_dummies.columns:
        # Check if both columns contain numeric data
        if pd.api.types.is_numeric_dtype(eventtype_trip_dummies[dummy_column]) and pd.api.types.is_numeric_dtype(interventions_dataset[column]):
            # Calculate correlation
            correlations[column][dummy_column] = eventtype_trip_dummies[dummy_column].corr(interventions_dataset[column])
        else:
            # If one or both columns contain non-numeric data, assign NaN
            correlations[column][dummy_column] = np.nan

# 5. Show top 10 correlations for each column
for column, corr in correlations.items():
    top_10_corr = pd.Series(corr).abs().nlargest(10)
    print(f"Top 10 correlations for {column}:")
    print(top_10_corr)


Top 10 correlations for t0_Hour:
eventtype_P031 - Psychiatric problem                     0.029755
eventtype_P022 - Intoxication medication                 0.028130
eventtype_P010 - Respiratory problems                    0.026091
eventtype_P020 - Intoxication alcohol                    0.025460
eventtype_P013 - Non-traumatic back pain                 0.023984
eventtype_P012 - Non-traumatic abdominal pain            0.019397
eventtype_P029 - Obstruction of the respiratory tract    0.018260
eventtype_P097 - Collocation (planned)                   0.017743
eventtype_P032 - Allergic reactions                      0.017625
eventtype_P067 - Social problem                          0.016892
dtype: float64
Top 10 correlations for t0_Day:
eventtype_FI B(1.4.1) fire confined space IFDP           0.003811
eventtype_P028 - Drowning - diving accident              0.003730
eventtype_Y_TI                                           0.003619
eventtype_P072 - Sick child < 15 years with fever        0.003

In [8]:
import pandas as pd
import numpy as np

# Assuming interventions_dataset is your DataFrame

# Convert 't0' column to datetime format
interventions_dataset['t0'] = pd.to_datetime(interventions_dataset['t0'])

# Create dummy variables for eventtype_trip
eventtype_trip_dummies = pd.get_dummies(interventions_dataset['eventtype_trip'], prefix='eventtype')

# Create dummy variables for t0_Month
t0_month_dummies = pd.get_dummies(interventions_dataset['t0'].dt.month, prefix='t0_Month')

# Create dummy variables for t0_Day
t0_day_dummies = pd.get_dummies(interventions_dataset['t0'].dt.day, prefix='t0_Day')

# Create dummy variables for t0_DayName_Hour
interventions_dataset['t0_DayName_Hour'] = interventions_dataset['t0'].dt.day_name() + '_' + interventions_dataset['t0'].dt.hour.astype(str)
t0_dayname_hour_dummies = pd.get_dummies(interventions_dataset['t0_DayName_Hour'], prefix='t0_DayName_Hour')

# Create dummy variables for t0_Month_Day
t0_month_day_dummies = pd.get_dummies(interventions_dataset['t0'].dt.strftime('%m_%d'), prefix='t0_Month_Day')

# Concatenate all dummy variables
dummies = pd.concat([eventtype_trip_dummies, t0_month_dummies, t0_day_dummies, t0_dayname_hour_dummies, t0_month_day_dummies], axis=1)

# Calculate correlations between time dummy variables and eventtype_trip dummy variables
correlations = {}
for column in eventtype_trip_dummies.columns:
    correlations[column] = {}
    for dummy_column in dummies.columns:
        # Check if both columns contain numeric data
        if pd.api.types.is_numeric_dtype(dummies[dummy_column]) and pd.api.types.is_numeric_dtype(dummies[column]):
            # Calculate correlation
            correlations[column][dummy_column] = dummies[dummy_column].corr(dummies[column])
        else:
            # If one or both columns contain non-numeric data, assign NaN
            correlations[column][dummy_column] = np.nan

# Show top 10 correlations for each column
for column, corr in correlations.items():
    top_10_corr = pd.Series(corr).abs().nlargest(10)
    print(f"Top 10 correlations for {column}:")
    print(top_10_corr)


Top 10 correlations for eventtype_112:
eventtype_112                   1.000000
t0_Month_Day_04_26              0.005499
t0_Month_Day_08_25              0.005423
t0_Month_Day_03_14              0.005377
t0_Month_Day_09_26              0.005375
t0_DayName_Hour_Thursday_2.0    0.005348
t0_Month_Day_01_26              0.005229
t0_Month_Day_01_23              0.005202
t0_Month_Day_10_23              0.005187
t0_Month_Day_09_23              0.005162
dtype: float64
Top 10 correlations for eventtype_ALGEMENE:
eventtype_ALGEMENE                1.000000
t0_Month_Day_03_15                0.006336
t0_Month_Day_06_12                0.006008
t0_Month_Day_11_26                0.005917
t0_DayName_Hour_Monday_7.0        0.004117
t0_DayName_Hour_Wednesday_19.0    0.003574
t0_DayName_Hour_Saturday_16.0     0.003392
t0_DayName_Hour_Sunday_19.0       0.003367
t0_Month_Day_04_10                0.003137
t0_Month_Day_11_03                0.003128
dtype: float64
Top 10 correlations for eventtype_FI (1.11.0) f

In [9]:
import pandas as pd
import numpy as np

# Assuming interventions_dataset is your DataFrame

# Convert 't0' column to datetime format
interventions_dataset['t0'] = pd.to_datetime(interventions_dataset['t0'])

# Create dummy variables for eventtype_trip
eventtype_trip_dummies = pd.get_dummies(interventions_dataset['eventtype_trip'], prefix='eventtype')

# Create dummy variables for t0_Month
t0_month_dummies = pd.get_dummies(interventions_dataset['t0'].dt.month, prefix='t0_Month')

# Create dummy variables for t0_Day
t0_day_dummies = pd.get_dummies(interventions_dataset['t0'].dt.day, prefix='t0_Day')

# Create dummy variables for t0_DayName_Hour
interventions_dataset['t0_DayName_Hour'] = interventions_dataset['t0'].dt.day_name() + '_' + interventions_dataset['t0'].dt.hour.astype(str)
t0_dayname_hour_dummies = pd.get_dummies(interventions_dataset['t0_DayName_Hour'], prefix='t0_DayName_Hour')

# Create dummy variables for t0_Month_Day
t0_month_day_dummies = pd.get_dummies(interventions_dataset['t0'].dt.strftime('%m_%d'), prefix='t0_Month_Day')

# Concatenate all dummy variables
dummies = pd.concat([eventtype_trip_dummies, t0_month_dummies, t0_day_dummies, t0_dayname_hour_dummies, t0_month_day_dummies], axis=1)

# Calculate correlations between time dummy variables and eventtype_trip dummy variables
correlations = {}
for column in eventtype_trip_dummies.columns:
    correlations[column] = {}
    for dummy_column in dummies.columns:
        # Exclude comparisons between dummy variables of the same category
        if not (column.startswith('eventtype_') and dummy_column.startswith('eventtype_')):
            # Check if both columns contain numeric data
            if pd.api.types.is_numeric_dtype(dummies[dummy_column]) and pd.api.types.is_numeric_dtype(dummies[column]):
                # Calculate correlation
                correlation = dummies[dummy_column].corr(dummies[column])
                if abs(correlation) > 0.01:
                    correlations[column][dummy_column] = correlation
            else:
                # If one or both columns contain non-numeric data, assign NaN
                correlations[column][dummy_column] = np.nan

# Show correlations above 0.01 for each column
for column, corr in correlations.items():
    if corr:
        top_corr = pd.Series(corr).abs().nlargest(10)
        print(f"Top 10 correlations for {column}:")
        print(top_corr)


Top 10 correlations for eventtype_FI (1.11.0) fire train/tram/metro:
t0_Month_Day_09_20              0.019170
t0_DayName_Hour_Tuesday_16.0    0.011106
dtype: float64
Top 10 correlations for eventtype_FI (1.3.0) fire building:
t0_Month_7.0    0.010894
t0_Month_8.0    0.010818
t0_Month_6.0    0.010720
dtype: float64
Top 10 correlations for eventtype_FI (1.3.1) fire high voltage:
t0_Month_Day_02_09                0.018605
t0_Month_Day_02_15                0.018563
t0_DayName_Hour_Thursday_17.0     0.011169
t0_DayName_Hour_Wednesday_13.0    0.011028
dtype: float64
Top 10 correlations for eventtype_FI (1.4.0) fire confined space:
t0_Month_Day_01_18    0.012201
dtype: float64
Top 10 correlations for eventtype_FI (1.5.0) fire odour/check:
t0_Month_Day_03_31             0.019317
t0_DayName_Hour_Friday_17.0    0.011266
dtype: float64
Top 10 correlations for eventtype_FI (1.7.0) fire industry:
t0_Month_Day_03_31             0.027318
t0_DayName_Hour_Friday_17.0    0.015933
t0_Day_31.0            

In [11]:
# Make dummies for categorical variables
df_dummies = pd.get_dummies(interventions_dataset, columns=['vector_type', 'eventtype_firstcall', 'eventlevel_firstcall', 'eventtype_trip', 'eventlevel_trip', 
                                                            'province_intervention', 'number_of_transported_persons', 'abandon_reason'])

# Assuming df_interventions1 is your DataFrame
correlation_matrix = df_dummies.corr()

# Filter the correlation matrix to include only correlations higher than 0.15 but lower than 1
filtered_correlation_matrix = correlation_matrix[
    (correlation_matrix > 0.2) & (correlation_matrix < 1)
]

# Create a list to store the correlation pairs and values
correlation_list = []

# Iterate over rows and columns of the filtered correlation matrix
for col in filtered_correlation_matrix.columns:
    for index, value in filtered_correlation_matrix[col].items():
        # Check if the correlation value is not NaN and append to the list
        if not pd.isnull(value):
            if value > 0.2 and value < 1:
                correlation_list.append((col, index, value))

# Sort the correlation list based on correlation values in descending order
correlation_list.sort(key=lambda x: x[2], reverse=True)

# Print the ranked correlations
for pair in correlation_list:
    print(f"{pair[0]} - {pair[1]}: {pair[2]}")

  correlation_matrix = df_dummies.corr()


t0_Month - t7_Month: 0.9987145619761723
t7_Month - t0_Month: 0.9987145619761723
longitude_permanence - longitude_intervention: 0.9949822546641864
longitude_intervention - longitude_permanence: 0.9949822546641864
latitude_permanence - latitude_intervention: 0.9918798187957619
latitude_intervention - latitude_permanence: 0.9918798187957619
eventtype_firstcall_P035 - Convulsions child < 7 years old - eventtype_trip_P035 - Convulsions child < 7 years old: 0.9898003584962839
eventtype_trip_P035 - Convulsions child < 7 years old - eventtype_firstcall_P035 - Convulsions child < 7 years old: 0.9898003584962839
intervention_time_(t1reported) - unavailable_time: 0.9845706710350748
unavailable_time - intervention_time_(t1reported): 0.9845706710350748
waiting_time - unavailable_time: 0.981745200818155
unavailable_time - waiting_time: 0.981745200818155
eventtype_firstcall_P090 - Test MED - eventtype_trip_P090 - Test MED: 0.9798770396811508
eventtype_trip_P090 - Test MED - eventtype_firstcall_P090 -

In [12]:
import pandas as pd

# Assuming df_interventions1 is your DataFrame
selected_columns = ['vector_type', 'eventtype_trip']

# Selecting the relevant columns
df_selected = interventions_dataset[selected_columns]

# Encoding categorical variables using one-hot encoding
df_encoded = pd.get_dummies(df_selected)

# Calculating the correlation matrix
correlation_matrix = df_encoded.corr()

# Selecting the correlation values for vector_type_MUG and the dummy variables of eventtype_trip
mug_correlation = correlation_matrix['vector_type_MUG'].loc[df_encoded.columns[df_encoded.columns.str.startswith('eventtype_trip')]]

# Sorting the correlations in descending order
top_5_correlations = mug_correlation.sort_values(ascending=False).head(5)

# Create a DataFrame to display the results
top_5_correlations_df = pd.DataFrame({'Eventtype_Trip': top_5_correlations.index, 'Correlation': top_5_correlations.values})

# Display the DataFrame
print(top_5_correlations_df)


                                  Eventtype_Trip  Correlation
0               eventtype_trip_P011 - Chest pain     0.145019
1    eventtype_trip_P019 - Unconscious - syncope     0.121500
2           eventtype_trip_P003 - Cardiac arrest     0.104183
3   eventtype_trip_P015 - Epilepsy - convulsions     0.088069
4  eventtype_trip_P099 - Interhospital transport     0.062310


In [13]:
import pandas as pd

# Assuming df_interventions1 is your DataFrame
selected_columns = ['vector_type', 'eventtype_trip']

# Selecting the relevant columns
df_selected = interventions_dataset[selected_columns]

# Encoding categorical variables using one-hot encoding
df_encoded = pd.get_dummies(df_selected)

# Calculating the correlation matrix
correlation_matrix = df_encoded.corr()

# Selecting the correlation values for vector_type_MUG and the dummy variables of eventtype_trip
mug_correlation = correlation_matrix['vector_type_Brandziekenwagen'].loc[df_encoded.columns[df_encoded.columns.str.startswith('eventtype_trip')]]

# Sorting the correlations in descending order
top_5_correlations = mug_correlation.sort_values(ascending=False).head(5)

# Create a DataFrame to display the results
top_5_correlations_df = pd.DataFrame({'Eventtype_Trip': top_5_correlations.index, 'Correlation': top_5_correlations.values})

# Display the DataFrame
print(top_5_correlations_df)

                                      Eventtype_Trip  Correlation
0            eventtype_trip_FI (1.3.0) fire building     0.416898
1                eventtype_trip_HG (2.1.1) gas odour     0.315336
2                 eventtype_trip_HG (2.1.2) gas leak     0.242792
3          eventtype_trip_TI (3.3.2) CO intoxication     0.232571
4  eventtype_trip_TI (3.3.3) rescue from danger t...     0.112500


In [14]:
print(interventions_dataset.columns)

Index(['mission_id', 'service_name', 'postalcode_permanence',
       'cityname_permanence', 'streetname_permanence',
       'housenumber_permanence', 'latitude_permanence', 'longitude_permanence',
       'permanence_short_name', 'permanence_long_name', 'vector_type',
       'eventtype_firstcall', 'eventlevel_firstcall', 'eventtype_trip',
       'eventlevel_trip', 'postalcode_intervention', 'cityname_intervention',
       'latitude_intervention', 'longitude_intervention', 't0', 't1',
       't1confirmed', 't2', 't3', 't4', 't5', 't6', 't7', 't9',
       'intervention_time_(t1reported)', 'waiting_time',
       'intervention_duration', 'departure_time_(t1reported)',
       'unavailable_time', 'name_destination_hospital',
       'postalcode_destination_hospital', 'cityname_destination_hospital',
       'streetname_destination_hospital', 'housenumber_destination_hospital',
       'calculated_traveltime_destinatio', 'calculated_distance_destination',
       'number_of_transported_persons', '

In [15]:
import pandas as pd

# Filter dataset to include only the relevant columns
columns_of_interest = ['t0_Month', 't0_DayName', 'eventtype_trip']
filtered_df = interventions_dataset[columns_of_interest]

# One-hot encode EventType Trip
filtered_df = pd.get_dummies(filtered_df, columns=['eventtype_trip'])

# Calculate correlation matrix
correlation_matrix = filtered_df.corr()

# Extract top 20 highest correlations
correlation_pairs = correlation_matrix.unstack().sort_values(ascending=False)
top_correlations = correlation_pairs[correlation_pairs < 1].head(20)

print("Top 20 highest correlations between month, and weekday vs event type:")
print(top_correlations)

  correlation_matrix = filtered_df.corr()


Top 20 highest correlations between month, and weekday vs event type:
eventtype_trip_P080 - COVID-19                                          t0_Month                                                                  0.016734
t0_Month                                                                eventtype_trip_P080 - COVID-19                                            0.016734
eventtype_trip_P072 - Sick child < 15 years with fever                  t0_Month                                                                  0.010740
t0_Month                                                                eventtype_trip_P072 - Sick child < 15 years with fever                    0.010740
                                                                        eventtype_trip_P073 - Sick child < 15 years with respiratory infection    0.007143
eventtype_trip_P073 - Sick child < 15 years with respiratory infection  t0_Month                                                                  0.007143


In [16]:
import pandas as pd

# Create dummy variables for event types
dummy_variables = pd.get_dummies(interventions_dataset['eventtype_trip'], prefix='event_type')

# Concatenate the original DataFrame with the dummy variables
interventions_dataset = pd.concat([interventions_dataset, dummy_variables], axis=1)

# Create a list of column names representing the dummy variables for event types
list_of_dummy_variables = dummy_variables.columns.tolist()

# Create a DataFrame containing only the relevant columns
relevant_columns = interventions_dataset[['t0_Hour', 't0_Day', 't0_Month'] + list_of_dummy_variables]

# Calculate the correlation matrix
correlation_matrix = relevant_columns.corr()

# Extract the correlations of t0_Hour, t0_Day, and t0_Month with the dummy variables for event types
correlations = correlation_matrix.loc[['t0_Hour', 't0_Day', 't0_Month'], list_of_dummy_variables]

# Get the absolute values of correlations
absolute_correlations = correlations.abs()

# Get the top 20 correlations
top_20_correlations = absolute_correlations.stack().nlargest(20)

print("Top 20 correlations between t0_Hour, t0_Day, t0_Month, and event types:")
print(top_20_correlations)

Top 20 correlations between t0_Hour, t0_Day, t0_Month, and event types:
t0_Hour   event_type_P031 - Psychiatric problem                                      0.029786
          event_type_P022 - Intoxication medication                                  0.028102
          event_type_P010 - Respiratory problems                                     0.026185
          event_type_P020 - Intoxication alcohol                                     0.025342
          event_type_P013 - Non-traumatic back pain                                  0.023986
          event_type_P012 - Non-traumatic abdominal pain                             0.019417
          event_type_P029 - Obstruction of the respiratory tract                     0.018270
          event_type_P032 - Allergic reactions                                       0.017699
          event_type_P097 - Collocation (planned)                                    0.017694
          event_type_P067 - Social problem                                        

In [17]:
import numpy as np

# Select the columns of interest
categorical_columns = ['vector_type', 'eventlevel_trip']

# Convert categorical variables into dummy variables
dummy_variables = pd.get_dummies(interventions_dataset[categorical_columns])

# Compute the correlation matrix
correlation_matrix = dummy_variables.corr()

# Filter out one of the pairs from the upper triangular part of the correlation matrix
correlation_matrix = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

# Extract correlation values for the dummy variables of interest
correlation_values = correlation_matrix.unstack().sort_values(ascending=False)

# Remove correlation values of 1 (correlation of a variable with itself) and convert to DataFrame
correlation_df = pd.DataFrame(correlation_values[correlation_values != 1], columns=['Correlation'])

# Get the top 5 highest correlations
top_20_correlations = correlation_df.head(20)

print(top_20_correlations)

                                                                            Correlation
eventlevel_trip_N5                   vector_type_Ambulance                     0.481206
eventlevel_trip_N1                   vector_type_MUG                           0.384335
eventlevel_trip_N4                   vector_type_PIT                           0.330386
eventlevel_trip_N3                   vector_type_PIT                           0.279339
eventlevel_trip_N2                   vector_type_MUG                           0.253294
eventlevel_trip_N3                   vector_type_MUG                           0.102896
eventlevel_trip_N0                   vector_type_MUG                           0.051595
eventlevel_trip_N3                   vector_type_Ambulance Exceptional         0.034914
eventlevel_trip_N6                   vector_type_Ambulance                     0.028088
eventlevel_trip_Buitendienststelling vector_type_MUG                           0.018055
eventlevel_trip_N1              

In [3]:
# Get dummy variables for province_intervention and eventtype_trip
province_dummies = pd.get_dummies(interventions_dataset['province_intervention'])
eventtype_dummies = pd.get_dummies(interventions_dataset['eventtype_trip'])

# Calculate correlations
correlations = pd.DataFrame(index=province_dummies.columns, columns=eventtype_dummies.columns)
for province_column in province_dummies.columns:
    for eventtype_column in eventtype_dummies.columns:
        correlation = province_dummies[province_column].corr(eventtype_dummies[eventtype_column])
        correlations.loc[province_column, eventtype_column] = correlation

# Get top 10 correlations for each dummy variable of province_intervention
top_correlations = {}
for province_column in correlations.index:
    top_correlations[province_column] = correlations.loc[province_column].sort_values(ascending=False).head(10)

# Print top 10 correlations for each dummy variable of province_intervention
for province_column, top_corr in top_correlations.items():
    print(f"Top 10 correlations for {province_column}:")
    print(top_corr)
    print()

Top 10 correlations for ANT:
P099 - Interhospital transport                   0.033762
P033 - Trauma                                    0.021781
P001 - Traffic accident                          0.021497
P095 - Preventive ambulance                      0.020297
P003 - Cardiac arrest                             0.01965
P096 - Out of service                            0.017052
P097 - Collocation (planned)                     0.013344
P008 - Patient with defibrillator - pacemaker    0.010869
P028 - Drowning - diving accident                0.010744
P007 - Fall from great height (> 3 meters)       0.009522
Name: ANT, dtype: object

Top 10 correlations for BRW:
P067 - Social problem                     0.032333
P026 - Unclear problem                    0.022136
P069 - Wounds                             0.020927
P095 - Preventive ambulance               0.013654
P096 - Out of service                     0.011624
P038 - Person does not answer the call    0.007741
P011 - Chest pain             