In [1]:
import psycopg2
import pandas as pd
import csv
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


In [2]:
def connect_to_database():
    """Open a psycopg2 connection to the project's PostgreSQL server.

    Connection settings are read from the environment (``DB_HOST``, ``DB_PORT``,
    ``DB_NAME``, ``DB_USER``, ``DB_PASSWORD``) so that the password is not stored
    in the notebook. Host, port, database and user fall back to the values this
    notebook was written against; the password has no fallback.

    Returns:
        An open psycopg2 connection.

    Raises:
        KeyError: if ``DB_PASSWORD`` is not set.
        Exception: whatever ``psycopg2.connect`` raises. The error is reported
            and re-raised so that callers never receive ``None``.
    """
    import os

    db_params = {
        'host': os.environ.get('DB_HOST', '194.171.191.226'),
        'port': os.environ.get('DB_PORT', '6379'),
        'database': os.environ.get('DB_NAME', 'postgres'),
        'user': os.environ.get('DB_USER', 'group6'),
        'password': os.environ['DB_PASSWORD'],
    }
    try:
        conn_psycopg2 = psycopg2.connect(**db_params)
    except Exception as e:
        print('Connection was not successful!')
        print(e)
        # Re-raise: returning None here only defers the failure to the first
        # cursor() call, with a much less helpful error message.
        raise
    print('Connection was successful!')
    return conn_psycopg2

    

In [3]:

def create_cursor(connection):
    """Return a new cursor obtained from ``connection``."""
    new_cursor = connection.cursor()
    return new_cursor


def close_cursor(cursor):
    """Close ``cursor``."""
    cursor.close()


def close_connection(connection):
    """Close ``connection``."""
    connection.close()

In [4]:
def init_database_connection(func):
    """Decorator that runs ``func`` with a freshly opened database cursor.

    The wrapped function receives the cursor as its first positional argument,
    followed by whatever positional and keyword arguments the caller passed.
    The cursor and the connection are closed afterwards, even when ``func``
    raises.

    Args:
        func: callable whose first parameter is a database cursor.

    Returns:
        A wrapper with the same name/docstring as ``func`` that returns
        whatever ``func`` returns.

    Raises:
        RuntimeError: if ``connect_to_database`` yields no connection.
    """
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        connection = connect_to_database()
        if connection is None:
            raise RuntimeError('Could not connect to the database')
        try:
            cursor = create_cursor(connection)
            try:
                return func(cursor, *args, **kwargs)
            finally:
                close_cursor(cursor)
        finally:
            # Runs on success and on failure so connections are never leaked.
            close_connection(connection)

    return wrapper

In [5]:
@init_database_connection
def show_select_query_results(cursor, query, show_results = False):
    """Execute ``query`` on ``cursor`` and return every fetched row.

    The cursor is supplied by the ``init_database_connection`` decorator, so
    callers pass only ``query`` (and optionally ``show_results``).

    Args:
        cursor: open database cursor (injected by the decorator).
        query: SQL text to execute.
        show_results: when truthy, print the fetched rows as a whole and then
            one per line.

    Returns:
        The result of ``cursor.fetchall()``.
    """
    cursor.execute(query)
    fetched = cursor.fetchall()

    if not show_results:
        return fetched

    print('Results are here' ,fetched )
    for record in fetched:
        print(record)
    return fetched
    

In [6]:

def get_column_names(table_name):
    """Return the column names of ``data_lake.<table_name>`` in ordinal order.

    The names are looked up in ``information_schema.columns`` and returned as a
    flat numpy array.
    """
    column_query = f'''
    SELECT COLUMN_NAME
    FROM information_schema.columns
    WHERE table_schema ='data_lake'
    AND table_name ='{table_name}'
    ORDER BY ordinal_position
    '''
    name_rows = show_select_query_results(column_query)
    return np.array(name_rows).flatten()
    

In [7]:
def load_sql_to_df(table_name):
    """Load the whole ``data_lake.<table_name>`` table into a DataFrame.

    Column names come from ``get_column_names``; the rows come from a
    ``SELECT *`` issued through ``show_select_query_results``.
    """
    header = get_column_names(table_name).tolist()

    fetch_query = f'''
     SELECT * FROM data_lake.{table_name}
     ;

    '''

    table_rows = show_select_query_results(fetch_query)
    return pd.DataFrame(data = table_rows, columns = header)
    

In [8]:

safe_driving_df = load_sql_to_df('safe_driving')

We will ensure that categorical values do not contain unnecessary whitespace or other unexpected special characters; moreover, we will unify the value casing to lower case.


In [9]:


    
  

def clean_categorical_data(df):
    """Return a copy of ``df`` with normalised text columns.

    For every column whose dtype is ``object``:
      * values are stripped, have every character that is neither a word
        character nor whitespace removed, and are lower-cased;
      * the column name is lower-cased and its spaces become underscores.

    Columns of any other dtype (values and names) are left untouched. The input
    frame is not modified; previously the values were rewritten in place on the
    caller's frame while the renamed frame was a separate object.

    Args:
        df: DataFrame to clean.

    Returns:
        A new, cleaned DataFrame.
    """
    cleaned = df.copy()
    string_cols = [col for col in cleaned.columns if 'object' == str(cleaned[col].dtype)]

    for col in string_cols:
        cleaned[col] = (
            cleaned[col]
            .str.strip()
            .str.replace(r'[^\w\s]', '', regex=True)
            .str.lower()
        )

    # One rename call instead of building a new frame per column.
    new_names = {col: str(col).lower().replace(' ', '_') for col in string_cols}
    return cleaned.rename(columns=new_names)

    

    


In [10]:
safe_driving_df = clean_categorical_data(safe_driving_df)

In [11]:
#### checking missing values in dataset 

def print_line_break():
    """Print a three-line visual separator (``=`` bar, ``-`` bar, ``=`` bar)."""
    heavy_bar = '=' * 20
    light_bar = ' ' + '-' * 18 + ' '
    for bar in (heavy_bar, light_bar, heavy_bar):
        print(bar)

def show_dataframe_general_info(df):
    """Display ``df.info()``, ``df.describe()`` and a missing-value summary.

    Args:
        df: DataFrame to summarise.
    """
    display('General info of df')
    # info() prints its report itself and returns None, so wrapping it in
    # display() only added a spurious "None" to the output.
    df.info()
    display('Description of df')
    display(df.describe())

    check_df_missing_values(df)

    

def check_df_missing_values(df):
    """Display the total number of missing values in ``df``.

    When at least one value is missing, the per-column counts are displayed as
    well.
    """
    missing_per_column = df.isna().sum()
    total_missing_values = missing_per_column.sum()
    display('Total number of missing values: ', total_missing_values)

    if total_missing_values <= 0:
        return

    display('Number of missing values in particular columns: ')
    display(missing_per_column)


def show_value_counts(df,col):
    """Display the most and least frequent values of ``df[col]`` and plot it.

    * Up to six most frequent and up to five least frequent values are shown.
    * Columns whose dtype name starts with ``int`` or ``float`` get a box plot.
    * ``object`` columns with fewer than 20 distinct values get a count plot;
      when the column has missing values they are plotted as ``'unknown'`` and
      a dashed red line marks the number of missing values.

    Args:
        df: DataFrame holding the column.
        col: name of the column to summarise.
    """
    counts = df[col].value_counts()

    display(f'Value counts of {col}')
    display(counts.sort_values(ascending = False).iloc[:6])

    print_line_break()

    display(f'Least used values in {col} column: ')
    display(counts.sort_values(ascending = True).iloc[:5])

    unique_vals_in_col = len(pd.unique(df[col]))
    col_dtype = str(df[col].dtype)

    if col_dtype.startswith('int') or col_dtype.startswith('float'):
        fig, ax = plt.subplots(figsize =(14, 8))
        sns.boxplot(x = col, data = df, ax = ax)
        plt.show()
    elif unique_vals_in_col < 20 and col_dtype.startswith('object'):
        fig, ax = plt.subplots(figsize =(18, 8))

        missing_vals = df[col].isna().sum()

        if missing_vals > 0:
            ax.axhline(y = missing_vals, color ='r', linestyle='--', linewidth = 2, label='Missing values in df')
            ax.legend()
            # Only the plotted column needs the placeholder category.
            sns.countplot(x = col, data = df[[col]].replace({np.nan: 'unknown'}), ax = ax)
        else:
            # Previously this plot was drawn unconditionally, i.e. a second
            # time on top of the 'unknown' plot whenever values were missing.
            sns.countplot(x = col, data = df, ax = ax)

        plt.show()
 
        

 
                
    

def show_dataframe_column_value_counts(df):
    """Run ``show_value_counts`` on every column of ``df``.

    Each column is preceded by a line break; columns with missing values also
    get their missing-value count displayed.
    """
    for column_name in df.columns:
        print_line_break()
        show_value_counts(df, column_name)

        n_missing = df[column_name].isna().sum()
        if n_missing <= 0:
            continue

        display(f'Missing values in {column_name}')
        display(f'{column_name}: {n_missing}')
        



    

In [12]:
show_dataframe_general_info(safe_driving_df)

In [13]:
show_dataframe_column_value_counts(safe_driving_df)

In [14]:
def show_duplicated_values_in_column(df , col_name):
    """Display a report on repeated values in ``df[col_name]``.

    Shows how many entries are duplicates (count and percentage of all rows),
    the values that occur more than once with their counts, and every row
    holding one of those values.

    Args:
        df: DataFrame to inspect.
        col_name: column whose values are checked for repetition.
    """
    # The f-prefix was missing, so "{col_name}" used to be shown literally.
    display(f"Show duplicated values in column: {col_name}")
    total_duplicated_values = df[col_name].duplicated().sum()

    if total_duplicated_values <= 0:
        display('No duplicated values in this column !!!')
        return

    total_rows = df[col_name].shape[0]
    display(f'Duplicated values in {col_name} :')
    display('Number of duplicated values / all rows')
    duplicated_values_perc = round((total_duplicated_values / total_rows * 100), 2)
    display(f'{total_duplicated_values}/{total_rows} :  which is around {duplicated_values_perc}%')

    value_counts = df[col_name].value_counts().sort_values(ascending = False)
    repeated = value_counts[value_counts > 1]
    display(repeated)

    display('Show duplicated column rows :')
    # The repeated values are the index of the counts Series. Reading them via
    # reset_index()[col_name] depends on the pandas version (older versions put
    # the counts, not the values, under that column name).
    display(df[df[col_name].isin(repeated.index)])
        



In [15]:
def show_general_duplicate_values(df,col_name= None):
    """Report duplicates for one column, or fully duplicated rows of ``df``.

    With ``col_name`` the work is delegated to
    ``show_duplicated_values_in_column``; otherwise the number and percentage
    of fully duplicated rows is displayed.
    """
    if col_name is not None:
        show_duplicated_values_in_column(df,col_name)
        return

    row_count = df.shape[0]
    total_duplicated_values = df.duplicated().sum()

    if not total_duplicated_values > 0:
        display('No duplicated values in this dataframe !!!')
        return

    display('Duplicated values in df:')
    display('Number of duplicated values / all rows')
    duplicated_values_perc = round((total_duplicated_values / row_count * 100), 2)
    display(f'{total_duplicated_values}/{row_count} :  which is around {duplicated_values_perc}%')




def drop_duplicates_in_df(df,columns):
    """Drop, in place, rows of ``df`` that repeat a value in any of ``columns``.

    The columns are processed one after another (first occurrence kept), and
    after each one the remaining duplicates of that column are reported via
    ``show_duplicated_values_in_column``.

    Fixes relative to the previous version: the report now inspects ``df``
    instead of the global ``safe_driving_df``, the messages actually
    interpolate the column name, and an empty ``columns`` is a no-op instead of
    an ``IndexError``.

    Args:
        df: DataFrame to de-duplicate; it is modified in place.
        columns: column names to de-duplicate on, one at a time.

    Returns:
        None (``DataFrame.drop_duplicates(inplace=True)`` has no result, so the
        previous return value was always ``None`` as well).
    """
    for col in columns:
        display(f'Duplicated values in {col} after dropping them')
        print_line_break()
        df.drop_duplicates(subset=[col], inplace=True)
        show_duplicated_values_in_column(df, col)
        print_line_break()
    

In [16]:
show_general_duplicate_values(safe_driving_df)

In [17]:
show_duplicated_values_in_column(safe_driving_df , 'eventid')

In [18]:
show_duplicated_values_in_column(safe_driving_df , 'event_start')

Since most of the duplicated ids correspond to the same or very similar accidents, and the fraction of duplicated values is relatively insignificant, the duplicated rows will be dropped.

In [19]:
drop_duplicates_in_df(safe_driving_df, ['eventid','event_start'])

Now I will proceed to examine outliers 


In [20]:
safe_driving_df.dtypes

In [21]:
def plot_columns(df,columns , plot):
    """Draw one stacked subplot per column using the given plotting function.

    Args:
        df: DataFrame holding the data.
        columns: names of the columns to plot; when empty a message is
            displayed and nothing is drawn.
        plot: callable invoked as ``plot(x=<column Series>, ax=<Axes>)``
            (e.g. ``sns.boxplot`` or ``sns.countplot``).
    """
    if len(columns) == 0:
        display('No columns to plot')
        print_line_break()
        print_line_break()
        return

    cols_length = len(columns)

    # squeeze=False always yields a 2-D array of Axes, so the single-column
    # case needs no special handling (and no debug print of the axes object).
    fig, axes = plt.subplots(
        nrows = cols_length,
        ncols = 1,
        figsize = (12, cols_length * 6),
        sharex = False,
        sharey = False,
        squeeze = False,
    )

    for current_col, current_ax in zip(columns, axes.flatten()):
        current_ax.set_title(f'Column: {current_col}')
        plot(x = df[current_col], ax = current_ax)

    plt.show()
            
    

def plot_numeric_columns(df, columns):
    """Box-plot each of ``columns`` via ``plot_columns``."""
    plot_columns(df, columns, plot=sns.boxplot)


def plot_string_columns(df,columns):
    """Count-plot each of ``columns`` via ``plot_columns``."""
    plot_columns(df, columns, plot=sns.countplot)


def plot_bool_columns(df,columns):
    """Count-plot each of ``columns`` via ``plot_columns``."""
    plot_columns(df, columns, plot=sns.countplot)


def plot_value_distributions_in_df(df , columns_to_avoid = None ):
    """Plot the value distribution of every column of ``df``, grouped by dtype.

    Columns whose dtype name contains ``float`` or ``int`` are box-plotted;
    ``object`` and ``bool`` columns are count-plotted. Groups without any
    column are skipped.

    Args:
        df: DataFrame to plot.
        columns_to_avoid: names of columns to leave out. Defaults to none
            (``None`` replaces the former mutable ``[]`` default).
    """
    skipped = [] if columns_to_avoid is None else columns_to_avoid

    def columns_matching(predicate):
        # Helper: names of non-skipped columns whose dtype name satisfies predicate.
        return [col for col in df.columns if predicate(str(df[col].dtype)) and col not in skipped]

    numeric_cols = columns_matching(lambda dtype_name: 'float' in dtype_name or 'int' in dtype_name)
    string_cols = columns_matching(lambda dtype_name: dtype_name == 'object')
    bool_cols = columns_matching(lambda dtype_name: dtype_name == 'bool')

    if numeric_cols:
        display('Numerical columns plotted :')
        plot_numeric_columns(df, numeric_cols)
        print_line_break()
        print_line_break()
        print_line_break()

    if string_cols:
        display('String columns plotted :')
        plot_string_columns(df, string_cols)
        print_line_break()
        print_line_break()
        print_line_break()

    if bool_cols:
        display('Bool columns plotted :')
        plot_bool_columns(df, bool_cols)




    

    

In [22]:
plot_value_distributions_in_df(safe_driving_df , ['eventid', 'road_segment_id', 'latitude', 'longitude'])

In [23]:
def show_outliers_fraction(df, col , Q1,Q3,IQR, multiplier = 1.5):
    """Display how many rows of ``df[col]`` fall outside the IQR fences.

    A value is an outlier when it is below ``Q1 - multiplier * IQR`` or above
    ``Q3 + multiplier * IQR``.

    Args:
        df: DataFrame to inspect.
        col: column name.
        Q1, Q3, IQR: first quartile, third quartile and their difference.
        multiplier: fence width in IQR units (default 1.5, the value that was
            previously hard-coded here).
    """
    print_line_break()
    display(f'The fraction of outliers in {col}')

    outlier_mask = (df[col] < Q1 - multiplier * IQR) | (df[col] > Q3 + multiplier * IQR)
    outlier_count = int(outlier_mask.sum())
    if outlier_count <= 0:
        display(f'No outliers detected in {col} column')
        return

    # Scale to percent before rounding; rounding the fraction first limited the
    # result to whole percents and produced float noise such as 7.000000000000001.
    outlier_perc = round(outlier_count / df.shape[0] * 100, 2)
    display(f'{outlier_count}  / {df.shape[0]} which is around {outlier_perc}%')
    print_line_break()

def delete_outliers(df , columns , multiplier = 1.5):
    """Return a copy of ``df`` without rows outside the IQR fences of ``columns``.

    Columns are processed in order; quartiles for each column are computed on
    the rows that survived the previous columns. Rows whose value in a
    processed column is missing are removed too, because a missing value fails
    the bounds comparison.

    Args:
        df: source DataFrame (not modified).
        columns: numeric column names to filter on.
        multiplier: fence width in IQR units.

    Returns:
        The filtered DataFrame.
    """
    df_no_outliers = df.copy()

    for col in columns:
        Q1 = df_no_outliers[col].quantile(0.25)
        Q3 = df_no_outliers[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - (IQR * multiplier)
        upper_bound = Q3 + (IQR * multiplier)

        # Report on the same rows and with the same multiplier that the filter
        # below uses; previously the report looked at the unfiltered input
        # with a fixed 1.5 multiplier.
        show_outliers_fraction(df_no_outliers, col, Q1, Q3, IQR, multiplier)
        print(f"{col}: Q1={Q1}, Q3={Q3}, IQR={IQR}, Lower Bound={lower_bound}, Upper Bound={upper_bound}")

        df_no_outliers = df_no_outliers[df_no_outliers[col].between(lower_bound, upper_bound)]

    return df_no_outliers

Due to low amount of outliers located in dataset they will be removed using IQR and quantiles lower 0.25 and greater than 0.75

In [24]:
safe_driving_df = delete_outliers(safe_driving_df , ['end_speed_kmh' , 'speed_kmh' , 'duration_seconds'])

In [25]:
plot_value_distributions_in_df(safe_driving_df , ['eventid', 'road_segment_id', 'latitude', 'longitude'])

The is_valid, road_manager_type, road_number, road_manager_name, municipality_name and place_name columns do not provide much value, therefore they will be dropped.


In [26]:
def drop_columns_in_df(df, columns_to_drop):
    """Drop, in place, those of ``columns_to_drop`` that exist in ``df``.

    Names that are not columns of ``df`` are ignored.

    Args:
        df: DataFrame to modify in place.
        columns_to_drop: iterable of column names.
    """
    present = [col for col in columns_to_drop if col in df.columns]
    # Single drop call; the unused length variable of the old version is gone.
    df.drop(columns=present, inplace=True)

In [27]:

drop_columns_in_df(safe_driving_df , ['is_valid', 'road_manager_type', 'road_number' , 'road_manager_name', 'municipality_name' , 'place_name'])

### Let's simplify incident_severity column

In [28]:

def convert_column_to_binary(df, columns_with_new_values, keep_missing = False):
    """Collapse rare categories of the given columns into one value, in place.

    For every column, the ``top_values`` most frequent values are kept and
    every other value is replaced by ``new_value``.

    Args:
        df: DataFrame to modify in place.
        columns_with_new_values: mapping
            ``{column: {'top_values': int, 'new_value': replacement}}``.
        keep_missing: when False (default, the original behaviour) missing
            values are replaced by ``new_value`` as well; when True they stay
            missing.
    """
    for col, settings in columns_with_new_values.items():
        most_frequent_values = df[col].value_counts().index[0:settings['top_values']]

        # isin compares the actual values. The former ``str(row) in index``
        # test never matched for non-string columns, so every value of such a
        # column was replaced.
        keep_mask = df[col].isin(most_frequent_values)
        if keep_missing:
            keep_mask = keep_mask | df[col].isna()

        df[col] = df[col].where(keep_mask, settings['new_value'])


In [29]:
columns_with_new_values_dict = {
    'incident_severity':{
        'new_value':'other incident severities',
         'top_values':2
    },

}

convert_column_to_binary(safe_driving_df, columns_with_new_values_dict)

In [30]:
plot_value_distributions_in_df(safe_driving_df , ['eventid', 'road_segment_id', 'latitude', 'longitude'])

Let's proceed with data inconsistencies 

In [31]:
safe_driving_df['incident_severity'].value_counts()

In [32]:
def clip_numerical_cols(df,columns):
    """Round every column in ``columns`` to two decimals, in place.

    Despite the name, values are rounded, not clipped.
    """
    for column_name in columns:
        rounded = df[column_name].round(decimals=2)
        df[column_name] = rounded


def clean_numerical_cols(df):
    """Replace every numeric column by its absolute value cast to float, in place.

    A column counts as numeric when its dtype name contains ``float`` or
    ``int``.
    """
    def is_numeric(column_name):
        dtype_name = str(df[column_name].dtype)
        return 'float' in dtype_name or 'int' in dtype_name

    for column_name in list(filter(is_numeric, df.columns)):
        df[column_name] = df[column_name].abs().astype(float)
    

In [33]:
clean_numerical_cols(safe_driving_df)

In [34]:
clip_numerical_cols(safe_driving_df,  ['speed_kmh' , 'end_speed_kmh', 'maxwaarde' ])

There were some cases where initial speed_kmh was 0, we will analyze that



In [35]:
safe_driving_df[safe_driving_df['speed_kmh'] == 0.0]

These cases have reasonable explanation caused by Accelerating therefore these rows will not be removed

In [36]:
safe_driving_df.describe()

Now we will proceed with scaling the data using StandardScaler from scikit-learn.


In [37]:
def scale_numerical_data(df , columns):
    """Standardise each of ``columns`` in place with its own ``StandardScaler``."""
    for column_name in columns:
        df[column_name] = StandardScaler().fit_transform(df[[column_name]])

In [38]:
scale_numerical_data(safe_driving_df , ['duration_seconds', 'speed_kmh', 'end_speed_kmh', 'maxwaarde'])


In [39]:
safe_driving_df.head(10)

Categorical encoding will be left for modelling process therefore no decoding functions will be implemented currently


Lets import weather informations


In [40]:
safe_driving_df = safe_driving_df.sort_values(by=['event_start'])
safe_driving_df = safe_driving_df.iloc[:8000,:]

In [41]:
def import_weather_df(table_name, start = '2018-01-01', max_rows = 10000):
    """Load a weather table indexed and sorted by its ``dtg`` column.

    Args:
        table_name: name of the table, passed to ``load_sql_to_df``.
        start: first index label to keep (inclusive); defaults to the
            previously hard-coded ``'2018-01-01'``.
        max_rows: maximum number of (earliest) rows to keep; defaults to the
            previously hard-coded 10000.

    Returns:
        DataFrame indexed by ``dtg``, sorted ascending, starting at ``start``
        and holding at most ``max_rows`` rows.
    """
    df = load_sql_to_df(table_name)
    # Sort before slicing: label-slicing an unsorted datetime index raises
    # KeyError when the start label itself is not present in the index.
    df = df.set_index('dtg').sort_index()
    df = df.loc[start:, :]
    return df.iloc[:max_rows, :]

In [42]:
wind_df = import_weather_df('wind')

Now we will check the basic data distribution of the ff_sensor_10 column.


In [43]:
wind_df['ff_sensor_10'].value_counts()

In [44]:
sns.boxplot(x = wind_df['ff_sensor_10'])

In [45]:
import numpy as np
import pandas as pd

def calculate_average_value_optimized(accident_event_start, data_df, value_column):
    """Mean of ``value_column`` over the hour ending at ``accident_event_start``.

    Args:
        accident_event_start: end of the window; anything ``pd.Timestamp``
            accepts (Timestamp, datetime, ISO string).
        data_df: DataFrame whose index holds the observation timestamps.
        value_column: name of the column to average.

    Returns:
        The mean rounded to two decimals, or ``np.nan`` when no observation
        falls inside the window (both window bounds are inclusive).
    """
    window_end = pd.Timestamp(accident_event_start)
    window_start = window_end - pd.Timedelta(hours=1)

    # The index is compared directly with Timestamps; the previous conversion
    # to np.datetime64 was unnecessary and failed for string inputs.
    in_window = (data_df.index >= window_start) & (data_df.index <= window_end)
    filtered_values = data_df.loc[in_window, value_column]

    if filtered_values.empty:
        return np.nan
    return round(filtered_values.mean(), 2)

def calculate_weather_statistics_optimized(weather_df, value_column, driving_df_values , new_value_column_name):
    """Compute the trailing-hour weather average for every driving event.

    Args:
        weather_df: weather observations, passed on to
            ``calculate_average_value_optimized``.
        value_column: weather column to average.
        driving_df_values: DataFrame whose first column is the event id and
            whose second column is the event start time.
        new_value_column_name: name of the result column.

    Returns:
        DataFrame with columns ``eventid``, ``dtg`` and
        ``new_value_column_name``, one row per input row (the columns are
        present even when the input is empty).
    """
    # Columns are read by position with .iloc. The former ``row[0]`` / ``row[1]``
    # lookups on label-indexed rows relied on deprecated positional fallback.
    event_ids = driving_df_values.iloc[:, 0]
    event_starts = driving_df_values.iloc[:, 1]

    average_data = [
        {
            'eventid': event_id,
            'dtg': event_start,
            new_value_column_name: calculate_average_value_optimized(event_start, weather_df, value_column),
        }
        for event_id, event_start in zip(event_ids, event_starts)
    ]
    return pd.DataFrame(average_data, columns=['eventid', 'dtg', new_value_column_name])




In [46]:
def merge_driving_with_weather_df(driving_df, weather_df,  old_value_column_name, new_value_column_name , on ='eventid', how='left'):
    """Attach a trailing-hour weather average to every driving event.

    Args:
        driving_df: events; must contain the ``on`` column and ``event_start``.
        weather_df: weather observations.
        old_value_column_name: weather column to average.
        new_value_column_name: name of the column added to the result.
        on: key column used for the merge (previously ignored; ``'eventid'``
            was always used).
        how: merge type (previously ignored; ``'left'`` was always used).

    Returns:
        ``driving_df`` merged with the new average column.
    """
    event_times = driving_df.loc[:, [on, 'event_start']]
    average_weather_df = calculate_weather_statistics_optimized(
        weather_df, old_value_column_name, event_times, new_value_column_name
    )
    # The statistics helper always names its key column 'eventid'.
    average_weather_df = average_weather_df.rename(columns={'eventid': on})

    return pd.merge(
        driving_df,
        average_weather_df.loc[:, [on, new_value_column_name]],
        on=on,
        how=how,
    )

In [47]:
safe_driving_df = merge_driving_with_weather_df(safe_driving_df ,wind_df, 'ff_sensor_10',    'last_hour_wind_avg'   )


In [48]:
safe_driving_df.loc[: , ['event_start', 'last_hour_wind_avg']].describe()

In [49]:
temp_df = import_weather_df('temperature')


In [50]:
temp_df.describe()

In [51]:

safe_driving_df = merge_driving_with_weather_df(safe_driving_df ,temp_df, 't_dryb_10',    'last_hour_temp_avg' )


In [52]:
prec_df = import_weather_df('precipitation')



In [53]:

safe_driving_df = merge_driving_with_weather_df(safe_driving_df ,prec_df, 'ri_pws_10',    'last_hour_rain_avg' )


In [54]:
safe_driving_df.head(10)

In [55]:
def scale_numerical_data(df , columns):
    """Standardise each of ``columns`` in place, fitting one scaler per column.

    This cell redefines the function of the same name defined earlier in the
    notebook with the same behaviour.
    """
    for column_name in columns:
        column_scaler = StandardScaler()
        single_column_frame = df[[column_name]]
        df[column_name] = column_scaler.fit_transform(single_column_frame)
        

In [56]:
scale_numerical_data(safe_driving_df , ['last_hour_wind_avg', 'last_hour_temp_avg', 'last_hour_rain_avg'])


In [57]:
safe_driving_df.head(10)

In [58]:
safe_driving_df.shape

### Import the dataset 

In [59]:
accidents_17_23_df = load_sql_to_df('accident_data_17_23')

### Manage columns and transform them


In [60]:
def transform_numerical_column_to_str(df,columns):
    """Convert ``columns`` of ``df`` to strings, in place.

    Args:
        df: DataFrame to modify.
        columns: list of column names to convert.
    """
    # Plain column assignment replaces the columns (and their dtype).
    # ``df.loc[:, columns] = ...`` instead tries to write the strings into the
    # existing numeric columns, which newer pandas versions warn about or reject.
    df[columns] = df[columns].astype(str)
    

### We will transform Year column to categorical column in order to make it easier to plot for now

In [61]:
transform_numerical_column_to_str( accidents_17_23_df , ['Year'])

### We will ensure that categorical values do not contain unnecessary whitespace or other unexpected special characters; moreover, we will unify the value casing to lower case


In [62]:
accidents_17_23_df = clean_categorical_data(accidents_17_23_df)

### We will ensure that all unknown values are converted as nan values

In [63]:
def convert_unknown_to_nan(df):
    """Replace every ``'unknown'`` value in ``df`` with NaN, in place."""
    replacements = {'unknown': np.nan}
    df.replace(to_replace=replacements, inplace=True)

In [64]:
 convert_unknown_to_nan(accidents_17_23_df)

### Some columns contain empty '' values, which we will convert to NaN values



In [65]:
def convert_empty_values_to_nan(df,columns):
    """Replace empty strings with NaN in each of ``columns``, in place."""
    empty_to_nan = {'': np.nan}
    for column_name in columns:
        df[column_name] = df[column_name].replace(empty_to_nan)

In [66]:
convert_empty_values_to_nan(accidents_17_23_df , ['first_mode_of_transport'])

### We will ensure that rows with insignificant values are dropped

In [67]:
def drop_rows_with_drop_values(df, col  , drop_values):
    """Drop, in place, the rows of ``df`` whose ``col`` value is in ``drop_values``.

    Values are compared by their string form (``str(value) in drop_values``).
    Nothing happens when ``drop_values`` is empty.

    Args:
        df: DataFrame to modify in place.
        col: column to inspect.
        drop_values: collection of string values marking rows to remove.
    """
    if not drop_values:
        return

    mask = df[col].apply(lambda value: str(value) in drop_values)
    # The leftover debug prints of the mask and of the dropped index are gone.
    idxs_to_drop = df.index[mask.astype(bool).to_numpy()]
    df.drop(index=idxs_to_drop, inplace=True)

In [68]:


def convert_string_column_to_numerical(df,col ,  drop_values = None):
    """Turn a text column such as ``'50 kmh'`` into floats, in place.

    Rows whose value is listed in ``drop_values`` are removed first (via
    ``drop_rows_with_drop_values``). Every remaining non-missing value is
    replaced by the float parsed from the text before its first space; missing
    values are left as they are.

    Args:
        df: DataFrame to modify in place.
        col: column to convert.
        drop_values: string values whose rows are dropped beforehand. Defaults
            to none (``None`` replaces the former mutable ``[]`` default).

    Raises:
        ValueError: if a remaining value does not start with a number.
    """
    drop_rows_with_drop_values(df, col, [] if drop_values is None else drop_values)

    def leading_number(value):
        # First space-separated token, e.g. '50' from '50 kmh'.
        return float(str(value).split(' ')[0])

    df[col] = df[col].apply(lambda value: value if pd.isnull(value) else leading_number(value))



 
    

### Delete footpace homezone value from speed_limit column because it only occurs 6 times in whole df

In [69]:
convert_string_column_to_numerical(accidents_17_23_df, 'speed_limit' , drop_values = ['footpace  homezone']) 

### Drop municipality column because it has only "breda" value

In [70]:
drop_columns_in_df(accidents_17_23_df , ['municipality'  ])

### Show the columns with missing values

In [71]:
def show_columns_with_missing_values(df):
    """Print a missing-value report for every column of ``df`` that has any.

    For each such column the count and percentage of missing values are
    printed, followed by the value counts of that column.
    """
    total_rows = df.shape[0]

    for col in df.columns:
        missing_vals_in_col = df[col].isna().sum()
        if not missing_vals_in_col > 0:
            continue

        nan_perc = round((missing_vals_in_col / total_rows) * 100, 2)
        print(f'Col: {col} has {missing_vals_in_col} missing values')
        print(f'Percentage of missing values / all values in column: {nan_perc } %')
        show_dataframe_column_value_counts(df[[col]])
        print_line_break()
            
            
    

In [72]:
show_columns_with_missing_values(accidents_17_23_df)

### Since every column has more than 15% missing values, the missing values constitute a significant amount of important information; however, because I am not yet sure how the datasets will be merged, I will leave the missing-data imputation steps for later


In [73]:
show_general_duplicate_values(accidents_17_23_df)

### The brief analysis of dataset indicates no explicit unique identifier for each event in dataframe. Moreover, the amount of whole rows  duplicated is 0  therefore it is safe to assume that there are not any duplicates  


### Now I will proceed to examine distribution

In [74]:
plot_value_distributions_in_df(accidents_17_23_df , columns_to_avoid = [])

### Due to unequal distribution of certain columns let's convert them into binary column type

In [75]:
columns_with_new_values_dict = {
    'accident_severity':{
        'new_value':'injury or fatal',
         'top_values':1
    },
    'town':{
        'new_value':'other city',
         'top_values':1
    },

    'first_mode_of_transport':{
        'new_value':'other',
         'top_values':1
    },

   'second_mode_of_transport':{
        'new_value':'other',
         'top_values':2
    },

    'light_condition':{
        'new_value':'darkness or twilight',
         'top_values':1
    },
    'road_condition':{
        'new_value': 'wetdamp or snowblack ice',
         'top_values':1
    },
    'road_situation':{
        'new_value': 'other road situation',
         'top_values':4
    },
    'weather':{
        'new_value': 'other weather situation',
         'top_values':2
    },
    
    
    

    
}



        
        

        

In [76]:
convert_column_to_binary(accidents_17_23_df ,columns_with_new_values_dict )






Let's eliminate outliers from speed_limit by removing accidents on roads with a very high or very low speed limit



In [77]:
def show_dist_for_cols(df , cols , boxplot = False):
    """Plot each of ``cols`` on its own row: box plots if ``boxplot``, else count plots."""
    fig , axes = plt.subplots( nrows = len(cols) ,ncols = 1 , figsize = (20 ,15))
    # A single subplot comes back as a bare Axes; wrap it so it can be iterated.
    axes_array = np.array([axes]) if len(cols) == 1 else axes
    draw = sns.boxplot if boxplot else sns.countplot

    for column_name, ax in zip(cols, axes_array.flatten()):
        draw(x = column_name, ax = ax, data = df)

    plt.show()

In [78]:
show_dist_for_cols(accidents_17_23_df , ['speed_limit' , 'accidents'] , True)

In [79]:
accidents_17_23_df = delete_outliers(accidents_17_23_df , ['speed_limit' , 'accidents'] )



In [80]:
show_dist_for_cols(accidents_17_23_df , ['speed_limit' , 'accidents'])

### Let's again present data distributions after transformations

In [81]:
plot_value_distributions_in_df(accidents_17_23_df , columns_to_avoid = [])

### Let's proceed with data inconsistencies

In [82]:
clean_numerical_cols(accidents_17_23_df) 

In [83]:
accidents_17_23_df.head(10)

### Let's see value counts of accidents



In [84]:
accidents_17_23_df['accidents'].value_counts()

### Let's scale numerical values 


In [85]:
scale_numerical_data(accidents_17_23_df , accidents_17_23_df.select_dtypes(include=['float' ,'int']))


In [86]:
accidents_17_23_df.describe()

## Since the accidents column now has std = 0 and contains only the value 0 after the transformations, it no longer carries meaningful information, therefore let's drop the accidents column


In [87]:
drop_columns_in_df(accidents_17_23_df , ['accidents'])



In [88]:
accidents_17_23_df.describe()

### Categorical encoding will be left for modelling process therefore no decoding functions will be implemented currently

## Now the data needs to be analyzed

### Let's make a weighted mean of accident_severity table to make it a new column for safe_driving_df
So in our case the weighted mean of types of accidents severity will help us to assess if the street is high or low risk

In [89]:
def transform_acc_sev_col_to_encoding(df):
    """Return a copy of ``df`` extended with float dummy columns of ``accident_severity``.

    One column is added per distinct severity value; the input is not modified.
    """
    severity_indicators = pd.get_dummies(df['accident_severity'], dtype=float)
    return df.copy().join(severity_indicators)
  

    
def w_avg(row , weights):
    """Weighted mean of a row's two severity sums.

    ``weights`` is ``(weight for injury_or_fatal_sum, weight for
    material_damage_only_sum)``.
    """
    injury_weight, damage_weight = weights
    weighted_total = (
        row['injury_or_fatal_sum'] * injury_weight
        + row['material_damage_only_sum'] * damage_weight
    )
    return weighted_total / (injury_weight + damage_weight)

def calc_weighted_mean_of_acc_severity(df, weights = (2, 1)):
    """Per-street weighted mean of accident-severity counts.

    The severity column is one-hot encoded, the ``'injury or fatal'`` and
    ``'material damage only'`` indicators are summed per street, and the two
    sums are combined as ``(w1 * injury + w2 * damage) / (w1 + w2)``.

    Args:
        df: accidents DataFrame with ``street`` and ``accident_severity``
            columns whose severities are exactly the two values above.
        weights: ``(w1, w2)``; defaults to the previously hard-coded ``(2, 1)``.

    Returns:
        DataFrame with columns ``street``, ``injury_or_fatal_sum``,
        ``material_damage_only_sum`` and ``weighted_avg``. A summary of
        ``weighted_avg`` is printed as a side effect.
    """
    injury_weight, damage_weight = weights
    # The encoder already returns a copy, so no extra copy is needed here.
    encoded = transform_acc_sev_col_to_encoding(df)

    new_df = (
        encoded.groupby(['street'])
        .agg(
            injury_or_fatal_sum=('injury or fatal', 'sum'),
            material_damage_only_sum=('material damage only', 'sum'),
        )
        .reset_index()
    )

    # Column arithmetic instead of a row-wise apply.
    new_df['weighted_avg'] = (
        new_df['injury_or_fatal_sum'] * injury_weight
        + new_df['material_damage_only_sum'] * damage_weight
    ) / (injury_weight + damage_weight)

    print(new_df['weighted_avg'].describe())
    return new_df


In [90]:
streets_with_accidents_ratio_df = calc_weighted_mean_of_acc_severity(accidents_17_23_df)

In [91]:
streets_with_accidents_ratio_df.head(10)

### Let's merge safe_driving_df with the streets-with-accidents ratio

In [92]:
safe_driving_with_accidents_df = safe_driving_df.copy().merge(streets_with_accidents_ratio_df , how='left' , left_on='road_name', right_on='street')

### This will allow to create Y variable labeling for our dataset

In [93]:
plot_value_distributions_in_df(safe_driving_with_accidents_df[['weighted_avg']] , columns_to_avoid = [])

In [94]:
safe_driving_with_accidents_df[['weighted_avg']].describe()

In [95]:
# Compute the threshold once; it used to be recomputed for every row inside the lambda.
weighted_avg_mean = safe_driving_with_accidents_df['weighted_avg'].mean()
# NOTE(review): rows whose weighted_avg is NaN (no matching street in the left
# merge) fail the `<` comparison and are therefore labelled 'high-risk'.
# Confirm this is intended before modelling.
safe_driving_with_accidents_df['y_var'] = safe_driving_with_accidents_df['weighted_avg'].apply(lambda row: 'low-risk' if row < weighted_avg_mean else 'high-risk')

### Let's drop columns which are not important after the merge

In [96]:
drop_columns_in_df(safe_driving_with_accidents_df , ['street'])


In [97]:
safe_driving_with_accidents_df.head(10)

### This is the dataframe for modelling:


In [98]:
safe_driving_with_accidents_df.head(10)

# Exploratory Data Analysis (EDA)

In [99]:
# Show general information about the dataframe
def show_dataframe_general_info(df):
    """Print ``df.info()`` and ``df.describe()``.

    Note: this redefines the function of the same name defined earlier in the
    notebook; unlike that one it does not run the missing-value check.
    """
    print("General info of df")
    # info() prints its report itself and returns None; wrapping it in print()
    # added a stray "None" line.
    df.info()
    print("Description of df")
    print(df.describe())

show_dataframe_general_info(safe_driving_df)

In [100]:
# Check for missing values
def check_df_missing_values(df):
    """Print the total number of missing values and, if any, the per-column counts.

    Note: this redefines the function of the same name defined earlier in the
    notebook, printing instead of using ``display``.
    """
    missing_per_column = df.isna().sum()
    total_missing_values = missing_per_column.sum()
    print(f"Total number of missing values: {total_missing_values}")

    if total_missing_values <= 0:
        return

    print("Number of missing values in particular columns:")
    print(missing_per_column)

check_df_missing_values(safe_driving_df)

In [101]:
# Define necessary numerical features
important_numerical_features = [
    'duration_seconds', 'latitude', 'longitude', 
    'speed_kmh', 'end_speed_kmh', 'maxwaarde', 
    'last_hour_wind_avg', 'last_hour_temp_avg', 'last_hour_rain_avg'
]

In [102]:
# Temporal analysis
safe_driving_df['event_start'] = pd.to_datetime(safe_driving_df['event_start'])
safe_driving_df['event_end'] = pd.to_datetime(safe_driving_df['event_end'])

plt.figure(figsize=(12, 6))
safe_driving_df['hour'] = safe_driving_df['event_start'].dt.hour
sns.countplot(x='hour', data=safe_driving_df)
plt.title('Incidents by Hour of Day')
plt.show()

In [103]:
# Plot value distributions
def plot_value_distributions(df, columns = None):
    """Plot a histogram (with KDE) and a box plot side by side for each column.

    Args:
        df: DataFrame holding the data.
        columns: column names to plot. Defaults to the notebook-level
            ``important_numerical_features`` list, which is what was always
            used before.
    """
    if columns is None:
        columns = important_numerical_features

    for col in columns:
        fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 6))
        sns.histplot(df[col], kde=True, ax=hist_ax)
        hist_ax.set_title(f'Distribution of {col}')
        sns.boxplot(x=df[col], ax=box_ax)
        box_ax.set_title(f'Box plot of {col}')
        plt.show()
        # Release the figure so that looping over many columns does not pile up memory.
        plt.close(fig)

plot_value_distributions(safe_driving_df)

In [104]:
# Correlation heatmap
plt.figure(figsize=(16, 10))
correlation_matrix = safe_driving_df[important_numerical_features].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

In [105]:
# Pair plots for important numerical features
sns.pairplot(safe_driving_df[important_numerical_features])
plt.show()

In [106]:
# Distribution of all categorical features

# One count plot per object-typed column; road_name is limited to its 20 most
# frequent values.
categorical_cols = safe_driving_df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    plt.figure(figsize=(12, 6))
    if col != 'road_name':
        values_to_plot = safe_driving_df[col]
        chart_title = f'Distribution of {col}'
    else:
        top_20_roads = safe_driving_df['road_name'].value_counts().nlargest(20).index
        in_top_20 = safe_driving_df['road_name'].isin(top_20_roads)
        values_to_plot = safe_driving_df[in_top_20]['road_name']
        chart_title = 'Distribution of Top 20 Road Names'
    sns.countplot(x=values_to_plot)
    plt.title(chart_title)
    plt.xticks(rotation=90)
    plt.show()

In [107]:
# Box plots for numerical features across all categorical features
categorical_cols = safe_driving_df.select_dtypes(include=['object']).columns
numeric_cols_present = [
    num_col for num_col in important_numerical_features if num_col in safe_driving_df.columns
]

for cat_col in categorical_cols:
    # Choose the rows to plot once per categorical column; the top-20 lookup
    # used to be recomputed for every numeric column.
    if cat_col == 'road_name':
        top_20_roads = safe_driving_df['road_name'].value_counts().nlargest(20).index
        plot_data = safe_driving_df[safe_driving_df['road_name'].isin(top_20_roads)]
        title_scope = f'Top 20 {cat_col}'
    else:
        plot_data = safe_driving_df
        title_scope = cat_col

    for num_col in numeric_cols_present:
        fig, ax = plt.subplots(figsize=(12, 6))
        # x and y come from the same (filtered) frame, instead of a filtered x
        # series paired with an unfiltered y series.
        sns.boxplot(x=cat_col, y=num_col, data=plot_data, ax=ax)
        ax.set_title(f'{num_col} distribution across {title_scope}')
        ax.tick_params(axis='x', rotation=90)
        plt.show()
        # Many figures are produced here; close each one after showing it.
        plt.close(fig)