# 1.  Imports and CSV reading with Pandas

In [63]:
import pandas as pd
import numpy as np
import re
# Read the CSV file (path is relative to the notebook's working directory) into a DataFrame.
dataframe = pd.read_csv(filepath_or_buffer="../data/indian_accident.csv")
# A bare expression on the last line of the cell makes Jupyter render the frame.
dataframe

Unnamed: 0,Million Plus Cities,Cause category,Cause Subcategory,Outcome of Incident,Count
0,Agra,Traffic Control,Flashing Signal/Blinker,Greviously Injured,0.0
1,Agra,Traffic Control,Flashing Signal/Blinker,Minor Injury,0.0
2,Agra,Traffic Control,Flashing Signal/Blinker,Persons Killed,0.0
3,Agra,Traffic Control,Flashing Signal/Blinker,Total Injured,0.0
4,Agra,Traffic Control,Flashing Signal/Blinker,Total number of Accidents,0.0
...,...,...,...,...,...
9545,Vizaq,Weather,Sunny/Clear,Greviously Injured,561.0
9546,Vizaq,Weather,Sunny/Clear,Minor Injury,252.0
9547,Vizaq,Weather,Sunny/Clear,Persons Killed,176.0
9548,Vizaq,Weather,Sunny/Clear,Total number of Accidents,1207.0


# 2. Define Functions

In [104]:
def func1(x: pd.DataFrame, column_name: str):
    '''
    Drop every row whose value in the given column is null.

    :param x (pd.DataFrame): Tabular data as a Pandas DataFrame
    :param column_name (str): Name of the column to inspect for null values

    :return: output pd.DataFrame without the rows that are null in column_name
    '''

    # Restricting dropna to a one-column subset removes exactly the rows that
    # are null in that column; null values in other columns are ignored.
    # Dropping rows (axis 0) whenever any checked value is null is dropna's
    # default behaviour, so neither option has to be spelled out.
    columns_to_check = [column_name]
    return x.dropna(subset=columns_to_check)

def clean_function(string: str) -> str:
    '''
    Remove every parenthesised group (the parentheses and all characters
    between them) from a string, then strip leading/trailing whitespace.

    Nested groups such as "A (b (c) d) e" are removed completely. A
    parenthesis that has no partner is deleted on its own, while the text
    next to it is kept.

    :param string (str): string to clean

    :return: cleaned string
    '''

    # [^()]* can only match an innermost group, so repeating the substitution
    # peels nested groups from the inside out until none is left.
    innermost_group = re.compile(r'\([^()]*\)')
    cleaned, replaced = innermost_group.subn('', string)
    while replaced:
        cleaned, replaced = innermost_group.subn('', cleaned)

    # Any parenthesis still present is unbalanced: drop the character only.
    return cleaned.replace('(', '').replace(')', '').strip()
    
def func2(x: pd.DataFrame) -> pd.DataFrame:
    '''
    Select and return only the rows in which, if Cause category is equal to Traffic Control, then Cause Subcategory is equal to Others or Police Controlled.
    Rows whose Cause category is anything other than Traffic Control satisfy that condition trivially and are therefore kept.
    Moreover, the Million Plus Cities column of the selected rows is cleaned with clean_function, removing all characters that appear between parentheses
    as well as the parentheses themselves.

    :param x (pd.DataFrame): Tabular data as a Pandas DataFrame; must contain the columns
                             'Cause category', 'Cause Subcategory' and 'Million Plus Cities'

    :return: output pd.DataFrame (a new frame, x itself is not modified)
    '''

    # "if A then B" is logically "(not A) or B": a row is rejected only when it is a
    # Traffic Control row whose subcategory is not one of the two allowed values.
    not_traffic_control = x['Cause category'] != 'Traffic Control'
    # isin checks, element by element, whether the value is one of the listed values
    allowed_subcategory = x['Cause Subcategory'].isin(['Others', 'Police Controlled'])

    # .copy() yields an independent frame, so the column assignment below does not
    # trigger SettingWithCopyWarning and never writes into the caller's DataFrame.
    filtered_data = x.loc[not_traffic_control | allowed_subcategory].copy()

    # Clean the city names of the selected rows only
    filtered_data['Million Plus Cities'] = filtered_data['Million Plus Cities'].map(clean_function)

    return filtered_data

# 3. Func1 Test

In [18]:
# First, count the null values in each column to see which columns contain any and how many
print(dataframe.isna().sum())

Million Plus Cities    0
Cause category         0
Cause Subcategory      0
Outcome of Incident    0
Count                  3
dtype: int64


In [24]:
# Next, check that applying func1 filters out the rows whose value in the chosen column is null
column_name = 'Count'
print('Before func1:', dataframe[column_name].isna().sum())
filtered_f1 = func1(x=dataframe, column_name=column_name)
print('After func1:', filtered_f1[column_name].isna().sum())

Before func1: 3
After func1: 0


In [36]:
# To confirm that the null rows have been removed, print the index labels of the rows holding
# any null value in the original DataFrame and compare them with those of the filtered DataFrame.
print(dataframe.loc[dataframe.isna().any(axis='columns')].index)
print(filtered_f1.loc[filtered_f1.isna().any(axis='columns')].index)

Int64Index([7056, 7057, 7058], dtype='int64')
Int64Index([], dtype='int64')


# 4. Func2 Test

In [103]:
# Apply the second function (func2) to the dataframe and keep its result for the checks below
filtered_f2 = func2(dataframe)

In [105]:
# Count how many values of Million Plus Cities in the original dataframe contain the '(' character
dataframe['Million Plus Cities'].map(lambda city: '(' in city).sum()

191

In [106]:
# After func2 has been applied, the number of Million Plus Cities values containing '(' is expected to be 0
filtered_f2['Million Plus Cities'].map(lambda city: '(' in city).sum()

0