### Sourcing and Extracting the data

In [1]:
import pandas as pd

In [4]:
def extract_data(filepath: object) -> "pd.DataFrame | None":
    """
    Load CSV data into a pandas dataframe.

        input; filepath; str, file path to CSV data (anything accepted by
               pd.read_csv, e.g. a path or a file-like object)
        output; pandas dataframe, extracted from CSV data, or None when the
                file could not be read (the reason is printed)
    """
    try:
        # Return straight from the try body: the frame only exists when the
        # read succeeded, so there is no unbound local on the failure paths.
        return pd.read_csv(filepath)
    except FileNotFoundError:
        print("File Not Found;")
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
    except Exception as e:
        # Last-resort boundary: report any other read/parse failure instead of
        # propagating it, matching the best-effort style of the branches above.
        print(f"Error: {e}")

    # Reached only after a handled failure; signal it explicitly to the caller.
    return None

In [5]:
# Load the traffic-crash CSV and print the frame's shape as a quick sanity
# check that the extraction produced data.
crashes_csv_path = "../datasets/traffic_crashes.csv"
df_crashes = extract_data(crashes_csv_path)
crashes_shape = df_crashes.shape
print("DF Crashes: ", crashes_shape)

DF Crashes:  (1000, 49)


### Transformation and data cleansing

In [6]:
def transform_data(df: object) -> object:
    """
    Clean an extracted dataframe and return the result as a new dataframe.

    input: df; pandas dataframe, extracted data
    output: df; pandas dataframe, transformed data

    Steps, in order:
      1. drop fully duplicated rows;
      2. fill missing values in numeric columns with that column's mean
         (non-numeric columns have no mean and are left as they are);
      3. best-effort conversion of CRASH_DATE to datetime and of
         POSTED_SPEED_LIMIT to int32. A column that is absent or cannot be
         converted is left unchanged and the reason is printed.
    """
    # drop_duplicates returns a new frame, so the caller's dataframe is not
    # modified by anything below.
    df = df.drop_duplicates()

    # numeric_only=True restricts the mean to numeric columns; without it,
    # recent pandas versions raise TypeError as soon as the frame contains a
    # text column. Reassigning (instead of inplace=True) avoids mutating an
    # object derived from another frame.
    df = df.fillna(df.mean(numeric_only=True))

    # Some columns need to be converted to appropriate data types.
    # NOTE(review): "%m%d%Y" only matches values like "01152023" (no
    # separators, no time part) - confirm against the actual CSV contents.
    try:
        df['CRASH_DATE'] = pd.to_datetime(df['CRASH_DATE'], format="%m%d%Y")
    except (KeyError, TypeError, ValueError, OverflowError) as e:
        # Keep going with the original column, but say why it was skipped
        # rather than hiding the failure.
        print(f"Error: CRASH_DATE not converted to datetime: {e!r}")
    try:
        df['POSTED_SPEED_LIMIT'] = df['POSTED_SPEED_LIMIT'].astype('int32')
    except (KeyError, TypeError, ValueError, OverflowError) as e:
        print(f"Error: POSTED_SPEED_LIMIT not converted to int32: {e!r}")

    return df