## **Imports**

In [117]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, f1_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import os 
from google.colab import drive
from imblearn.over_sampling import SMOTE
from datetime import datetime

## **Helper functions**

In [127]:
def reduce_memory_usage(df):
    """
        This function reduces the memory usage of the dataframes 
    """
  
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [128]:
def outliers_handeling(df):

  numerical_columns = [col for col in df.columns if df[col].dtype != 'object']

  for col in numerical_columns:
    percentiles = df[col].quantile([0.05,0.95]).values
    df[col][df[col] <= percentiles[0]] = percentiles[0]
    df[col][df[col] >= percentiles[1]] = percentiles[1]

  return df

In [129]:
def normalize_data(df, train=True):
  numerical_columns = [col for col in df.columns if df[col].dtype != 'object']
  if train == True:
    for col in numerical_columns:
      df[col] = norm_scale.fit_transform(np.array(df[col]).reshape(-1,1))
    return df
  else: 
    for col in numerical_columns:
      df[col] = norm_scale.transform(np.array(df[col]).reshape(-1,1))
    return df

In [130]:
def standarize_data(df, train=True):
  numerical_columns = [col for col in df.columns if df[col].dtype != 'object']
  if train == True:
    for col in numerical_columns:
      df[col] = stand_scale.fit_transform(np.array(df[col]).reshape(-1,1))
    return df
  else : 
    for col in numerical_columns:
      df[col] = stand_scale.transform(np.array(df[col]).reshape(-1,1))
    return df

In [131]:
def label_encoding(df,column):
    df[column] = LabelEncoder().fit_transform(df[column])
    return df

In [132]:
def categorical_encoding(df): 
  categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
  for col in categorical_columns:
    cagegorized_age_columns = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df, cagegorized_age_columns], axis=1)
    df = df.drop([col], axis=1)
  return df

In [133]:
def time_convert(col):
  data = {"Hours": [], "Minuts": []}
  for row in df['Time (24hr)']:
    if row < 1000:
      row = str(row)
      data['Hours'].append(row[0])
      data['Minuts'].append(row[1:-2])
    else:
      row = str(row)
      data['Hours'].append(row[0:2])
      data['Minuts'].append(row[2:-2])
  data['Hours']= pd.to_numeric(data['Hours'])
  data['Minuts']= pd.to_numeric(data['Minuts'])
  return pd.DataFrame(data)

In [134]:
def handeling_columns(df):
  df.dropna(subset=["Weather Conditions"], inplace= True)
  df.at[2526,'Road Surface']='Dry'
  df['Type of Vehicle'] = df['Type of Vehicle'].replace(['Other Vehicle'], 'Car')
  df.drop(columns= "Time (24hr)", inplace= True)

In [135]:
def date_handelling(df):
  for idx,row in enumerate(df['Accident Date']):
    df['Accident Date'][idx]=row.strftime("%Y-%B-%d")
  return df

## **Reading the data**

In [136]:
df = pd.read_excel("/content/Road Accidents.xlsx", na_values=["Other","Unknown"])
df_time =df['Accident Date']
df.drop(columns= "Accident Date", inplace=True)

## **Applying helper functions**

In [137]:
df = reduce_memory_usage(df)

Memory usage of dataframe is 0.28 MB
Memory usage after optimization is: 0.20 MB
Decreased by 26.8%


In [138]:
df = pd.concat([df, df_time], axis= 1)

In [139]:
df = outliers_handeling(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df[col] <= percentiles[0]] = percentiles[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df[col] >= percentiles[1]] = percentiles[1]


In [140]:
df = label_encoding(df, "Casualty Severity")

In [141]:
time = time_convert(df)
df = pd.concat([df,time],axis= 1)

In [142]:
handeling_columns(df)

In [143]:
df= categorical_encoding(df)

In [144]:
df= date_handelling(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Accident Date'][idx]=row.strftime("%Y-%B-%d")


# **EDA** temp

In [70]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [145]:
df.to_csv('/content/gdrive/MyDrive/the projects/Acc/CleanACC.csv', index= False)