# Data Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the KSI dataset from 'KSI.csv' (relative path, resolved against the working directory).
df = pd.read_csv('KSI.csv')

In [None]:
# Show the dataset dimensions as (rows, columns).
print(df.shape)

In [None]:
# Preview the first rows of the dataset (DataFrame.head defaults to 5 rows).
print(df.head())

In [None]:
# Print the column names with their non-null counts and dtypes.
df.info()

In [None]:
# Print summary statistics (count, mean, std, min, quartiles, max) of the columns.
print(df.describe()) 

In [None]:
# Count the missing values in each column.
print(df.isnull().sum())

In [None]:
# Plot a histogram of every numeric column to inspect the distributions.
df.hist(bins=50, figsize=(20, 20))
plt.show()

# Plot the correlation matrix as an annotated heatmap.
# numeric_only=True is passed explicitly: the frame also holds text columns,
# and since pandas 2.0 DataFrame.corr() no longer drops them silently, so the
# call would fail while trying to convert strings to floats.
plt.figure(figsize=(20, 20))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='RdYlGn')
plt.show()

In [None]:
# TODO: double check against the requirements whether these columns are
# needed to train the model.
# The columns are picked by position: indices 38 through 50 of the frame.
bool_cols = df.columns[np.arange(38, 51)].tolist()

# A missing entry in these boolean columns is recorded as 'No'.
df[bool_cols] = df[bool_cols].fillna(value='No')

In [None]:
# Remove every column in which more than 80% of the values are missing.
# Per-column share of missing values, expressed in percent.
missing_percentages = df.isna().sum().div(len(df)).mul(100)
columns_to_drop = missing_percentages.loc[lambda pct: pct > 80].index
df = df.drop(columns_to_drop, axis=1)

In [None]:
# According to the correlation matrix, 'X' is highly correlated with
# 'LONGITUDE' and 'Y' with 'LATITUDE', so one column of each pair is
# redundant; 'X' and 'Y' are the ones removed.
columns_to_drop = ['X', 'Y']
df = df.drop(labels=columns_to_drop, axis='columns')

In [None]:
# Remove columns that are not expected to be useful for the analysis.
# NOTE: whether 'INJURY' stays in this list depends on the accuracy score of
# the model.
columns_to_drop = [
    'ObjectId',
    'INDEX_',
    'ACCNUM',
    'INITDIR',
    'STREET1',
    'STREET2',
    'DISTRICT',
    'WARDNUM',
    'DIVISION',
    'HOOD_158',
    'HOOD_140',
    'INJURY',
]
df = df.drop(labels=columns_to_drop, axis='columns')

In [None]:
# The 'NEIGHBOURHOOD_140' values contain a number inside brackets. Remove the
# bracketed part, then strip the whitespace before and after the name.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would leave every value
# unchanged.
df['NEIGHBOURHOOD_140'] = (
    df['NEIGHBOURHOOD_140']
    .str.replace(r"\(.*\)", "", regex=True)
    .str.strip()
)
df['NEIGHBOURHOOD_140']

In [None]:
# Fold the 'Property Damage Only' class of 'ACCLASS' into 'Non-Fatal'.
# The text is matched literally, not as a regular expression.
df['ACCLASS'] = df['ACCLASS'].str.replace(
    pat="Property Damage Only",
    repl="Non-Fatal",
    regex=False,
)

## Latest Change - Pham

In [None]:
# Drop the rows that have a missing value in any categorical (object) column
# whose share of missing values is at most 3%. The author's rationale: so few
# rows are affected that removing them should not affect model accuracy.
cat_cols = df.select_dtypes(include='object')
missing_percentages = cat_cols.isnull().sum() / len(df) * 100
cat_col_val_drop = missing_percentages[missing_percentages <= 3].index
df = df.dropna(subset=cat_col_val_drop)
# Display the missing-value counts that remain after the row drop.
df.isnull().sum()

In [None]:
# Derive 'AM_PM' from 'TIME': values below 1200 become 'AM', everything else
# 'PM' (presumably 'TIME' is an HHMM number -- confirm against the data).
# It is used to compare the share of accidents in each half of the day.
#! Note: exploration-only feature; it is not used for training the model.
df['AM_PM'] = ['AM' if clock < 1200 else 'PM' for clock in df['TIME']]

In [None]:
# TODO: Extract 'YEAR', 'MONTH', 'DAY' and 'DAY_OF_WEEK' from 'DATE' (not implemented yet).

In [None]:
# Replace null values with NaN.
df = df.fillna(np.nan)