# Notebook to Preprocess the Location & Basic Variables

This notebook is used as a module to preprocess the basic and location-related variables of the accident data. Specifically, it processes the following variables in the dataset:

1. Basic: 'ID', 'Severity', 'Start_Time', 'End_Time', 'Distance(mi)', 'Description'
2. Location: 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Number', 'Street', 'Side', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone', 'Airport_Code'

In [1]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import collections

from skimage import io
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from numpy.linalg import inv
from numpy.linalg import det
from numpy import linalg as LA
from scipy.spatial import distance
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from matplotlib.colors import ListedColormap
from sklearn.metrics import confusion_matrix

In [6]:
# Columns handled by this notebook, grouped into basic and location variables.
basic_columns = ['ID', 'Severity', 'Start_Time', 'End_Time', 'Distance(mi)', 'Description']
location_columns = [
    'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng',
    'Number', 'Street', 'Side', 'City', 'County',
    'State', 'Zipcode', 'Country', 'Timezone', 'Airport_Code',
]
basic_location = basic_columns + location_columns

# Load the raw accident data and keep only the basic and location columns.
accident_df = pd.read_csv('../data/raw/accident_data.csv')
accident_bl_df = accident_df[basic_location]

In [7]:
# Treat empty or whitespace-only cells as missing values.
blank_pattern = r'^\s*$'
accident_bl_df = accident_bl_df.replace(to_replace=blank_pattern, value=np.nan, regex=True)

# Percentage of missing values in each column.
missing_pct = 100 * accident_bl_df.isna().sum() / len(accident_bl_df)
missing_pct


ID               0.000000
Severity         0.000000
Start_Time       0.000000
End_Time         0.000000
Distance(mi)     0.000000
Description      0.000000
Start_Lat        0.000000
Start_Lng        0.000000
End_Lat          0.000000
End_Lng          0.000000
Number          70.764451
Street           0.000000
Side             0.000000
City             0.000000
County           0.000000
State            0.000000
Zipcode          0.000000
Country          0.000000
Timezone         0.000000
Airport_Code     0.000000
dtype: float64

In [8]:
# Columns that are not needed for the analysis:
# 1. ID: It is unique for every row and hence carries no information.
# 2. Country: The analysis is restricted to the US.
# 3. State: It is determined by the city for the cities that we have chosen.
# 4. Number: This column has too many missing values.
# 5. Zipcode: We already have enough location information.
# 6. Airport_Code: This carries the code for the weather station, not really needed.
columns_to_drop = ['ID', 'Country', 'State', 'Number', 'Zipcode', 'Airport_Code']

# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError. On a first run all six
# columns are guaranteed to exist because the loading cell selects them by name.
accident_bl_df = accident_bl_df.drop(columns=columns_to_drop, errors='ignore')

# Convert start and end time to datetime type. This would help in the feature
# extraction step. errors='coerce' turns values that cannot be parsed into NaT.
for time_column in ('Start_Time', 'End_Time'):
    accident_bl_df[time_column] = pd.to_datetime(accident_bl_df[time_column], errors='coerce')

In [9]:
# Show the frame after the column drop and datetime conversion; the rendered
# output below elides the middle rows.
accident_bl_df

Unnamed: 0,Severity,Start_Time,End_Time,Distance(mi),Description,Start_Lat,Start_Lng,End_Lat,End_Lng,Street,Side,City,County,Timezone
0,2,2016-03-22 19:36:00,2016-03-23 01:36:00,0.000,At Avenue 43 - Accident.,34.092560,-118.206220,34.092560,-118.206220,CA-110 N,R,Los Angeles,Los Angeles,US/Pacific
1,2,2016-03-22 20:59:00,2016-03-23 02:59:00,0.099,At Century Blvd - Accident.,33.948190,-118.279730,33.946760,-118.279750,I-110 S,R,Los Angeles,Los Angeles,US/Pacific
2,3,2016-03-23 07:59:00,2016-03-23 13:59:00,0.136,At Whittier Blvd/Olympic Blvd - Accident.,34.023300,-118.172880,34.021380,-118.173390,Long Beach Fwy S,R,Los Angeles,Los Angeles,US/Pacific
3,2,2016-03-23 11:50:00,2016-03-23 17:50:00,0.257,At Colorado St - Accident.,34.144700,-118.278650,34.141040,-118.277840,Golden State Fwy S,R,Los Angeles,Los Angeles,US/Pacific
4,2,2016-03-23 12:16:00,2016-03-23 18:16:00,0.054,At I-5/Golden State Fwy - Accident. Left lane ...,34.099140,-118.251853,34.099817,-118.251396,CA-2 S,R,Los Angeles,Los Angeles,US/Pacific
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61150,2,2019-12-10 16:08:00,2019-12-10 18:01:00,0.000,At CA-134/Ventura Fwy/W Doran St - Accident.,34.153767,-118.275389,34.153767,-118.275389,San Fernando Rd,R,Los Angeles,Los Angeles,US/Pacific
61151,2,2019-12-10 16:21:00,2019-12-10 17:14:00,0.000,At I-110/Harbor Fwy/S Hill St - Accident.,34.011075,-118.281251,34.011075,-118.281251,Harbor Fwy S,R,Los Angeles,Los Angeles,US/Pacific
61152,2,2019-12-10 16:28:00,2019-12-10 17:01:00,0.000,At Forest Lawn Dr - Accident.,34.154756,-118.316479,34.154756,-118.316479,CA-134 E,R,Los Angeles,Los Angeles,US/Pacific
61153,2,2019-12-10 16:53:00,2019-12-10 17:48:00,0.000,At Venice Blvd - Accident.,34.039993,-118.291563,34.039993,-118.291563,S Vermont Ave,L,Los Angeles,Los Angeles,US/Pacific
