## Project 1 - Los Angeles Traffic Collision Analysis

In [242]:
#import modules for the analysis environment
import os
import matplotlib.pyplot as plta
import pandas as pd
import numpy as np
import requests
import time
import datetime

#read csv file
file_path = "Data/Raw/traffic-collision-data-from-2010-to-present.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Premise Description,Address,Cross Street,Location,Zip Codes,Census Tracts,Precinct Boundaries,LA Specific Plans,Council Districts,Neighborhood Councils (Certified)
0,191912240,2019-06-22T00:00:00.000,2019-06-22T00:00:00.000,1150,19,Mission,1985,997,TRAFFIC COLLISION,,...,STREET,PARTHENIA ST,TOBIAS AV,"{'latitude': '34.2279', 'human_address': '{""ad...",19730.0,144.0,417.0,,3.0,59.0
1,190315354,2019-06-22T00:00:00.000,2019-06-22T00:00:00.000,1600,3,Southwest,357,997,TRAFFIC COLLISION,,...,STREET,JEFFERSON,CATALINA,"{'latitude': '34.0255', 'human_address': '{""ad...",22724.0,691.0,913.0,7.0,14.0,32.0
2,191011360,2019-06-22T00:00:00.000,2019-06-22T00:00:00.000,650,10,West Valley,1001,997,TRAFFIC COLLISION,,...,STREET,ROSCOE,SHIRLEY,"{'latitude': '34.2204', 'human_address': '{""ad...",18909.0,101.0,1545.0,,2.0,65.0
3,192111796,2019-06-22T00:00:00.000,2019-06-22T00:00:00.000,525,21,Topanga,2145,997,TRAFFIC COLLISION,,...,STREET,VICTORY BL,SHOUP AV,"{'latitude': '34.1865', 'human_address': '{""ad...",4278.0,297.0,1473.0,,4.0,49.0
4,191512949,2019-06-22T00:00:00.000,2019-06-22T00:00:00.000,245,15,N Hollywood,1543,997,TRAFFIC COLLISION,,...,STREET,LAUREL CANYON BL,MAGNOLIA BL,"{'latitude': '34.1649', 'human_address': '{""ad...",8890.0,205.0,1332.0,17.0,5.0,39.0


## Stats of Series in the Dataset

In [4]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 477377 entries, 0 to 477376
Data columns (total 24 columns):
DR Number                            477377 non-null int64
Date Reported                        477377 non-null object
Date Occurred                        477377 non-null object
Time Occurred                        477377 non-null int64
Area ID                              477377 non-null int64
Area Name                            477377 non-null object
Reporting District                   477377 non-null int64
Crime Code                           477377 non-null int64
Crime Code Description               477377 non-null object
MO Codes                             392557 non-null object
Victim Age                           399952 non-null float64
Victim Sex                           470422 non-null object
Victim Descent                       469708 non-null object
Premise Code                         477352 non-null float64
Premise Description                  477352 non-null

## Cleaning Data

In [293]:
dfclean = df.drop(columns=['DR Number', #unique value but we have index
        'Date Reported', #repetitive series
        'MO Codes', #mostly NaN values
        'Crime Code', #repetitive
        'Crime Code Description', #uniform data
       'LA Specific Plans', #unrelated data with NaN values
        'Area ID', #repetitive data
       'Premise Description',#useless data 
        'Neighborhood Councils (Certified)', #irrelevant
        'Precinct Boundaries', #irrelevant
        'Council Districts', # irrelevant
        'Census Tracts', #meaningless data
                ])
dfclean.columns = ['date','time','area','district','victim_Age','victim_sex','victim_race','premise','address',
             'cross_street','location','zip']
dfclean.head()

Unnamed: 0,date,time,area,district,victim_Age,victim_sex,victim_race,premise,address,cross_street,location,zip
0,2019-06-22T00:00:00.000,1150,Mission,1985,32.0,M,H,101.0,PARTHENIA ST,TOBIAS AV,"{'latitude': '34.2279', 'human_address': '{""ad...",19730.0
1,2019-06-22T00:00:00.000,1600,Southwest,357,27.0,M,H,101.0,JEFFERSON,CATALINA,"{'latitude': '34.0255', 'human_address': '{""ad...",22724.0
2,2019-06-22T00:00:00.000,650,West Valley,1001,20.0,F,H,101.0,ROSCOE,SHIRLEY,"{'latitude': '34.2204', 'human_address': '{""ad...",18909.0
3,2019-06-22T00:00:00.000,525,Topanga,2145,44.0,F,H,101.0,VICTORY BL,SHOUP AV,"{'latitude': '34.1865', 'human_address': '{""ad...",4278.0
4,2019-06-22T00:00:00.000,245,N Hollywood,1543,43.0,F,H,101.0,LAUREL CANYON BL,MAGNOLIA BL,"{'latitude': '34.1649', 'human_address': '{""ad...",8890.0


## victim_race

In [294]:
# country_description = {'H':'Hispanic', 'B':'Black', 'O':'Unknown', 'W':'White', 'X':'Unknown', '-':'Unknown',
#              'A':'Asian', 'K':'Korean', 'C':'Chinese', 'F':' ', 'U':' ',
#              'J':'Japanese', 'P':'Pacific Islander', 'V':'Vietnese', 'Z':' ',
#              'I':'American Indian', 'G':'Pacific Islander', 'S':' ', 'D':' ', 'L':' '}
race_description = {'H':'Hispanic', 'B':'Black', 'O':'Unknown', 'W':'White', 'X':'Unknown', '-':'Unknown',
             'A':'Asian', 'K':'Asian', 'C':'Asian', 'F':'Asian', 'U':'Pacific Islander',
             'J':'Asian', 'P':'Pacific Islander', 'V':'Asian', 'Z':'Asian',
             'I':'American Indian', 'G':'Pacific Islander', 'S':'Pacific Islander', 'D':'Asian', 'L':'Asian'}
dfclean.victim_race = dfclean.victim_race.map(race_description)
dfclean.head()

Unnamed: 0,date,time,area,district,victim_Age,victim_sex,victim_race,premise,address,cross_street,location,zip
0,2019-06-22T00:00:00.000,1150,Mission,1985,32.0,M,Hispanic,101.0,PARTHENIA ST,TOBIAS AV,"{'latitude': '34.2279', 'human_address': '{""ad...",19730.0
1,2019-06-22T00:00:00.000,1600,Southwest,357,27.0,M,Hispanic,101.0,JEFFERSON,CATALINA,"{'latitude': '34.0255', 'human_address': '{""ad...",22724.0
2,2019-06-22T00:00:00.000,650,West Valley,1001,20.0,F,Hispanic,101.0,ROSCOE,SHIRLEY,"{'latitude': '34.2204', 'human_address': '{""ad...",18909.0
3,2019-06-22T00:00:00.000,525,Topanga,2145,44.0,F,Hispanic,101.0,VICTORY BL,SHOUP AV,"{'latitude': '34.1865', 'human_address': '{""ad...",4278.0
4,2019-06-22T00:00:00.000,245,N Hollywood,1543,43.0,F,Hispanic,101.0,LAUREL CANYON BL,MAGNOLIA BL,"{'latitude': '34.1649', 'human_address': '{""ad...",8890.0


## Formatting Date and Time

In [313]:
#create bins to extract hour from 'time'
bins = []
hour = []
for i in range (0,24):
    bins.append(99*i+i-1)
    hour.append(i)
hour.pop(-1)
dfclean['hour'] = pd.cut(dfclean['time'],bins,labels=hour)
dfclean['date']=pd.to_datetime(dfclean['date'])
dfclean['month']=dfclean['date'].dt.month
dfclean['weekday'] = dfclean['date'].dt.dayofweek
weekday_description = {0:'Sunday', 1:'Monday', 2:'Tuesday',3:'Wednesday', 4:'Thursday', 5:'Friday',
                       6:'Saturday'}
dfclean.weekday=dfclean.weekday.map(weekday_description)


## Longitude and Latitude

In [315]:
import ast

In [316]:
dfclean["lat"] = dfclean["location"].apply(lambda u: float(ast.literal_eval(u)["latitude"]))
dfclean["lon"] = dfclean["location"].apply(lambda u: float(ast.literal_eval(u)["longitude"]))

KeyError: 'location'

In [317]:
dfclean.head(10)

Unnamed: 0,date,time,area,district,victim_Age,victim_sex,victim_race,premise,address,cross_street,zip,hour,weekday,lat,lon,month
0,2019-06-22,1150,Mission,1985,32.0,M,Hispanic,101.0,PARTHENIA ST,TOBIAS AV,19730.0,11,Friday,34.2279,-118.4516,6
1,2019-06-22,1600,Southwest,357,27.0,M,Hispanic,101.0,JEFFERSON,CATALINA,22724.0,16,Friday,34.0255,-118.2937,6
2,2019-06-22,650,West Valley,1001,20.0,F,Hispanic,101.0,ROSCOE,SHIRLEY,18909.0,6,Friday,34.2204,-118.5579,6
3,2019-06-22,525,Topanga,2145,44.0,F,Hispanic,101.0,VICTORY BL,SHOUP AV,4278.0,5,Friday,34.1865,-118.6146,6
4,2019-06-22,245,N Hollywood,1543,43.0,F,Hispanic,101.0,LAUREL CANYON BL,MAGNOLIA BL,8890.0,2,Friday,34.1649,-118.3965,6
5,2019-06-22,1030,Devonshire,1722,,X,Unknown,101.0,JORDAN,HIAWATHA,4284.0,10,Friday,34.2591,-118.6039,6
6,2019-06-22,109,Van Nuys,904,,M,Unknown,101.0,RAYMER ST,KESTER AV,19733.0,1,Friday,34.2138,-118.4618,6
7,2019-06-22,1320,N Hollywood,1543,40.0,M,Hispanic,101.0,LAUREL CANYON BL,ALBERS ST,8890.0,13,Friday,34.1704,-118.3965,6
8,2019-06-22,110,Van Nuys,911,,X,Unknown,101.0,ORION AV,SHERMAN WY,19734.0,1,Friday,34.2012,-118.4706,6
9,2019-06-22,830,Foothill,1675,,M,Hispanic,101.0,PENDLETON ST,GLENOAKS BL,19335.0,8,Friday,34.2364,-118.3761,6


In [318]:
dfclean = dfclean.drop(columns=['location'])
dfclean.to_csv('Data/Clean/cleaned_data.csv',index=False)