# Import Dependencies

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Import Accident Data

Source: https://www.kaggle.com/sobhanmoosavi/us-accidents/data

In [2]:
filepath = "US_Accidents_June20.csv"
df = pd.read_csv(filepath, low_memory=False)

df.head(5)

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day


# Find Null Values

In [3]:
count_nan_in_df = df.isnull().sum()
#print (count_nan_in_df)
count_nan_in_df

ID                             0
Source                         0
TMC                      1034799
Severity                       0
Start_Time                     0
End_Time                       0
Start_Lat                      0
Start_Lng                      0
End_Lat                  2478818
End_Lng                  2478818
Distance(mi)                   0
Description                    1
Number                   2262864
Street                         0
Side                           0
City                         112
County                         0
State                          0
Zipcode                     1069
Country                        0
Timezone                    3880
Airport_Code                6758
Weather_Timestamp          43323
Temperature(F)             65732
Wind_Chill(F)            1868249
Humidity(%)                69687
Pressure(in)               55882
Visibility(mi)             75856
Wind_Direction             58874
Wind_Speed(mph)           454609
Precipitat

# Convert Date Columns to Date Type

In [4]:
df['Start_Time'] = pd.to_datetime(df['Start_Time'], format='%Y-%m-%dT%H:%M:%S')
df['End_Time'] = pd.to_datetime(df['End_Time'], format='%Y-%m-%dT%H:%M:%S')

In [5]:
df['Start_Time'].max()

Timestamp('2020-06-30 23:18:09')

In [6]:
df['Start_Time'].min()

Timestamp('2016-02-08 00:37:08')

## Confirm Time Range of Data

# Create New Dataframe with a Subset of Columns

In [7]:
new_df = df[['ID', 'Severity','Start_Time','End_Time','Start_Lat', 'Start_Lng','Street','City','State', 'Zipcode', 'Temperature(F)', 'Humidity(%)', 'Civil_Twilight']].copy()
new_df.head(5)

Unnamed: 0,ID,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Street,City,State,Zipcode,Temperature(F),Humidity(%),Civil_Twilight
0,A-1,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,I-70 E,Dayton,OH,45424,36.9,91.0,Night
1,A-2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,Brice Rd,Reynoldsburg,OH,43068-3402,37.9,100.0,Night
2,A-3,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,State Route 32,Williamsburg,OH,45176,36.0,100.0,Night
3,A-4,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,I-75 S,Dayton,OH,45417,35.1,96.0,Day
4,A-5,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,Miamisburg Centerville Rd,Dayton,OH,45459,36.0,89.0,Day


In [8]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3513617 entries, 0 to 3513616
Data columns (total 13 columns):
 #   Column          Dtype         
---  ------          -----         
 0   ID              object        
 1   Severity        int64         
 2   Start_Time      datetime64[ns]
 3   End_Time        datetime64[ns]
 4   Start_Lat       float64       
 5   Start_Lng       float64       
 6   Street          object        
 7   City            object        
 8   State           object        
 9   Zipcode         object        
 10  Temperature(F)  float64       
 11  Humidity(%)     float64       
 12  Civil_Twilight  object        
dtypes: datetime64[ns](2), float64(4), int64(1), object(6)
memory usage: 348.5+ MB


In [14]:
new_df.describe()

Unnamed: 0,Severity,Start_Lat,Start_Lng,Temperature(F),Humidity(%)
count,3513617.0,3513617.0,3513617.0,3447885.0,3443930.0
mean,2.339929,36.54195,-95.79151,61.93512,65.11427
std,0.5521935,4.88352,17.36877,18.62106,22.75558
min,1.0,24.55527,-124.6238,-89.0,1.0
25%,2.0,33.63784,-117.4418,50.0,48.0
50%,2.0,35.91687,-91.02601,64.0,67.0
75%,3.0,40.32217,-80.93299,75.9,84.0
max,4.0,49.0022,-67.11317,170.6,100.0


# Filter for Data over the most recent 6 months

In [9]:
mask = new_df['Start_Time'] > '2019-06-01 00:00:00'

In [10]:
last_12_df = new_df.loc[mask]
last_12_df.sort_values('Start_Time', ascending=False, inplace=True)
last_12_df.reset_index(drop=True, inplace=True)
last_12_df.tail(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,ID,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Street,City,State,Zipcode,Temperature(F),Humidity(%),Civil_Twilight
1122814,A-3482074,3,2019-06-01 00:19:32,2019-06-01 00:48:50,47.040919,-123.067791,State Highway 8,Olympia,WA,98512,53.0,86.0,Night
1122815,A-3482075,2,2019-06-01 00:18:57,2019-06-01 04:18:57,45.424528,-120.770084,Highway 97,Moro,OR,97039,70.0,61.0,Night
1122816,A-1146563,2,2019-06-01 00:11:45,2019-06-01 01:11:29,43.000874,-83.653938,S Dort Hwy,Flint,MI,48507-5208,67.0,70.0,Night
1122817,A-1149357,2,2019-06-01 00:09:08,2019-06-01 00:38:42,47.467289,-122.217781,WA-167 N,Renton,WA,98055,62.0,78.0,Night
1122818,A-1146760,2,2019-06-01 00:00:41,2019-06-01 01:15:17,40.731934,-74.221436,Garden State Pkwy N,Irvington,NJ,07111,68.0,70.0,Night


# Export to CSV

In [11]:
last_12_df.to_csv(r'Leaflet Viz/static/data/last_12_months.csv')

# Randomly Sample DataSet (10% of Values)

In [19]:
sample_df = new_df.sample(n=351361, replace='False')
sample_df.head(5)

Unnamed: 0,ID,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Street,City,State,Zipcode,Temperature(F),Humidity(%),Civil_Twilight
1142180,A-1142206,2,2019-06-27 23:00:08,2019-06-27 23:29:51,29.754919,-95.334854,Navigation Blvd,Houston,TX,77003-1724,81.0,82.0,Night
3466504,A-3466668,4,2019-05-30 04:24:02,2019-05-30 04:52:38,34.50645,-117.89666,Pearblossom Hwy,Pearblossom,CA,93553-3007,54.0,53.0,Night
909231,A-909257,3,2019-11-25 07:43:27,2019-11-25 08:43:14,33.081825,-80.207222,I-26 E,Summerville,SC,29483,37.0,93.0,Day
631310,A-631321,2,2020-04-24 15:34:23,2020-04-24 18:47:12,36.021774,-95.88649,E 98th St,Tulsa,OK,74133,80.0,38.0,Day
2470492,A-2470533,3,2017-08-27 04:38:33,2017-08-27 05:07:02,38.069717,-122.226913,Carquinez Brg E,Vallejo,CA,94591,57.0,89.0,Night


In [20]:
sample_df.to_csv(r'Leaflet Viz/static/data/sample_10_percent.csv')