# Import Dependencies

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import statsmodels.api as sm
import seaborn as sn
import json
from scipy.stats import sem,chisquare
from scipy.stats import linregress
from matplotlib import cm

# Import Accident Data

Source: https://www.kaggle.com/sobhanmoosavi/us-accidents/data

In [4]:
filepath = "US_Accidents_June20.csv"
df = pd.read_csv(filepath, low_memory=False)

df.head(5)

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day


# Find Null Values

In [5]:
count_nan_in_df = df.isnull().sum()
#print (count_nan_in_df)
count_nan_in_df

ID                             0
Source                         0
TMC                      1034799
Severity                       0
Start_Time                     0
End_Time                       0
Start_Lat                      0
Start_Lng                      0
End_Lat                  2478818
End_Lng                  2478818
Distance(mi)                   0
Description                    1
Number                   2262864
Street                         0
Side                           0
City                         112
County                         0
State                          0
Zipcode                     1069
Country                        0
Timezone                    3880
Airport_Code                6758
Weather_Timestamp          43323
Temperature(F)             65732
Wind_Chill(F)            1868249
Humidity(%)                69687
Pressure(in)               55882
Visibility(mi)             75856
Wind_Direction             58874
Wind_Speed(mph)           454609
Precipitat

# Convert Date Columns to Date Type

In [6]:
df['Start_Time'] = pd.to_datetime(df['Start_Time'], format='%Y-%m-%dT%H:%M:%S')
df['End_Time'] = pd.to_datetime(df['End_Time'], format='%Y-%m-%dT%H:%M:%S')

In [7]:
df['Start_Time'].max()

Timestamp('2020-06-30 23:18:09')

In [8]:
df['Start_Time'].min()

Timestamp('2016-02-08 00:37:08')

## Confirm Time Range of Data

# Create New Dataframe with a Subset of Columns

In [9]:
new_df = df[['ID', 'Source', 'Severity','Start_Time','End_Time','Start_Lat', 'Start_Lng','Street','City','State', 'Zipcode', 'Civil_Twilight']].copy()
new_df.head(5)

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Street,City,State,Zipcode,Civil_Twilight
0,A-1,MapQuest,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,I-70 E,Dayton,OH,45424,Night
1,A-2,MapQuest,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,Brice Rd,Reynoldsburg,OH,43068-3402,Night
2,A-3,MapQuest,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,State Route 32,Williamsburg,OH,45176,Night
3,A-4,MapQuest,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,I-75 S,Dayton,OH,45417,Day
4,A-5,MapQuest,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,Miamisburg Centerville Rd,Dayton,OH,45459,Day


In [10]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3513617 entries, 0 to 3513616
Data columns (total 12 columns):
 #   Column          Dtype         
---  ------          -----         
 0   ID              object        
 1   Source          object        
 2   Severity        int64         
 3   Start_Time      datetime64[ns]
 4   End_Time        datetime64[ns]
 5   Start_Lat       float64       
 6   Start_Lng       float64       
 7   Street          object        
 8   City            object        
 9   State           object        
 10  Zipcode         object        
 11  Civil_Twilight  object        
dtypes: datetime64[ns](2), float64(2), int64(1), object(7)
memory usage: 321.7+ MB


# Filter for Data over the most recent 6 months

In [11]:
mask = (new_df['Start_Time'] > '2019-03-01 00:00:00') &(new_df['Start_Time'] < '2019-07-31 00:00:00')

In [12]:
last_6_df = new_df.loc[mask]
last_6_df.sort_values('Start_Time', ascending=False, inplace=True)
last_6_df.reset_index(drop=True, inplace=True)
last_6_df.tail(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Street,City,State,Zipcode,Civil_Twilight
347272,A-1320029,MapQuest,2,2019-03-01 00:23:48,2019-03-01 00:53:32,38.10321,-122.514114,CA-37 E,Novato,CA,94945,Night
347273,A-1320028,MapQuest,2,2019-03-01 00:19:56,2019-03-01 00:49:46,38.363258,-122.725243,Wilford Ln,Santa Rosa,CA,95407-8429,Night
347274,A-3174133,Bing,4,2019-03-01 00:13:28,2019-03-01 05:12:08,43.26127,-122.42839,N Umpqua Hwy,Glide,OR,97443,Night
347275,A-3174134,Bing,2,2019-03-01 00:10:17,2019-03-01 04:23:49,42.775101,-123.292037,I-5 S,Glendale,OR,97442,Night
347276,A-3174132,Bing,2,2019-03-01 00:08:44,2019-03-01 04:08:44,42.599005,-123.383793,I-5 S,Grants Pass,OR,97526,Night


In [13]:
last_6_df.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Street,City,State,Zipcode,Civil_Twilight
0,A-3503015,Bing,2,2019-07-30 23:57:59,2019-07-31 00:25:32,33.82165,-117.87627,CA-57 S,Anaheim,CA,92806,Night
1,A-3502850,Bing,2,2019-07-30 23:50:26,2019-07-31 03:50:26,44.615142,-117.491259,I-84 E,Huntington,OR,97907,Night
2,A-1064323,MapQuest,2,2019-07-30 23:49:24,2019-07-31 03:20:58,34.281811,-118.716583,Ronald Reagan Fwy W,Simi Valley,CA,93063,Night
3,A-1060571,MapQuest,2,2019-07-30 23:47:37,2019-07-31 00:37:00,40.790619,-73.202011,E Suffolk Ave,Central Islip,NY,11722-2340,Night
4,A-1060570,MapQuest,2,2019-07-30 23:45:31,2019-07-31 01:10:12,40.651737,-73.865746,Belt Pkwy W,Brooklyn,NY,11239,Night


# Export to CSV

In [14]:
# last_6_df.to_csv('2019data.csv')