In [1]:
from bs4 import BeautifulSoup
import requests
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
import pandas as pd
import os
import psycopg2
from sqlalchemy import create_engine
import string
import time

In [2]:
# Settings that control which FARS files are downloaded and where they are stored.
configs = dict(
    url='https://www.nhtsa.gov/file-downloads?p=nhtsa/downloads/FARS/',  # download index page
    start_year=2000,       # earliest year for which data is downloaded
    national=True,         # download the national data
    puerto_rico=True,      # download the data from Puerto Rico
    auxiliary=False,       # download the auxiliary data
    data_path='./data/',   # directory in which the data is stored
)

In [3]:
# Fetch the download index page. A timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() stops early on an HTTP error
# page instead of parsing it as if it were the index.
response = requests.get(configs['url'], timeout=60)
response.raise_for_status()
site = response.text
soup = BeautifulSoup(site,'html.parser')

# All links are taken from the first table body of the page.
table_body = soup.find("tbody")
if table_body is None:
    raise RuntimeError("no <tbody> element found at " + configs['url'])
hyperlinks = table_body.findChildren("a")

In [4]:
# characters 38 to 41 of every href are read as the year of that link
year_tokens = [anchor['href'][38:42] for anchor in hyperlinks]

# keep only numeric tokens that are not older than the configured start year
selected_years = [
    token for token in year_tokens
    if token.isnumeric() and int(token) >= configs['start_year']
]

# dict.fromkeys drops repeated years while keeping their first-seen order
hyperlink_list = [
    configs['url'] + year + "/"
    for year in dict.fromkeys(selected_years)
]
# hyperlink_list now holds one download-area link per available year,
# starting with configs['start_year']

In [5]:
def download(link_list,path):
    """Download every zip archive in ``link_list`` and unpack it below ``path``.

    link_list -- list of URLs (strings), each pointing to one zip archive
    path      -- target directory as a string; the year is appended to it by
                 plain string concatenation, so it has to end with a path
                 separator

    Each archive is extracted into ``path + <year>``. The year is read from the
    link itself (its last path segment that consists of exactly four digits),
    so the folder name stays correct when a year is missing from the list or
    the list is not in ascending order. A link without such a segment falls
    back to the positional numbering ``configs['start_year'] + index``.
    """
    for position, link in enumerate(link_list):
        year_segments = [part for part in link.split('/') if len(part) == 4 and part.isdigit()]
        if year_segments:
            year = year_segments[-1]
        else:
            year = str(configs['start_year'] + position)

        # the whole archive is read into memory and unpacked from there
        with urlopen(link) as zipresp:
            with ZipFile(BytesIO(zipresp.read())) as zfile:
                zfile.extractall(path + year)

In [6]:
#this downloads all the files according to the configs dictionary
#links to the zip-files are constructed based on the hyperlink_list
#the target directories are built from configs['data_path'], so the files end up
#in the directory that is configured there

# one entry per group of files:
# (download this group?, folder on the server, ending of the zip-file name, local sub-directories)
download_groups = [
    (configs['national'],
     'National/', 'NationalCSV.zip', ('standard', 'national')),
    (configs['puerto_rico'],
     'Puerto%20Rico/', 'PuertoRicoCSV.zip', ('standard', 'puerto_rico')),
    (configs['auxiliary'],
     'National/', 'NationalAuxiliaryCSV.zip', ('auxiliary', 'national')),
    (configs['auxiliary'] and configs['puerto_rico'],
     'Puerto%20Rico/', 'PuertoRicoAuxiliaryCSV.zip', ('auxiliary', 'puerto_rico')),
]

for enabled, server_folder, zip_ending, target_parts in download_groups:
    if not enabled:
        continue

    # i[-5:-1] is the four-character year at the end of every entry of hyperlink_list;
    # the two replace calls turn the link of the download page into the link of the file
    temp_list = [
        (i + server_folder + 'FARS' + i[-5:-1] + zip_ending)
        .replace('www', 'static')
        .replace('file-downloads?p=', '')
        for i in hyperlink_list
    ]

    # the trailing '' makes the joined path end with a separator, which is needed
    # because download() appends the year directly to the path
    download(temp_list, os.path.join(configs['data_path'], *target_parts, ''))

In [7]:
#this function adds the data to the database, first trying to append the data to an existing table
#if the table does not exist, a new table will be created
def add_to_database(dataframe,table,con_engine):
    """Write ``dataframe`` into the database table ``table``.

    dataframe  -- pandas DataFrame holding the rows to store
    table      -- name of the target table (string)
    con_engine -- database connection or engine that pandas can write to

    The rows are appended first. If the append raises (for example because the
    dataframe has a column the table does not have yet), the complete table is
    read back, concatenated with ``dataframe`` and written again, replacing the
    existing table. An error raised by this second attempt is passed on to the
    caller.
    """
    try:
        #this will fail if there is a new column
        dataframe.to_sql(name=table, con=con_engine, if_exists = 'append', index=False)
    except Exception:
        # Exception instead of a bare except, so that KeyboardInterrupt and
        # SystemExit still stop a long running import.

        # The table name goes into the statement as a quoted identifier, so names
        # with upper-case letters, blanks or reserved words can be read as well.
        dialect = getattr(con_engine, 'dialect', None)
        if dialect is not None:
            # SQLAlchemy engine/connection: quote according to the database dialect
            quoted_table = dialect.identifier_preparer.quote(table)
        else:
            # plain DBAPI connection: standard SQL double-quote quoting
            quoted_table = '"' + table.replace('"', '""') + '"'

        #first all data from the table is queried out of the database, then the concatenated data is written
        #into the database, overwriting any existing table with the name
        data = pd.read_sql('SELECT * FROM ' + quoted_table, con_engine)
        df2 = pd.concat([data,dataframe])
        df2.to_sql(name=table, con=con_engine, if_exists = 'replace', index=False)

In [8]:
def get_filepaths():
    """Return a list with the path of every file below ``configs['data_path']``.

    The directory tree is walked with os.walk; each path is the containing
    directory joined with the file name.
    """
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(configs['data_path'])
        for filename in filenames
    ]

In [9]:
def timestamp(row):
    """Build a ``pd.Timestamp`` from the date and time fields of one row.

    row -- mapping or pandas Series with the keys 'YEAR', 'MONTH', 'DAY',
           'HOUR' and 'MINUTE' (works for the accidents table)

    Values outside the valid range are replaced by a default instead of
    raising: the day becomes 1, the hour 0 and the minute 0. The day is checked
    against the real length of the month, so day 31 is kept in months that have
    31 days. 'YEAR' and 'MONTH' are used as they are; an invalid year or month
    still raises.
    """
    year = int(row['YEAR'])
    month = int(row['MONTH'])

    # number of days of this month, used as the upper bound for the day
    last_day = pd.Timestamp(year, month, 1).days_in_month

    day = int(row['DAY']) if 1 <= row['DAY'] <= last_day else 1
    hour = int(row['HOUR']) if 0 <= row['HOUR'] < 24 else 0
    minute = int(row['MINUTE']) if 0 <= row['MINUTE'] < 60 else 0

    return pd.Timestamp(year, month, day, hour, minute)

In [10]:
def primarykey(pk_df,information):
    """Add a primary key that is unique across all years to ``pk_df``.

    pk_df       -- dataframe of one table of one year; it is modified in place
                   when columns are added
    information -- list of two elements: the year of the data (integer) and the
                   name of the table (string)

    The unique identifiers in the underlying data are reused every year, so the
    year is put in front of them. Table and column names are compared without
    regard to upper/lower case, so the function also works after the column
    names have been lower-cased.

    Returns the dataframe; it has 'PRIMARY_KEY_CASE' as first column if the
    table has an ST_CASE column, and a 'TIMESTAMP' column for the accident table.
    """
    if str(information[1]).upper() == 'ACCIDENT':
        #this adds a new column with a timestamp to the accidents table/dataframe
        #timestamp() reads upper-case column names, so it gets an upper-cased view of the rows
        upper_view = pk_df.rename(columns=lambda col: str(col).upper())
        pk_df['TIMESTAMP'] = upper_view.apply(timestamp,axis = 1)

    case_columns = [col for col in pk_df.columns if str(col).upper() == 'ST_CASE']
    if case_columns:
        #ST_CASE is the unique identifier of the accidents table, it is a 5 to 6 digits integer number
        #it is added to a 10 digit integer number with 6 trailing zeroes and the front being the year
        #in which the data was gathered
        front_part = information[0]*1000000
        pk_df['PRIMARY_KEY_CASE'] = pk_df[case_columns[0]]+front_part
        pk_df = change_col_position(['PRIMARY_KEY_CASE'],0,pk_df)

    return pk_df

In [11]:
def change_col_position(colnames,position,dataframe):
    """Return ``dataframe`` with the columns ``colnames`` moved to ``position``.

    colnames  -- sequence of column names to move
    position  -- integer index at which the columns are inserted, counted in
                 the column list that remains after the columns were taken out
    dataframe -- dataframe whose columns are rearranged
    """
    remaining = [*dataframe.columns]

    # take the columns out of their current place
    for name in colnames:
        remaining.remove(name)

    # inserting in reverse order at the same index keeps the order of colnames
    for name in reversed(colnames):
        remaining.insert(position, name)

    return dataframe[remaining]


In [12]:
# engine for database connection
# The connection URL can be supplied through the environment variable
# FARS_DATABASE_URL, so that credentials do not have to be stored in the
# notebook; the URL of the local database stays the default.
engine = create_engine(
    os.environ.get(
        "FARS_DATABASE_URL",
        "postgresql+psycopg2://postgres:admin@localhost/NHTSA_FARS_NATIONAL",
    )
)

# big tables are written in several slices instead of one to_sql call
# more tables might need to be added on a different machine,
# automation for big table detection is work in progress
CHUNKED_TABLES = ('person', 'vehicle')
# rows per slice, tested with a 32GB RAM Windows System, adaptations might be needed
CHUNK_ROWS = 150000

paths = get_filepaths() #paths is a list of paths for every file in every directory in the data directory

# The files are grouped by their lower-cased file name, which takes the different
# capitalizations of the files into account. os.path is used to take the paths
# apart, so this works with the path separator of any operating system, and
# comparing the complete file name avoids confusion between tables having
# similar endings.
paths_by_name = {}
for file_path in paths:
    paths_by_name.setdefault(os.path.basename(file_path).lower(), []).append(file_path)

name_set = set(paths_by_name)   #name_set is a set containing every unique filename
name_list = sorted(name_set)
print(name_list)

for file_name in name_list:

    #paths to all csv files that belong to the same table
    same_table_path_list = paths_by_name[file_name]
    print(same_table_path_list)

    table_name = file_name.split('.')[0] # discards the csv ending

    frame_list = []
    for csv_path in same_table_path_list:
        #information is a list of 2 elements, first is the year of the dataframe/file, second is the name
        #the year is the name of the directory that contains the file
        information = [int(os.path.basename(os.path.dirname(csv_path))), table_name]
        print(information)

        df_each_year = pd.read_csv(csv_path,low_memory = False,encoding='latin_1')
        #low_memory = True leads to a warning, to avoid that, it is set to false, workaround is pending

        df_each_year.columns = df_each_year.columns.str.lower()
        #column names are not consistent in capitalization across the different years, fixed here

        df_each_year = primarykey(df_each_year,information)
        frame_list.append(df_each_year)

    df = pd.concat(frame_list) #create one big dataframe out of the different small ones

    if table_name in CHUNKED_TABLES:
        #this starts at the back of the table/with the most recent data, to have values in as many columns as possible
        #due to a drastic reduction in execution time compared to chronological inserts in the pandas to_sql function
        for end in range(len(df), 0, -CHUNK_ROWS):
            start = max(end - CHUNK_ROWS, 0) #the first slice of the table may be shorter
            add_to_database(df.iloc[start:end],table_name,engine)
    else:
        #this is for small and medium sized tables
        add_to_database(df,table_name,engine)


['acc_aux.csv', 'accident.csv', 'cevent.csv', 'cevents.csv', 'crashrf.csv', 'damage.csv', 'distract.csv', 'drimpair.csv', 'driverrf.csv', 'drugs.csv', 'factor.csv', 'maneuver.csv', 'miacc.csv', 'midrvacc.csv', 'miper.csv', 'nmcrash.csv', 'nmdistract.csv', 'nmimpair.csv', 'nmprior.csv', 'parkwork.csv', 'pbtype.csv', 'per_aux.csv', 'person.csv', 'personrf.csv', 'pvehiclesf.csv', 'race.csv', 'safetyeq.csv', 'veh_aux.csv', 'vehicle.csv', 'vehiclesf.csv', 'vehnit.csv', 'vevent.csv', 'vindecode.csv', 'violatn.csv', 'vision.csv', 'vpicdecode.csv', 'vpictrailerdecode.csv', 'vsoe.csv', 'weather.csv']
['./data/auxiliary\\national\\2000\\ACC_AUX.CSV', './data/auxiliary\\national\\2001\\ACC_AUX.CSV', './data/auxiliary\\national\\2002\\ACC_AUX.CSV', './data/auxiliary\\national\\2003\\ACC_AUX.CSV', './data/auxiliary\\national\\2004\\ACC_AUX.CSV', './data/auxiliary\\national\\2005\\ACC_AUX.CSV', './data/auxiliary\\national\\2006\\ACC_AUX.CSV', './data/auxiliary\\national\\2007\\ACC_AUX.CSV', './data/

[2016, 'damage']
[2017, 'damage']
[2018, 'damage']
[2019, 'damage']
[2020, 'damage']
[2012, 'damage']
[2013, 'damage']
[2014, 'damage']
[2015, 'damage']
[2016, 'damage']
[2017, 'damage']
[2018, 'damage']
[2019, 'damage']
[2020, 'damage']
['./data/standard\\national\\2010\\DISTRACT.CSV', './data/standard\\national\\2011\\DISTRACT.CSV', './data/standard\\national\\2012\\DISTRACT.CSV', './data/standard\\national\\2013\\DISTRACT.CSV', './data/standard\\national\\2014\\DISTRACT.CSV', './data/standard\\national\\2015\\Distract.csv', './data/standard\\national\\2016\\Distract.CSV', './data/standard\\national\\2017\\Distract.CSV', './data/standard\\national\\2018\\Distract.csv', './data/standard\\national\\2019\\Distract.CSV', './data/standard\\national\\2020\\Distract.CSV', './data/standard\\puerto_rico\\2010\\DISTRACT.csv', './data/standard\\puerto_rico\\2011\\DISTRACT.csv', './data/standard\\puerto_rico\\2012\\DISTRACT.csv', './data/standard\\puerto_rico\\2013\\DISTRACT.csv', './data/standa

[2009, 'miacc']
[2010, 'miacc']
[2011, 'miacc']
[2012, 'miacc']
[2013, 'miacc']
[2014, 'miacc']
[2015, 'miacc']
[2016, 'miacc']
[2017, 'miacc']
[2018, 'miacc']
[2019, 'miacc']
[2020, 'miacc']
[2000, 'miacc']
[2001, 'miacc']
[2002, 'miacc']
[2003, 'miacc']
[2004, 'miacc']
[2005, 'miacc']
[2006, 'miacc']
[2007, 'miacc']
[2008, 'miacc']
[2009, 'miacc']
[2010, 'miacc']
[2011, 'miacc']
[2012, 'miacc']
[2013, 'miacc']
[2014, 'miacc']
[2015, 'miacc']
[2016, 'miacc']
[2017, 'miacc']
[2018, 'miacc']
[2019, 'miacc']
[2020, 'miacc']
['./data/standard\\national\\2000\\MIDRVACC.CSV', './data/standard\\national\\2001\\MIDRVACC.CSV', './data/standard\\national\\2002\\MIDRVACC.CSV', './data/standard\\national\\2003\\MIDRVACC.CSV', './data/standard\\national\\2004\\MIDRVACC.CSV', './data/standard\\national\\2005\\MIDRVACC.CSV', './data/standard\\national\\2006\\MIDRVACC.CSV', './data/standard\\national\\2007\\MIDRVACC.CSV', './data/standard\\national\\2008\\MIDRVACC.CSV', './data/standard\\national\\20

['./data/standard\\national\\2010\\NMPRIOR.CSV', './data/standard\\national\\2011\\NMPRIOR.CSV', './data/standard\\national\\2012\\NMPRIOR.CSV', './data/standard\\national\\2013\\NMPRIOR.CSV', './data/standard\\national\\2014\\NMPRIOR.CSV', './data/standard\\national\\2015\\NMPrior.csv', './data/standard\\national\\2016\\NMPrior.CSV', './data/standard\\national\\2017\\NMPrior.CSV', './data/standard\\national\\2018\\NMPrior.csv', './data/standard\\national\\2019\\NMPrior.CSV', './data/standard\\national\\2020\\NMPrior.CSV', './data/standard\\puerto_rico\\2010\\NMPRIOR.csv', './data/standard\\puerto_rico\\2011\\NMPRIOR.csv', './data/standard\\puerto_rico\\2012\\NMPRIOR.csv', './data/standard\\puerto_rico\\2013\\NMPRIOR.csv', './data/standard\\puerto_rico\\2014\\NMPRIOR.csv', './data/standard\\puerto_rico\\2015\\NMPrior.csv', './data/standard\\puerto_rico\\2016\\NMPrior.CSV', './data/standard\\puerto_rico\\2017\\NMPrior.CSV', './data/standard\\puerto_rico\\2018\\NMPrior.csv', './data/stan

[2001, 'person']
[2002, 'person']
[2003, 'person']
[2004, 'person']
[2005, 'person']
[2006, 'person']
[2007, 'person']
[2008, 'person']
[2009, 'person']
[2010, 'person']
[2011, 'person']
[2012, 'person']
[2013, 'person']
[2014, 'person']
[2015, 'person']
[2016, 'person']
[2017, 'person']
[2018, 'person']
[2019, 'person']
[2020, 'person']
[2000, 'person']
[2001, 'person']
[2002, 'person']
[2003, 'person']
[2004, 'person']
[2005, 'person']
[2006, 'person']
[2007, 'person']
[2008, 'person']
[2009, 'person']
[2010, 'person']
[2011, 'person']
[2012, 'person']
[2013, 'person']
[2014, 'person']
[2015, 'person']
[2016, 'person']
[2017, 'person']
[2018, 'person']
[2019, 'person']
[2020, 'person']
['./data/standard\\national\\2020\\PERSONRF.CSV', './data/standard\\puerto_rico\\2020\\PERSONRF.CSV']
[2020, 'personrf']
[2020, 'personrf']
['./data/standard\\national\\2020\\pvehiclesf.CSV', './data/standard\\puerto_rico\\2020\\pvehiclesf.CSV']
[2020, 'pvehiclesf']
[2020, 'pvehiclesf']
['./data/standa

[2013, 'vehicle']
[2014, 'vehicle']
[2015, 'vehicle']
[2016, 'vehicle']
[2017, 'vehicle']
[2018, 'vehicle']
[2019, 'vehicle']
[2020, 'vehicle']
['./data/standard\\national\\2020\\vehiclesf.CSV', './data/standard\\puerto_rico\\2020\\vehiclesf.CSV']
[2020, 'vehiclesf']
[2020, 'vehiclesf']
['./data/standard\\national\\2005\\VEHNIT.CSV', './data/standard\\national\\2006\\VEHNIT.CSV', './data/standard\\national\\2007\\VEHNIT.CSV', './data/standard\\national\\2008\\VEHNIT.CSV', './data/standard\\national\\2009\\VEHNIT.CSV', './data/standard\\puerto_rico\\2005\\VEHNIT.csv', './data/standard\\puerto_rico\\2006\\VEHNIT.csv', './data/standard\\puerto_rico\\2007\\VEHNIT.csv', './data/standard\\puerto_rico\\2008\\VEHNIT.csv', './data/standard\\puerto_rico\\2009\\VEHNIT.csv']
[2005, 'vehnit']
[2006, 'vehnit']
[2007, 'vehnit']
[2008, 'vehnit']
[2009, 'vehnit']
[2005, 'vehnit']
[2006, 'vehnit']
[2007, 'vehnit']
[2008, 'vehnit']
[2009, 'vehnit']
['./data/standard\\national\\2010\\VEVENT.CSV', './data/

[2015, 'vsoe']
[2016, 'vsoe']
[2017, 'vsoe']
[2018, 'vsoe']
[2019, 'vsoe']
[2020, 'vsoe']
[2010, 'vsoe']
[2011, 'vsoe']
[2012, 'vsoe']
[2013, 'vsoe']
[2014, 'vsoe']
[2015, 'vsoe']
[2016, 'vsoe']
[2017, 'vsoe']
[2018, 'vsoe']
[2019, 'vsoe']
[2020, 'vsoe']
['./data/standard\\national\\2020\\weather.CSV', './data/standard\\puerto_rico\\2020\\weather.CSV']
[2020, 'weather']
[2020, 'weather']


In [13]:
#paths = get_filepaths() 

In [14]:
#paths[1].split('\\')[-1]