# Grand Circus Final Project
### Car Crash and Safety Data Comparisons/Evaluations

This project aims to compare safety ratings from crash tests to actual data of fatal crashes. The use of fatal crash data is better suited for hard crashes where occupant life is and was in danger, providing more relevant data entries compared to fender benders or other minimal 'traffic incidents'. This analysis could be useful for car buyers, car manufactureres, government testers, and insurance companies.

## Extraction
To start the ETA process, data but be extracted and placed into usable structures. To do this, we will be importing the data from the api(s) and any other flat file sources.

In [3]:
import pandas as pd
import requests
import json
import tqdm

# Begin pulling make names and ID's for internal use
# Definitions endpoint query
make_url = "https://crashviewer.nhtsa.dot.gov/CrashAPI/definitions/GetVariableAttributes?variable=make&caseYear=2021&format=json"

# Get response
response = requests.get(make_url)
# Turn response into json
data = response.json()

In [4]:
#Drill down json to list of dictionary
results = data['Results'][0]

In [5]:
# split data into lists
id_list = []
name_list = []
for entry in results:
    id_list.append(int(entry['ID']))
    name_list.append(entry['TEXT'])

# Make columns dictionary based on lists
data = {'MakeID': id_list, 'Name': name_list}

# Create df using dictionary
manufacturer_df = pd.DataFrame(data)

# Sort by Id instead of name
manufacturer_df = manufacturer_df.sort_values(by=['MakeID'])
manufacturer_df.head(20)

Unnamed: 0,MakeID,Name
3,1,American Motors
38,2,Jeep / Kaiser-Jeep / Willys- Jeep
2,3,AM General
13,6,Chrysler
18,7,Dodge
32,8,Imperial
64,9,Plymouth
20,10,Eagle
23,12,Ford
45,13,Lincoln


### Fetching Model IDs

In [7]:
import time
all_models = []
for make_ID in manufacturer_df['MakeID']:
    model_url = f'https://crashviewer.nhtsa.dot.gov/CrashAPI/definitions/GetVariableAttributesForModel?variable=model&caseYear=2021&make={make_ID}&format=json'
    response = requests.get(model_url)
    model_data = response.json()
    
    results_model = model_data.get('Results') 

    time.sleep(1)
    for model in results_model:
        all_models.append({
            'MakeID': make_ID,
            'Models': model[0:]
        })
# Drill down into JSON
drill_down = all_models[0]['Models']
drill_down

[{'ID': 3, 'MODELNAME': 'Ambassador', 'Make': None},
 {'ID': 5, 'MODELNAME': 'AMX', 'Make': None},
 {'ID': 9, 'MODELNAME': 'Eagle', 'Make': None},
 {'ID': 10, 'MODELNAME': 'Eagle SX-4', 'Make': None},
 {'ID': 7, 'MODELNAME': 'Hornet/Concord', 'Make': None},
 {'ID': 6, 'MODELNAME': 'Javelin', 'Make': None},
 {'ID': 398, 'MODELNAME': 'Other (automobile)', 'Make': None},
 {'ID': 4, 'MODELNAME': 'Pacer', 'Make': None},
 {'ID': 1, 'MODELNAME': 'Rambler/American', 'Make': None},
 {'ID': 2, 'MODELNAME': 'Rebel/Matador/Marlin', 'Make': None},
 {'ID': 8, 'MODELNAME': 'Spirit/Gremlin', 'Make': None},
 {'ID': 399, 'MODELNAME': 'Unknown (automobile)', 'Make': None}]

In [8]:
models_df = pd.DataFrame(all_models).sort_values(by='MakeID')
models_df

Unnamed: 0,MakeID,Models
0,1,"[{'ID': 3, 'MODELNAME': 'Ambassador', 'Make': ..."
1,2,"[{'ID': 404, 'MODELNAME': 'Cherokee (1984-on) ..."
2,3,"[{'ID': 983, 'MODELNAME': 'Bus: Rear engine, F..."
3,6,"[{'ID': 18, 'MODELNAME': '200', 'Make': None},..."
4,7,"[{'ID': 12, 'MODELNAME': '400', 'Make': None},..."
...,...,...
80,93,"[{'ID': 981, 'MODELNAME': 'Bus**: Conventional..."
81,94,"[{'ID': 981, 'MODELNAME': 'Bus**: Conventional..."
82,97,"[{'ID': 997, 'MODELNAME': 'Not Reported', 'Mak..."
83,98,"[{'ID': 701, 'MODELNAME': '0-50cc', 'Make': No..."


In [9]:
# Merge manufacturer_df & models_df
merged_df = pd.merge(manufacturer_df, models_df, on="MakeID", how="left")
merged_df = merged_df.sort_values(by='MakeID')

In [10]:
# Explode the Models column to separate rows
exploded_df = merged_df.explode('Models')
exploded_df

Unnamed: 0,MakeID,Name,Models
0,1,American Motors,"{'ID': 3, 'MODELNAME': 'Ambassador', 'Make': N..."
0,1,American Motors,"{'ID': 5, 'MODELNAME': 'AMX', 'Make': None}"
0,1,American Motors,"{'ID': 9, 'MODELNAME': 'Eagle', 'Make': None}"
0,1,American Motors,"{'ID': 10, 'MODELNAME': 'Eagle SX-4', 'Make': ..."
0,1,American Motors,"{'ID': 7, 'MODELNAME': 'Hornet/Concord', 'Make..."
...,...,...,...
84,99,Unknown Make,"{'ID': 499, 'MODELNAME': 'Unknown (light truck..."
84,99,Unknown Make,"{'ID': 598, 'MODELNAME': 'Unknown (LSG/NGV)', ..."
84,99,Unknown Make,"{'ID': 599, 'MODELNAME': 'Unknown (LSV/NGV)', ..."
84,99,Unknown Make,"{'ID': 709, 'MODELNAME': 'Unknown cc', 'Make':..."


In [11]:
# Extract ID and MODELNAME from the dictionaries in the Models column
exploded_df['ModelID'] = exploded_df['Models'].apply(lambda x: x['ID'] if isinstance(x, dict) else None)
exploded_df['ModelName'] = exploded_df['Models'].apply(lambda x: x['MODELNAME'] if isinstance(x, dict) else None)

In [12]:
# Drop the original Models column
df = exploded_df.drop(columns=['Models'])
df

Unnamed: 0,MakeID,Name,ModelID,ModelName
0,1,American Motors,3,Ambassador
0,1,American Motors,5,AMX
0,1,American Motors,9,Eagle
0,1,American Motors,10,Eagle SX-4
0,1,American Motors,7,Hornet/Concord
...,...,...,...,...
84,99,Unknown Make,499,Unknown (light truck)
84,99,Unknown Make,598,Unknown (LSG/NGV)
84,99,Unknown Make,599,Unknown (LSV/NGV)
84,99,Unknown Make,709,Unknown cc


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1738 entries, 0 to 84
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   MakeID     1738 non-null   int64 
 1   Name       1738 non-null   object
 2   ModelID    1738 non-null   int64 
 3   ModelName  1738 non-null   object
dtypes: int64(2), object(2)
memory usage: 67.9+ KB


## Bodytype fetching

In [25]:
# Every car needs a body type to query the api with
import os.path
import tqdm
base_url = "https://crashviewer.nhtsa.dot.gov/CrashAPI/definitions/GetVariableAttributesForbodyType"

bodytypes = []

# loop through every row in dataframe
for car in tqdm.tqdm(range(len(df))):
    if os.path.isfile("body-types.json"):
        break
    # for every car in dataframe     df.iloc[0]['A']
    params = f"?variable=bodytype&make={df.iloc[car]['MakeID']}&model={df.iloc[car]['ModelID']}&format=json"
    # get "BODY_ID" from responses and append to each row
    # Get response
    response = requests.get(base_url + params)

    # check if successful
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code}")
        print(f"Response content: {response.text}")
        raise Exception(f"API request failed with status code {response.status_code}")
    # Turn response into json
    data = response.json()

    # drill down
    results = data['Results'][0]

    # pull data from each bodytype per car
    # format is going to be a list of dictionaries, such that the bodytypes list will be like bodytypes[car][dictionary response]
    extracted = {entry['BODY_DEF'].split('(')[0].strip(): entry['BODY_ID'] for entry in data['Results'][0]}

    # append extracted to main list
    bodytypes.append(extracted)

    # sleep for polite scraping
    time.sleep(.5)

100%|██████████| 1738/1738 [22:43<00:00,  1.27it/s]


In [59]:
if not os.path.isfile("body-types.json"):
    with open("body-types.json", "w") as outfile:
        outfile.write(json.dumps(bodytypes))
else:
    with open('body-types.json', 'r') as openfile:
        bodytypes = json.load(openfile)

In [78]:
print(len(df), len(bodytypes))
unique = []
for dictionary in bodytypes:
    for value in dictionary.values():
        unique.append(int(value))
unique = sorted(set(unique))

1738 1738
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 28, 29, 30, 31, 32, 33, 34, 39, 40, 41, 42, 45, 48, 49, 50, 51, 52, 55, 58, 59, 60, 64, 65, 66, 67, 71, 72, 73, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [96]:
BodyDef = []
BodyId = []
for dictionary in bodytypes:
    for key, value in dictionary.items():
        BodyDef.append(key)
        BodyId.append(int(value))
        break

df['BodyType'] = BodyDef
df['BodyID'] = BodyId
df.head()

Unnamed: 0,MakeID,Name,ModelID,ModelName,BodyType,BodyID
0,1,American Motors,3,Ambassador,"2-door sedan,hardtop,coupe",2
0,1,American Motors,5,AMX,"2-door sedan,hardtop,coupe",2
0,1,American Motors,9,Eagle,Convertible,1
0,1,American Motors,10,Eagle SX-4,"2-door sedan,hardtop,coupe",2
0,1,American Motors,7,Hornet/Concord,Convertible,1


## Getting Crashes Per Year Per Car

In [119]:
# Need to add crash totals per model to above dataframe 
# this will be done by simply tallying responses for each car
# Since the api has a max return limit, querying by each year (2010-onwards) will ensure all data is gathered, and allow for year grouping

# Base URL for NHTSA API
base_url = "https://crashviewer.nhtsa.dot.gov/CrashAPI/crashes/GetCrashesByVehicle"


year_totals = {}
for year in range(2011, 2012): # MAKE 2011-2021!!!!!
    total = []
    for row in tqdm.tqdm(df[['MakeID','ModelID', 'BodyID']].itertuples(index=False, name=None)):
        fatalities = 0
        for state in range(1, 56):
            for model_year in range (2011, 2012): # MAKE 2011-2021!!!!!!
                params = f"?make={row[0]}&model={row[1]}&modelyear={model_year}\
                &bodyType={row[2]}&fromCaseYear={year}&toCaseYear={year}&state={state}&format=json"
                
                # get response(s)
                response = requests.get(base_url + params)
                # check for success/fail
                if response.status_code != 200:
                    print(f"Error: Received status code {response.status_code}")
                    print(f"Response content: {response.text}")
                    raise Exception(f"API request failed with status code {response.status_code}")

                data = response.json()
                # if success increment fatalities
                if data['Message'] == "Results returned successfully":
                    fatalities += 1
                
                # sleep for a few seconds
                time.sleep(.15)
                # end model_year loop
            # end state loop
        total.append(fatalities)
        # end car loop
    year_totals[year] = total
    # end year loop


42it [31:48, 45.45s/it]


KeyboardInterrupt: 

In [107]:
display(year_totals)
# {2011: [23, 34, 12, 55, 23, 4534]}

{}

## Transformation
Now that we have usable, workable data, we can begin cleaning and organizing.

In [None]:
# Transformation code

# Drop any unneeded columns/rows
    # duplicates
    # nulls
    # outliers

# Merge/Join Data into one dataframe



## Load
With curated data, can now be loaded into postgres

In [None]:
# import sql alchemy and stuff

# pull credentials 

# define user things

# setup and load

# query to test