# Grand Circus Final Project
### Car Crash and Safety Data Comparisons/Evaluations

This project aims to compare safety ratings from crash tests to actual data of fatal crashes. The use of fatal crash data is better suited for hard crashes where occupant life is and was in danger, providing more relevant data entries compared to fender benders or other minimal 'traffic incidents'. This analysis could be useful for car buyers, car manufacturers, government testers, and insurance companies.

## Extraction
To start the ETL process, data must be extracted and placed into usable structures. To do this, we will be importing the data from the API(s) and any other flat file sources.

In [None]:
import pandas as pd
import requests
import json
import tqdm

# Begin pulling make names and IDs for internal use.
# Definitions endpoint query: the attribute list for the 'make' variable, case year 2021.
make_url = "https://crashviewer.nhtsa.dot.gov/CrashAPI/definitions/GetVariableAttributes?variable=make&caseYear=2021&format=json"

# Get response
response = requests.get(make_url)

# Stop here on a failed request instead of trying to decode an error body as JSON
if response.status_code != 200:
    print(f"Error: Received status code {response.status_code}")
    print(f"Response content: {response.text}")
    raise Exception(f"API request failed with status code {response.status_code}")

# Turn response into json
data = response.json()

In [None]:
# The list of make dictionaries is the first element of the payload's 'Results' entry
results = data["Results"][0]

In [None]:
# Pull the numeric id and the display text out of every make entry
id_list = [int(entry['ID']) for entry in results]
name_list = [entry['TEXT'] for entry in results]

# Column-oriented mapping used to build the dataframe
data = {'MakeID': id_list, 'Name': name_list}

# Build the dataframe and order it by make id instead of by name
manufacturer_df = pd.DataFrame(data).sort_values(by=['MakeID'])
manufacturer_df.head()

## Only Taking Top 11 Best-Selling Makes
Since the API contains data for all vehicles involved in crashes, such as the American Motors Ambassador made from 1952-1974, a fair portion of vehicles are not statistically relevant, or would be outweighed by more common vehicles. To prevent a weighting issue where more prevalent vehicles skew the results into suggesting that their crashes are more common, we will be using some of the most popular makes only.

In [None]:
# Makes to retain; isin() matches these strings exactly against the 'Name' column.
to_keep = ['Nissan/Datsun', 'Toyota', 'KIA', 'Honda', 'Subaru', 'Ford', 'Chevrolet', 'Hyundai', 'Jeep / Kaiser-Jeep / Willys- Jeep', 'GMC', 'Dodge']

# Keep only the selected makes. rename() without inplace returns a new frame, so
# the column is renamed without mutating a filtered slice of the original frame
# in place (which can trigger pandas' SettingWithCopyWarning).
new_df = manufacturer_df[manufacturer_df['Name'].isin(to_keep)].rename(columns={'Name': 'MakeName'})
manufacturer_df = new_df

# Show every retained make rather than cutting the preview off at ten rows
manufacturer_df.head(len(to_keep))

### Fetching Model IDs

In [None]:
import time

# Collect the model definitions for every retained make, one request per make
all_models = []
for make_ID in manufacturer_df['MakeID']:
    model_url = f'https://crashviewer.nhtsa.dot.gov/CrashAPI/definitions/GetVariableAttributesForModel?variable=model&caseYear=2021&make={make_ID}&format=json'
    response = requests.get(model_url)

    # Stop on a failed request instead of trying to decode an error body as JSON
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code}")
        print(f"Response content: {response.text}")
        raise Exception(f"API request failed with status code {response.status_code}")

    model_data = response.json()

    # 'Results' may be missing or null; fall back to an empty list so the loop
    # below is skipped instead of raising TypeError on None
    results_model = model_data.get('Results') or []

    # Pause between requests to avoid hammering the API
    time.sleep(1)
    for model in results_model:
        all_models.append({
            'MakeID': make_ID,
            # full-slice copy of the entry; presumably a list of model dictionaries — TODO confirm
            'Models': model[0:]
        })
# Drill down into JSON (empty list when nothing was collected)
drill_down = all_models[0]['Models'] if all_models else []
drill_down

In [None]:
# Turn the collected model records into a dataframe ordered by make id
models_df = pd.DataFrame(all_models)
models_df = models_df.sort_values(by='MakeID')

In [None]:
# Left-join the model records onto the manufacturers by MakeID, then order by make id
merged_df = manufacturer_df.merge(models_df, on="MakeID", how="left").sort_values(by='MakeID')

In [None]:
# Give every element of each Models list its own row, then renumber the rows from 0
exploded_df = merged_df.explode('Models').reset_index(drop=True)
exploded_df

In [None]:
# Copy the model id and model name out of each dictionary in the Models column.
def _model_field(record, key):
    """Return record[key] when record is a dict, otherwise None."""
    if isinstance(record, dict):
        return record[key]
    return None


exploded_df['ModelID'] = exploded_df['Models'].apply(lambda record: _model_field(record, 'ID'))
exploded_df['ModelName'] = exploded_df['Models'].apply(lambda record: _model_field(record, 'MODELNAME'))

In [None]:
# The nested Models column is no longer needed once its fields are extracted
df = exploded_df.drop('Models', axis=1)
df

In [None]:
# Print a summary of df (columns, non-null counts, dtypes) as a sanity check
df.info()

## Bodytype fetching

In [None]:
# Every car needs a body type to query the api with
import os.path
import tqdm
base_url = "https://crashviewer.nhtsa.dot.gov/CrashAPI/definitions/GetVariableAttributesForbodyType"

# One dictionary per dataframe row, in row order: {body definition: BODY_ID}
bodytypes = []

# Whether the cache file exists does not change from row to row, so check it
# once up front: when body-types.json is already on disk no requests are made
# and bodytypes stays empty.
if not os.path.isfile("body-types.json"):
    # Walk the make/model id pairs row by row
    for make_id, model_id in tqdm.tqdm(zip(df['MakeID'], df['ModelID']), total=len(df)):
        params = f"?variable=bodytype&make={make_id}&model={model_id}&format=json"
        # Get response
        response = requests.get(base_url + params)

        # check if successful
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            print(f"Response content: {response.text}")
            raise Exception(f"API request failed with status code {response.status_code}")
        # Turn response into json
        data = response.json()

        # drill down to the list of body-type entries
        results = data['Results'][0]

        # Map each body definition (text before any '(' and stripped) to its BODY_ID
        extracted = {entry['BODY_DEF'].split('(')[0].strip(): entry['BODY_ID'] for entry in results}

        # append extracted to main list
        bodytypes.append(extracted)

        # sleep for polite scraping
        time.sleep(.5)

In [None]:
# Reuse the saved body types when the cache file exists; otherwise persist the
# list that is currently in memory.
if os.path.isfile("body-types.json"):
    with open('body-types.json', 'r') as openfile:
        bodytypes = json.load(openfile)
else:
    with open("body-types.json", "w") as outfile:
        outfile.write(json.dumps(bodytypes))

In [None]:
# Take the first (definition, id) pair of each car's body-type mapping;
# an empty mapping contributes nothing to either list.
first_pairs = [next(iter(mapping.items())) for mapping in bodytypes if mapping]
BodyDef = [body_def for body_def, _ in first_pairs]
BodyId = [int(body_id) for _, body_id in first_pairs]

# Attach the chosen body type to every row
df['BodyID'] = BodyId
df['BodyType'] = BodyDef

df.info()

## Getting Crashes Per Year Per Car

In [None]:
# Need to add crash totals per model to above dataframe
# this will be done by simply tallying responses for each car
# Since the api has a max return limit, querying by each year (2010-onwards) will ensure all data is gathered, and allow for year grouping

# Base URL for NHTSA API
base_url = "https://crashviewer.nhtsa.dot.gov/CrashAPI/FARSData/GetFARSData"


def get_fatal_crashes(year, state):
    """Fetch the FARS 'Vehicle' dataset for a single year and state.

    Args:
        year: Case year, sent as both FromYear and ToYear.
        state: State code, sent as the ``state`` query parameter.

    Returns:
        The payload's "Results" entry (an empty list if that key is missing),
        or an empty list when the payload does not report success.

    Raises:
        Exception: If the HTTP status code is not 200.
    """
    params = f"?dataset=Vehicle&FromYear={year}&ToYear={year}&state={state}&format=json"
    response = requests.get(base_url + params)

    # Check for issues
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code}")
        print(f"Response content: {response.text}")
        raise Exception(f"API request failed with status code {response.status_code}")

    data = response.json()
    # The status message is a key of the decoded JSON payload; the Response
    # object itself cannot be subscripted.
    if data.get("Message") == "Results returned successfully":
        return data.get("Results", [])

    # Report the failure but return an empty list so callers can still iterate
    print(f"Error at api call for {year} and {state}!")
    return []


import time

# Years to query. range() excludes its stop value, so the stop must be one past
# the last year wanted; 2021 is used as the last year to match the caseYear of
# the definitions queries — NOTE(review): adjust if another span is intended.
years = range(2010, 2022)
# State codes to query — presumably FIPS codes 1..56; TODO confirm
states = range(1, 57)

# Adding year columns to DataFrame for fatal crashes
for year in years:
    df[str(year)] = 0

for year in tqdm.tqdm(years):
    year_col = str(year)
    for state in states:
        # Treat a missing result as "no records" so the loops below still run
        crash_data = get_fatal_crashes(year, state) or []
        # Pause between requests; sleep lives in the time module, not in os
        time.sleep(10)

        # Iterate over each vehicle in the crash data
        for vehicle_list in crash_data:
            for vehicle in vehicle_list:  # vehicle_list contains crash details for a particular vehicle
                make = vehicle['MAKENAME']  # We are using 'MAKENAME' from the response
                model = vehicle['MODELNAME']  # We are using 'MODELNAME' from the response
                deaths = int(vehicle['DEATHS'])  # Convert deaths to an integer

                # Rows of the dataframe that match both the make and the model
                matches = (df['MakeName'] == make) & (df['ModelName'] == model)

                # If the vehicle is found, update the deaths for that year
                if matches.any():
                    df.loc[matches, year_col] += deaths

        # Periodically save the dataframe after processing each state
        df.to_csv("fatal_crashes.csv", mode='w', header=True, index=False)

# Check the updated DataFrame
print(df)

In [None]:
# NOTE(review): `year_totals` is not assigned anywhere in the visible notebook, so this
# cell raises NameError unless it is defined elsewhere — TODO confirm where it comes from.
len(year_totals[2011])
# Expected shape of year_totals, e.g. {2011: [23, 34, 12, 55, 23, 4534]}
# Build a dataframe with one column per key of year_totals
year_df = pd.DataFrame.from_dict(year_totals)
year_df.head()

In [None]:
# Print the summaries of both frames, one after the other
year_df.info()
df.info()

In [None]:
# Preview the first rows of df
df.head()

## Transformation
Now that we have usable, workable data, we can begin cleaning and organizing.

In [None]:
# Transformation code

# Drop any unneeded columns/rows
    # duplicates
    # nulls
    # outliers

# Merge/Join Data into one dataframe



## Load
With the data curated, it can now be loaded into Postgres.

In [None]:
# import sql alchemy and stuff
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Database credentials are read from a local JSON file with 'user' and 'pass' keys
with open('credentials.json', 'r') as openfile:
    credentials = json.load(openfile)


TABLE_NAME = 'car_data'

DB_NAME = "safecars"
DB_USER = credentials['user']
DB_PASS = credentials['pass']
DB_HOST = "localhost"
DB_PORT = "5432"

# create engine with defined macros
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' cannot break URL parsing; the port is given explicitly.
engine = create_engine(
    f"postgresql+psycopg2://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)
# send the df over
# (the upload is currently disabled; uncomment all three lines together to run it)
#df.to_sql(name=TABLE_NAME,
#          con=engine,
#          index=False)


In [None]:
# Read-back check: query text for every row of the car_data table.
# The two statements that run the query are currently commented out.
sql = "SELECT * FROM car_data" # simple query for all rows
#sql_df = pd.read_sql(sql, engine) # make a df from postgres
#sql_df.head()