In [198]:
import pandas as pd
import sqlite3 as sql
import numpy as np
import re
import string
import os

In [272]:
## EPA vehicle information; joined further below to the adjusted-weight test data.
epa_df = pd.read_xml("/Users/josheverts/Downloads/vehicles.xml")


## Load the 2017-2023 test-car data, stored as Excel workbooks by the EPA.
path = "/Users/josheverts/Documents/Epa_AdjWeights/Excel"
os.chdir(path)
## One DataFrame per workbook. os.listdir() returns names in arbitrary order and
## also lists hidden files and Excel lock files ("~$..."), which pd.read_excel
## cannot open, so keep only real workbooks and read them in sorted-name order.
weight_dfs = []
for file in sorted(os.listdir()):
    if file.startswith((".", "~$")) or not file.lower().endswith((".xlsx", ".xls")):
        continue
    weight_dfs.append(pd.read_excel(file))
## Workbooks this folder is expected to contain (kept for reference):
# test_data_2023 = pd.read_excel('23-testcar-2022-11-03.xlsx')
# test_data_2022 = pd.read_excel('22-testcar-2023-02-28.xlsx')
# test_data_2021 = pd.read_excel('21-tstcar-2022-04-15.xlsx')
# test_data_2020 = pd.read_excel('20tstcar-2021-03-02.xlsx')
# test_data_2019 = pd.read_excel('19tstcar-2020-10-02.xlsx')
# test_data_2018 = pd.read_excel('18tstcar-2018-10-24.xlsx')
# test_data_2017 = pd.read_excel('17tstcar-2018-05-30.xlsx')

In [None]:
## Disabled: the test_data_20XX names are only assigned by the commented-out
## pd.read_excel lines in the loading cell, so running this list raised NameError.
## `weight_dfs` is already built by the directory loop in that loading cell.
# weight_dfs = [test_data_2023, test_data_2022, test_data_2021,
#               test_data_2020, test_data_2019, test_data_2018, test_data_2017]

In [273]:
## Switch the working directory to the folder holding the EPA test-car .csv
## files (2000-2016). Later cells list, read and write files by bare name,
## so they all operate relative to this folder.
path = "/Users/josheverts/Documents/Epa_AdjWeights/csv"
os.chdir(path)

In [302]:
def clean_index_adj_weights(adj_weight_df, yr):
    """Average one file of EPA test-car rows down to one row per vehicle.

    Rows are grouped by represented make, represented model and manufacturer
    code, and every numeric column is averaged within each group.

    Parameters
    ----------
    adj_weight_df : pandas.DataFrame
        Test-car table in the 2010-or-later column layout. It must contain
        'Model Year', 'Represented Test Veh Make', 'Represented Test Veh Model',
        'Veh Mfr Code' and 'Equivalent Test Weight (lbs.)'.
    yr : int
        Two-digit model year of the file (e.g. 17 for 2017).

    Returns
    -------
    pandas.DataFrame or None
        Columns VehicleID, Year, MFRCode, Make, Model, AdjWeight with one row
        per group; VehicleID is the row position and Year is int32.
        None when ``yr < 10``.

    Raises
    ------
    KeyError
        If a required column is missing from ``adj_weight_df``.
    """
    if yr < 10:
        # Files before 2010 are skipped: they use a different column layout
        # (e.g. 'VI_MFR_NM' for the manufacturer name) that is not handled yet.
        # TODO: add a cleaning branch for the pre-2010 layout.
        return None

    group_cols = ['Represented Test Veh Make',
                  'Represented Test Veh Model',
                  'Veh Mfr Code']
    rename_dict = {'Model Year': 'Year', 'Represented Test Veh Make': 'Make', 'Veh Mfr Code': 'MFRCode',
                   'Represented Test Veh Model': 'Model', 'Equivalent Test Weight (lbs.)': 'AdjWeight'}

    grouped = adj_weight_df.groupby(group_cols, group_keys=False).mean(numeric_only=True)
    # The first reset_index turns the group keys back into columns; the second
    # exposes the resulting 0..n-1 row position as an 'index' column, which
    # becomes the per-file VehicleID.
    grouped = grouped.reset_index().reset_index().rename(columns={'index': 'VehicleID'})

    epa_adj_weights = grouped.rename(rename_dict, axis=1)
    epa_adj_weights = epa_adj_weights[['VehicleID', 'Year', 'MFRCode', 'Make', 'Model', 'AdjWeight']]
    # astype returns a new frame; the original discarded it, so the averaged
    # (float) Year was never actually cast.
    return epa_adj_weights.astype({'Year': 'int32'})

def clean_index_vehicle_data(vehicle_df):
    """Rename, recode and trim the EPA vehicle table for the weight join.

    Parameters
    ----------
    vehicle_df : pandas.DataFrame
        Raw vehicle table with the original column names ('id', 'make',
        'model', 'mfrCode', 'range', 'rangeHwy', 'year', 'atvType', ...).
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the renamed columns, `atvType` recoded to a
        powertrain class, and only the columns used downstream.

    Notes
    -----
    atvType recoding: missing or 'Diesel' -> 'ICE', 'Hybrid' -> 'HEV',
    'Plug-in Hybrid' -> 'PHEV', 'EV' -> 'EV', 'FFV' -> 'FCV'; any other value
    becomes None.
    """
    rename_dict = {'id': 'VehicleID', 'make': 'Make', 'model': 'Model', 'mfrCode': 'MFRCode', 'range': 'Range',
                   'rangeHwy': 'RangeHwy', 'year': 'Year'}
    # Lookup for the non-missing atvType values; missing values are handled
    # separately below because NaN cannot be used reliably as a dict key.
    # NOTE(review): 'FFV' is mapped to 'FCV' exactly as in the original code.
    # If FFV stands for flex-fuel and FCV for fuel-cell, these are different
    # things - confirm the mapping is intended.
    powertrain_codes = {'Diesel': 'ICE', 'Hybrid': 'HEV', 'Plug-in Hybrid': 'PHEV',
                        'EV': 'EV', 'FFV': 'FCV'}
    keep_cols = ['VehicleID', 'Year', 'MFRCode', 'Make', 'Model',
                 'Range', 'RangeHwy', 'rangeHwyA', 'atvType',
                 'UHighway', 'UCity', 'city08U', 'highway08U', 'combE',
                 'combinedUF', 'comb08', 'trany', 'cylinders', 'displ', 'baseModel']

    # rename returns a new frame, so the column assignment below does not
    # touch the caller's DataFrame.
    renamed = vehicle_df.rename(rename_dict, axis=1)
    renamed['atvType'] = ['ICE' if pd.isna(val) else powertrain_codes.get(val)
                          for val in renamed['atvType']]
    return renamed[keep_cols]

def concat_weight_data(weight_dfs, yrs):
    """Clean each per-year test-car table and stack the results into one frame.

    Parameters
    ----------
    weight_dfs : iterable of pandas.DataFrame
        Raw test-car tables, one per file.
    yrs : iterable of int
        Two-digit model years paired with ``weight_dfs`` by position (zip
        stops at the shorter of the two).

    Returns
    -------
    pandas.DataFrame
        Columns VehicleID, Year, MFRCode, Make, Model, AdjWeight with a fresh
        0..n-1 index, an integer Year, and VehicleID renumbered 0..n-1 across
        all files. An empty frame with those columns is returned when no file
        produced any cleaned data.
    """
    # clean_index_adj_weights returns None for files it skips; drop those up
    # front and concatenate once. Repeatedly concatenating onto an empty
    # DataFrame is quadratic and relies on pandas ignoring the empty frame
    # when it works out the result dtypes, which is deprecated behaviour.
    cleaned_frames = []
    for df, yr in zip(weight_dfs, yrs):
        cleaned = clean_index_adj_weights(df, yr)
        if cleaned is not None:
            cleaned_frames.append(cleaned)

    if not cleaned_frames:
        return pd.DataFrame(columns=['VehicleID', 'Year', 'MFRCode', 'Make', 'Model', 'AdjWeight'])

    combined = pd.concat(cleaned_frames, ignore_index=True)
    combined['Year'] = combined['Year'].astype('int')
    combined['VehicleID'] = np.arange(0, len(combined)) ## assign new unique ids
    return combined
        
    
def vehicle_data_join(vehicle_df, combined_weights):
    """Attach the mean adjusted test weight to each EPA vehicle.

    A vehicle matches a weight row when Year is equal, Make is equal ignoring
    case, and the vehicle's baseModel equals (ignoring case) the first
    space-separated word of the weight row's Model. All numeric measures are
    then averaged per vehicle.

    Parameters
    ----------
    vehicle_df : pandas.DataFrame
        Cleaned vehicle table; needs VehicleID, Year, Make, Model, baseModel,
        trany, displ, cylinders, atvType and the ten numeric measure columns
        listed in ``metric_cols`` (all except AdjWeight). Not modified.
        NOTE(review): VehicleID is assumed unique per row - confirm.
    combined_weights : pandas.DataFrame
        Weight table with at least Year, Make, Model and AdjWeight. Not modified.

    Returns
    -------
    pandas.DataFrame
        One row per matched vehicle, de-duplicated on (Year, Make, Model,
        trany, displ, cylinders) keeping the first occurrence. Make and Model
        keep their original capitalisation. Columns are, in order: Model, Make,
        Year, atvType, cylinders, displ, trany, VehicleID, AdjWeight, comb08,
        combinedUF, combE, highway08U, rangeHwyA, RangeHwy, Range, city08U,
        UCity, UHighway.
    """
    metric_cols = ['UHighway', 'UCity', 'city08U', 'Range', 'RangeHwy', 'rangeHwyA',
                   'highway08U', 'combE', 'combinedUF', 'comb08', 'AdjWeight']
    label_cols = ['VehicleID', 'trany', 'displ', 'cylinders', 'atvType', 'Year', 'Make', 'Model']
    dedupe_cols = ['Year', 'Make', 'Model', 'trany', 'displ', 'cylinders']
    output_cols = ['Model', 'Make', 'Year', 'atvType', 'cylinders', 'displ', 'trany',
                   'VehicleID'] + metric_cols[::-1]

    # Snapshot the descriptive columns before lower-casing, so the output
    # carries the original capitalisation of Make / Model.
    labels = vehicle_df[label_cols].copy()

    vehicles = vehicle_df.copy()
    vehicles['Make'] = vehicles['Make'].str.lower()
    vehicles['baseModel'] = vehicles['baseModel'].str.lower()

    # Take only the join keys and the weight from the weight table, so that no
    # column collides with the vehicle table and no '_x'/'_y' merge suffixes
    # have to be relied on.
    weights = combined_weights[['Year', 'Make', 'AdjWeight']].copy()
    weights['Make'] = weights['Make'].str.lower()
    # The .str accessor propagates missing models instead of raising as
    # str.split on a NaN would.
    weights['baseModel'] = combined_weights['Model'].str.lower().str.split(' ').str[0]
    # merge pairs null keys with each other; drop weight rows without a model
    # so they cannot match vehicles whose baseModel is missing.
    weights = weights.dropna(subset=['baseModel'])

    matched = pd.merge(vehicles, weights, how='inner', on=['Year', 'Make', 'baseModel'])

    # Built-in groupby mean; passing np.mean to .agg is flagged as deprecated
    # by newer pandas.
    per_vehicle = matched.groupby('VehicleID')[metric_cols].mean().reset_index()

    # A single merge restores every descriptive column. The original merged
    # the same labels twice and then dropped the duplicated Make/Model pair.
    joined = pd.merge(per_vehicle, labels, how='inner', on='VehicleID')
    joined = joined.drop_duplicates(dedupe_cols)

    return joined[output_cols].reset_index(drop=True)


In [303]:
## concatenate the .csv files (2010 onwards) into one df
yrs = []
csv_dfs = []
for file in sorted(os.listdir()):
    # Source files are named like "09tstcar.csv": the two leading digits are
    # the model year. Skip every other entry - in particular the joined output
    # .csv that a later cell writes into this same folder, which would
    # otherwise make int(file[:2]) raise ValueError on a re-run.
    if file.endswith(".csv") and file[:2].isdecimal():
        yrs.append(int(file[:2]))
        csv_dfs.append(pd.read_csv(file))
combined_csv1016 = concat_weight_data(csv_dfs, yrs)

In [304]:
## clean the raw EPA vehicle table (rename columns, recode atvType, keep only
## the columns needed for the join); the raw `epa_df` itself is left untouched
epa_df_cleaned = clean_index_vehicle_data(epa_df)

In [305]:
joined_data_10_16 = vehicle_data_join(epa_df_cleaned, combined_csv1016)

In [307]:
## concatenate the Excel workbooks (2017 onwards) into one df.
## NOTE: concat_weight_data pairs the frames with the year codes 17..23 by
## position using zip, so any surplus frames or codes are silently ignored.
combined_df1723 = concat_weight_data(weight_dfs, np.arange(17,24))

In [308]:
joined_data_17_23 = vehicle_data_join(epa_df_cleaned, combined_df1723)

In [310]:
## Stack the two joined tables into a single 2010-2023 table with a fresh
## 0..n-1 row index, then overwrite VehicleID with new sequential ids.
joined_data_10_23 = pd.concat([joined_data_10_16, joined_data_17_23], ignore_index=True)
joined_data_10_23 = joined_data_10_23.assign(VehicleID=np.arange(len(joined_data_10_23)))

In [313]:
## Write the combined table without the row index. The path is relative, so
## the file lands in the current working directory (the csv input folder set
## by the earlier os.chdir call).
joined_data_10_23.to_csv("adj_weight_data_join_2010-2023.csv", index = False)

In [232]:
## Inspect the pre-2010 column layout: print one sample row (position 50) of
## the 2009 file as "column: value" lines.
row = pd.read_csv("09tstcar.csv").iloc[50]
# The row's index holds the file's column labels, so the file does not need
# to be read a second time just to get them.
cols = row.index
for val, name in zip(row, cols):
    print(str(name) + ": " + str(val))


MDLYR_DT: 2009
VI_MFR_CD: 20
VI_MFR_NM: Chrysler LLC
GBE_INDX_NUM: 223
OV_ID: L8RTK4922
VC_CNFG_NUM: 0
CL_NM: CARAVAN FWD
CLS_TYP_CD: T
GBE_CID_MSR: 241
GBE_PLC_IND_CD: N
VC_RTD_HP_MSR: 253
ECS_CD: nan
ECS_CD_0: nan
ECS_CD_1: nan
ECS_CD_2: nan
ECS_CD_3: nan
EVCS_CD: 102.0
TRNS: L6
DRV_SYS_CD: F
TOD_CD: 2
VC_DSN_ETW_MSR: 5000
VC_CMPRSN_RAT_MSR: 10.3
VC_AXLE_RAT_MSR: 3.25
VC_NV_RAT_MSR: 27.4
TPF_ACHP_IND_CD: Y
TPF_DYNO_HP_MSR: nan
SIL_CD: 1
TST_PRC_CD: 21
TST_PRPS_CD: 31
TST_NUM_ID: 1077668
VEH_FL_TYP_CD: 61
CH_CD: C
AVRG_CD: nan
GT_WT_MSR: nan
CMYT_HC_FE_MSR: 0.045
CMYT_CO_FE_MSR: 0.89
CMYT_CO2_FE_MSR: 414.0
CMYT_NOX_MSR: 0.01
CMYT_PM_MSR: nan
GT_RND_ADJ_QTY: 21.4
TPF_DYN_TRK_A_MSR: 39.0
TPF_DYN_TRK_B_MSR: 0.5805
TPF_DYN_TRK_C_MSR: 0.02248
TPF_EDYN_TRK_A_MSR: 21.25
TPF_EDYN_TRK_B_MSR: 0.17038
TPF_EDYN_TRK_C_MSR: 0.0238
ENG_CD: FA-600
EF_ID: 9CRXJ04.0TN0
VC_CYL_CNT: 6
TPF_MFR_CSTDN_MSR: nan


In [186]:
## NOTE(review): this rebinds `epa_df` to its cleaned version, so the cell is
## not idempotent - running it again on the already-cleaned frame would recode
## atvType a second time (e.g. 'ICE' is not a raw atvType value and becomes None).
epa_df = clean_index_vehicle_data(epa_df)

In [187]:
## Join the cleaned EPA vehicles to the combined 2017-2023 weights.
## vehicle_data_join needs one combined weights DataFrame; passing the raw
## list of per-year frames (`weight_dfs`) fails as soon as it indexes ['Make'].
joined_data = vehicle_data_join(epa_df, combined_df1723)

In [189]:
## Write the 2017-2023 join without the row index; the relative path resolves
## against the current working directory.
joined_data.to_csv("adj_weight_data_join_2017-2023.csv", index = False)