In [0]:
from pyspark.sql import functions as F
import os, json, warnings, glob
import pandas as pd
import numpy as np
from scipy.stats import norm
from pyspark.sql import SparkSession
from copy import deepcopy
from time import perf_counter
from joblib import Parallel, delayed

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import tempfile
from itertools import product
from tqdm.notebook import tqdm
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import calendar
from math import radians, sin, cos, sqrt, atan2, log
tqdm.pandas()
################################################################################################
# Configurations
################################################################################################
# Spark entry point used by the rest of the notebook; getOrCreate() reuses a live session if any.
session = SparkSession.builder.appName('df').getOrCreate()

# Notebook display settings: show every column, cap printed rows at 25, silence warnings.
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Batches of state codes; `state_names` holds the matching '_'-joined label of each batch.
# NOTE(review): 'DL' is not a USPS state code (Delaware is 'DE') — confirm it matches the source data.
states = [
    ['NJ', 'NY', 'PA'],
    ['CT', 'IL', 'TX'],
    ['IN', 'MI', 'TN'],
    ['AK', 'AL', 'AR', 'AZ', 'CA'],
    ['CO', 'DE', 'DL', 'FL', 'GA', 'HI'],
    ['IA', 'ID', 'KS', 'KY', 'LA', 'MA', 'MD'],
    ['ME', 'MN', 'MO', 'MS', 'MT', 'NE', 'NV', 'NH', 'NM', 'NC', 'ND'],
    ['OH', 'OK', 'OR', 'RI', 'SC'],
    ['SD', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'],
]
state_names = ['_'.join(batch) for batch in states]

# Reporting window: `end` is the last month covered, `start` is the first day of the
# month two years earlier.
end = '2024-11-01'
end_date = datetime.strptime(end, '%Y-%m-%d')
start_date = (end_date - relativedelta(years=2)).replace(day=1)
start = start_date.strftime('%Y-%m-%d')

# First day of the month two months before `end`, i.e. the start of a three-month
# window that includes the end month.
past_3_months_date = (end_date - relativedelta(months=2)).replace(day=1)
past_3_months = past_3_months_date.strftime('%Y-%m-%d')

# Brand spellings to normalise.
brand_rename_dict = {'MERCEDES-BENZ': 'MERCEDES BENZ'}

# Dealer addresses to exclude, keyed 0-8 (presumably the index of the batch in `states` —
# verify against the consumer of this dict). Batches without exclusions keep an empty list.
bad_dealers = {batch_index: [] for batch_index in range(len(states))}
bad_dealers[3] = ['8200 BALDWIN  ST, OAKLAND, ALAMEDA, CA']
bad_dealers[4] = ['6381 AIRPORT N RD, NAPLES, COLLIER, FL']
bad_dealers[5] = ['7245 U S HIGHWAY 61 N, SAINT FRANCISVILLE, WEST FELICIANA, LA']
bad_dealers[8] = [
    '176 S MAIN  ST, RUTLAND, RUTLAND, VT',
    '3475 JEFFERSON DAVIS  HWY, FREDERICKSBURG, SPOTSYLVANIA, VA',
    '801 LIME KILN  RD, GREEN BAY, BROWN, WI',
]
################################################################################################
# Transactions Calculations Functions
################################################################################################

# Fill missing 'NEW_SALES_FINAL' / 'USED_SALES_FINAL_ADJUSTED' values with the
# 'NEW_SALES' / 'USED_SALES' of the same row.
def reassign_fillna(row):
    """Back-fill the adjusted sales columns of one row from its raw sales columns.

    Args:
        row: Mapping-like record (e.g. a pandas Series) holding 'NEW_SALES',
            'USED_SALES', 'NEW_SALES_FINAL' and 'USED_SALES_FINAL_ADJUSTED'.

    Returns:
        The same ``row`` object, modified in place: each adjusted column that was
        missing (per ``pd.isna``) now holds the corresponding raw value.
    """
    # (adjusted column, raw column used as the fallback)
    fallbacks = (
        ('NEW_SALES_FINAL', 'NEW_SALES'),
        ('USED_SALES_FINAL_ADJUSTED', 'USED_SALES'),
    )
    for adjusted_col, raw_col in fallbacks:
        if pd.isna(row[adjusted_col]):
            row[adjusted_col] = row[raw_col]
    return row

# Sanity-check 'NEW_SALES_FINAL' and 'USED_SALES_FINAL_ADJUSTED' against the raw sales
# and fall back to the raw value when the adjusted one looks inflated.
def check_sales(row):
    """Reset implausibly large adjusted sales of one row to the raw sales.

    An adjusted value is reset (and 'flag_n' / 'flag_u' set to 1) when it is both more
    than twice the raw value and greater than 1000. For rows whose 'YEAR' equals
    ``str(end_date.year)`` the comparison is made on annualized figures
    (value * 12 / end_date.month, truncated to int), which are also stored on the row
    in the four '*_ANNUALIZED' columns. Other rows are compared on the stored values.

    Reads the module-level ``end_date``.
    NOTE(review): 'YEAR' is compared with a string, so rows carrying an integer year
    never take the annualized path — confirm the caller passes string years.

    Args:
        row: Mapping-like record (e.g. a pandas Series) with 'YEAR', 'NEW_SALES',
            'USED_SALES', 'NEW_SALES_FINAL' and 'USED_SALES_FINAL_ADJUSTED'.

    Returns:
        The same ``row`` object, modified in place.
    """
    is_partial_year = row['YEAR'] == str(end_date.year)

    if is_partial_year:
        months_elapsed = end_date.month
        # All four annualized columns are written before any comparison is made.
        annualized_columns = (
            ('NEW_SALES', 'NEW_SALES_ANNUALIZED'),
            ('USED_SALES', 'USED_SALES_ANNUALIZED'),
            ('NEW_SALES_FINAL', 'NEW_SALES_FINAL_ANNUALIZED'),
            ('USED_SALES_FINAL_ADJUSTED', 'USED_SALES_FINAL_ADJUSTED_ANNUALIZED'),
        )
        for source_col, annualized_col in annualized_columns:
            row[annualized_col] = int(row[source_col] * 12 / months_elapsed)
        # (adjusted column, raw column, flag, column compared as adjusted, column compared as raw)
        checks = (
            ('NEW_SALES_FINAL', 'NEW_SALES', 'flag_n',
             'NEW_SALES_FINAL_ANNUALIZED', 'NEW_SALES_ANNUALIZED'),
            ('USED_SALES_FINAL_ADJUSTED', 'USED_SALES', 'flag_u',
             'USED_SALES_FINAL_ADJUSTED_ANNUALIZED', 'USED_SALES_ANNUALIZED'),
        )
    else:
        checks = (
            ('NEW_SALES_FINAL', 'NEW_SALES', 'flag_n',
             'NEW_SALES_FINAL', 'NEW_SALES'),
            ('USED_SALES_FINAL_ADJUSTED', 'USED_SALES', 'flag_u',
             'USED_SALES_FINAL_ADJUSTED', 'USED_SALES'),
        )

    for adjusted_col, raw_col, flag_col, compare_adjusted, compare_raw in checks:
        candidate = row[compare_adjusted]
        baseline = row[compare_raw]
        if (candidate > 2 * baseline) & (candidate > 1000):
            row[adjusted_col] = row[raw_col]
            row[flag_col] = 1

    return row

# Annual Transactions: Single brand dealers; TTM Transactions: All dealers
_DEALER_KEYS = ['DEALER_NAME', 'DEALER_ADDRESS_FULL']


def _append_rollups(sales, period_keys):
    """Append the fuel 'All Types', segment 'All' and registration 'All' rollup rows.

    Each rollup is summed over the frame *including* the rollups appended before it, so
    the result holds every combination of detailed and aggregated labels.
    """
    rollups = (
        ('FUEL', 'All Types', _DEALER_KEYS + ['SEGMENT'] + period_keys + ['REG_TYPE', 'REGISTRATION_TYPE']),
        ('SEGMENT', 'All', _DEALER_KEYS + ['FUEL'] + period_keys + ['REG_TYPE', 'REGISTRATION_TYPE']),
        ('REGISTRATION_TYPE', 'All', _DEALER_KEYS + period_keys + ['SEGMENT', 'FUEL', 'REG_TYPE']),
    )
    for column, label, keys in rollups:
        totals = sales.groupby(keys).agg({'SALES': 'sum'}).reset_index()
        totals[column] = label
        sales = pd.concat([sales, totals], ignore_index=True)
    return sales


def _reassign_sales(detail, dealer_totals, period_keys, sort_keys):
    """Rescale detailed sales so they add up to the dealer's reassigned yearly totals.

    Every row's SALES becomes ``SALES / raw_total * final_total`` using the dealer's
    totals for the row's YEAR ('NEW_SALES' -> 'NEW_SALES_FINAL' for New rows,
    'USED_SALES' -> 'USED_SALES_FINAL_ADJUSTED' for Used rows). Dealer/periods with no
    Used rows at all get one zero-sales Used row per existing row, so later New/Used
    comparisons always find a Used counterpart.
    """
    group_keys = _DEALER_KEYS + period_keys
    columns = list(detail.columns)

    # Zero-sales 'Used' placeholders for dealer/periods that registered no used sales.
    has_used = (
        detail['REG_TYPE'].eq('Used')
        .groupby([detail[key] for key in group_keys])
        .transform('any')
        .astype(bool)
    )
    placeholders = detail[~has_used.to_numpy()].assign(REG_TYPE='Used', SALES=np.nan)
    padded = pd.concat([detail, placeholders], axis=0).drop_duplicates(
        group_keys + ['SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE']
    )

    # One totals row per dealer/year; when several exist the first one wins.
    lookup = dealer_totals.drop_duplicates(_DEALER_KEYS + ['YEAR'])
    merged = padded.merge(lookup, on=_DEALER_KEYS + ['YEAR'], how='left', indicator='_matched')
    matched = merged['_matched'] == 'both'

    sales = merged['SALES']
    # Dealers without a totals row keep their sales unchanged (both factors are 1).
    new_sales = sales / merged['NEW_SALES'].where(matched, 1) * merged['NEW_SALES_FINAL'].where(matched, 1)
    # Only the used-sales denominator is guarded against 0 / NaN; a zero new-sales
    # denominator yields inf, which is zeroed below.
    usable_used_total = matched & merged['USED_SALES'].notna() & merged['USED_SALES'].ne(0)
    used_sales = (
        sales / merged['USED_SALES'].where(usable_used_total, 1)
        * merged['USED_SALES_FINAL_ADJUSTED'].where(matched, 1)
    )

    is_new = merged['REG_TYPE'] == 'New'
    is_real_used = (merged['REG_TYPE'] == 'Used') & sales.notna()
    # Placeholder rows (and any other REG_TYPE) stay NaN here and end up as 0.
    reassigned = new_sales.where(is_new, used_sales.where(is_real_used))
    merged['SALES'] = reassigned.replace([np.inf, -np.inf], np.nan).fillna(0)

    return merged[columns].sort_values(sort_keys).reset_index(drop=True)


def _build_transaction_records(sales, period_keys):
    """Flatten a rolled-up sales frame into the list of per-dealer transaction records.

    ``period_keys`` lists the period columns in output order (['YEAR'] or
    ['MONTH', 'YEAR']). Records are emitted in four passes: retail New/Used,
    'Commercial', 'Wholesale' and 'New/Used Ratio'.
    """
    period_labels = {'YEAR': 'Year', 'MONTH': 'Month'}
    cell_keys = period_keys + ['FUEL', 'SEGMENT']
    records = []

    def add(dealer, segment, fuel, name, period_values, value):
        record = {
            'DEALER_NAME': dealer[0],
            'DEALER_ADDRESS_FULL': dealer[1],
            'Vehicle Type': segment,
            'Fuel Type': fuel,
            'Name': name,
        }
        for key, period_value in zip(period_keys, period_values):
            record[period_labels[key]] = int(period_value)
        record['Value'] = value
        records.append(record)

    retail = sales[sales['REGISTRATION_TYPE'] == 'RETAIL']
    fleet = sales[sales['REGISTRATION_TYPE'] == 'FLEET']
    all_registrations = sales[sales['REGISTRATION_TYPE'] == 'All']

    # RETAIL New | RETAIL Used
    for dealer, group in retail.groupby(_DEALER_KEYS):
        for row in group.to_dict('records'):
            add(dealer, row['SEGMENT'], row['FUEL'], row['REG_TYPE'],
                [row[key] for key in period_keys], int(row['SALES']))

    # Commercial (FLEET New + FLEET Used)
    for dealer, group in fleet.groupby(_DEALER_KEYS):
        for cell, cell_rows in group.groupby(cell_keys):
            *period_values, fuel, segment = cell
            add(dealer, segment, fuel, 'Commercial', period_values, int(cell_rows['SALES'].sum()))

    # Wholesale: 15% of the Used sales across registration types (RETAIL Used + FLEET Used)
    for dealer, group in all_registrations.groupby(_DEALER_KEYS):
        for cell, cell_rows in group.groupby(cell_keys):
            used_sales = cell_rows.loc[cell_rows['REG_TYPE'] == 'Used', 'SALES']
            if used_sales.empty:
                continue
            *period_values, fuel, segment = cell
            add(dealer, segment, fuel, 'Wholesale', period_values, int(0.15 * used_sales.iloc[0]))

    # New/Used Ratio - calculated only for retail sales
    for dealer, group in retail.groupby(_DEALER_KEYS):
        for cell, cell_rows in group.groupby(cell_keys):
            reg_types = cell_rows['REG_TYPE']
            row_count = len(cell_rows)
            if row_count == 2:
                used = int(cell_rows.loc[reg_types == 'Used', 'SALES'].iloc[0])
                ratio = (int(cell_rows.loc[reg_types == 'New', 'SALES'].iloc[0]) / used) if used != 0 else 0
            elif row_count == 1 and reg_types.iloc[0] == 'New':
                ratio = None  # no used sales to compare against
            elif row_count == 1 and reg_types.iloc[0] == 'Used':
                ratio = 0
            else:
                continue
            *period_values, fuel, segment = cell
            add(dealer, segment, fuel, 'New/Used Ratio', period_values, ratio)

    return records


def _nest_records(records, value_columns, label):
    """Collapse flat records into one row per dealer holding a list of record dicts."""
    if not records:
        # Nothing to nest: return an empty, correctly-shaped frame instead of failing in groupby.
        return pd.DataFrame(columns=_DEALER_KEYS + [label])
    return pd.DataFrame(records).groupby(_DEALER_KEYS).apply(
        lambda x: x[value_columns].to_dict('records')
    ).reset_index(name=label)


def dealer_transactions_single_brand_v4(dealers, polk_df, reassign_df, end):
    """Build per-dealer yearly and trailing-twelve-month (TTM) transaction lists.

    Yearly transactions are computed for single-brand dealers only (exactly one entry
    in 'MAKE_LIST_NEW_RETAIL'); TTM transactions are computed for all dealers. In both
    cases only makes listed in the dealer's 'MAKE_LIST_NEW_RETAIL' are counted, and the
    registration counts are rescaled to the dealer's reassigned yearly totals.

    Args:
        dealers: Spark DataFrame with 'DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE',
            'SALES', 'MAKE_LIST_NEW_RETAIL', 'check_n_retail' and 'MAKE_SALES_NEW_RETAIL'.
        polk_df: Spark DataFrame of registrations with 'DEALER_NAME',
            'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MAKE', 'MODEL', 'SEGMENT',
            'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE' and 'COUNT'.
        reassign_df: pandas DataFrame with 'DEALER_NAME', 'DEALER_ADDRESS_FULL', 'STATE',
            'YEAR', 'NEW_SALES', 'USED_SALES', 'NEW_SALES_FINAL' and
            'USED_SALES_FINAL_ADJUSTED'. It is not modified.
        end: Last month of the TTM window as a 'YYYY-MM-DD' string; the window starts
            eleven months earlier.

    Returns:
        pandas DataFrame with 'DEALER_NAME', 'DEALER_ADDRESS_FULL', 'Transactions'
        (yearly record dicts) and 'Transactions - TTM' (monthly record dicts),
        outer-joined so a dealer present in only one of the two is still returned.
    """
    end_date = datetime.strptime(end, '%Y-%m-%d')
    start_date = end_date - relativedelta(months=11)
    aggregation_dict = {'SALES': 'sum'}

    select_polk_df = polk_df.join(dealers.select('DEALER_NAME', 'DEALER_ADDRESS_FULL'), on=['DEALER_NAME', 'DEALER_ADDRESS_FULL'])

    df = select_polk_df.groupBy('DEALER_NAME', 'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MAKE', 'MODEL', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE').agg(F.sum('COUNT').alias('SALES')).toPandas()
    df['REPORT_YEAR_MONTH'] = pd.to_datetime(df['REPORT_YEAR_MONTH'])

    # Mapping Fuel Types (anything unlisted becomes 'Other')
    fuel_rename_dict = {"Gasoline": "ICE", "Electric": "BEV", "Both gas and electric": "Hybrid"}
    df['FUEL'] = df['FUEL'].apply(lambda x: fuel_rename_dict.get(x, 'Other'))

    # Mapping Segment Types (anything that is not a car segment counts as 'Truck')
    car_segments = {"Compact Car", "Compact Luxury Car", "Full Size Car", "Full Size Luxury Car", "Mid Size Car", "Mid Size Luxury Car", "Subcompact Car", "Subcompact Luxury Car"}
    df['SEGMENT'] = df['SEGMENT'].apply(lambda x: 'Car' if x in car_segments else 'Truck')

    # Collapse to model level first (this also drops rows with a missing key, MODEL
    # included), then to make level. The fuel/segment/registration rollups are added
    # only after the sales have been reassigned: rollups computed here would be
    # filtered out again before any of them is used.
    df = df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MAKE', 'MODEL', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()
    df = df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE', 'REPORT_YEAR_MONTH', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()

    df['YEAR'] = df['REPORT_YEAR_MONTH'].dt.year
    df['MONTH'] = df['REPORT_YEAR_MONTH'].dt.month
    df['REG_TYPE'] = df['REG_TYPE'].replace({'N': 'New', 'U': 'Used'})

    df = df[[
        'DEALER_NAME', 'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MONTH', 'YEAR', 'MAKE', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE', 'SALES'
    ]]

    # Keep only the makes each dealer sells new at retail (based on dealer_registry_make_sales)
    dealers_pd = dealers.toPandas().drop(['MAKE', 'SALES'], axis=1)
    dealers_pd['#_make'] = dealers_pd['MAKE_LIST_NEW_RETAIL'].apply(len)
    with_registry = df.merge(dealers_pd, on=['DEALER_NAME', 'DEALER_ADDRESS_FULL'], how='inner')
    sells_make_new = pd.Series(
        [make in makes for make, makes in zip(with_registry['MAKE'], with_registry['MAKE_LIST_NEW_RETAIL'])],
        index=with_registry.index, dtype=bool,
    )
    with_registry = with_registry[sells_make_new]
    registry_columns = ['MAKE_LIST_NEW_RETAIL', 'check_n_retail', 'MAKE_SALES_NEW_RETAIL', '#_make']

    # All dealers (single + multi brand) feed the TTM data ...
    df_all = with_registry.drop(registry_columns, axis=1)
    # ... single brand dealers only feed the annual data.
    df_single_brand = with_registry[with_registry['#_make'] == 1].drop(registry_columns, axis=1)

    # TTM window: the trailing 12 months ending at `end`
    ttm_df = df_all[(df_all['REPORT_YEAR_MONTH'] >= start_date) & (df_all['REPORT_YEAR_MONTH'] <= end_date)]

    # Yearly Data
    print(f"Calculating Yearly Data... ({df['REPORT_YEAR_MONTH'].min()} - {df['REPORT_YEAR_MONTH'].max()})")
    yr_df = df_single_brand.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()

    # Reassigned totals per dealer and year. Make-wise rows of multi brand dealers are
    # summed because multi brand dealers are not handled make-wise in TTM.
    total_columns = ['NEW_SALES', 'USED_SALES', 'NEW_SALES_FINAL', 'USED_SALES_FINAL_ADJUSTED']
    dealer_totals = (
        reassign_df[_DEALER_KEYS + ['STATE'] + total_columns + ['YEAR']]
        .groupby(_DEALER_KEYS + ['STATE', 'YEAR']).sum().reset_index()
    )
    dealer_totals['YEAR'] = dealer_totals['YEAR'].astype(int)
    dealer_totals = dealer_totals[_DEALER_KEYS + ['YEAR'] + total_columns]

    print('---reassinging sales for yr_df---')
    # 'All' is reserved for the rollup added below, so source rows already labelled
    # 'All' are left out of the detail.
    yr_detail = yr_df[yr_df['REGISTRATION_TYPE'] != 'All']
    yr_sales = _reassign_sales(
        yr_detail, dealer_totals, ['YEAR'],
        _DEALER_KEYS + ['YEAR', 'REG_TYPE', 'SEGMENT', 'FUEL', 'REGISTRATION_TYPE'],
    )
    # Yearly figures are rounded per detail row, i.e. before the rollups are summed.
    yr_sales['SALES'] = yr_sales['SALES'].round()
    yr_sales = _append_rollups(yr_sales, ['YEAR'])
    print('resassigned sales for yr_df')

    # TTM Data
    print('reassigning sales for ttm_df')
    # grouping by 'month' to make ttm
    ttm_df = ttm_df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MONTH', 'YEAR', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()
    ttm_detail = ttm_df[ttm_df['REGISTRATION_TYPE'] != 'All']
    ttm_sales = _reassign_sales(
        ttm_detail, dealer_totals, ['YEAR', 'MONTH'],
        _DEALER_KEYS + ['YEAR', 'REG_TYPE', 'REGISTRATION_TYPE', 'SEGMENT', 'FUEL', 'MONTH'],
    )
    ttm_sales = _append_rollups(ttm_sales, ['YEAR', 'MONTH'])
    # Monthly figures are rounded after the rollups are summed.
    ttm_sales['SALES'] = ttm_sales['SALES'].round()
    print('reassigned sales for ttm_df')

    # Yearly Transactions Calculations
    print('calculating annual trans')
    yr_out = _build_transaction_records(yr_sales, ['YEAR'])
    print("calculated yearly data")

    # Monthly Transactions Calculations
    print('calculating ttm trans')
    ttm_out = _build_transaction_records(ttm_sales, ['MONTH', 'YEAR'])
    print("calculated ttm data")

    yr_data = _nest_records(yr_out, ['Year', 'Vehicle Type', 'Fuel Type', 'Name', 'Value'], 'Transactions')
    ttm_data = _nest_records(ttm_out, ['Month', 'Year', 'Vehicle Type', 'Fuel Type', 'Name', 'Value'], 'Transactions - TTM')

    print("len of yr_data:", len(yr_data))
    print("len of ttm_data:", len(ttm_data))

    data = pd.merge(yr_data, ttm_data, on=['DEALER_NAME', 'DEALER_ADDRESS_FULL'], how='outer')

    return data

def _v5_append_rollup(frame, group_keys, column, label):
    """Append rows holding SALES summed over every value of `column`, tagged with `label`."""
    totals = frame.groupby(group_keys).agg({'SALES': 'sum'}).reset_index()
    totals[column] = label
    return pd.concat([frame, totals], ignore_index=True).reset_index(drop=True)


def _v5_keep_new_retail_makes(sales, dealer_info):
    """Join sales to dealer attributes and keep rows whose MAKE is in MAKE_LIST_NEW_RETAIL."""
    joined = sales.merge(dealer_info, on=['DEALER_NAME', 'DEALER_ADDRESS_FULL'], how='inner')
    sells_make_new = joined.apply(lambda row: row['MAKE'] in row['MAKE_LIST_NEW_RETAIL'], axis=1)
    return joined[sells_make_new].drop(
        ['MAKE_LIST_NEW_RETAIL', 'check_n_retail', 'MAKE_SALES_NEW_RETAIL', '#_make'], axis=1
    )


def _v5_pad_missing_used(detail, group_keys, unique_keys):
    """Add NaN-sales 'Used' placeholder rows to groups that have no 'Used' rows and add WEIGHT.

    Rows carrying real sales get WEIGHT 0; placeholder rows get WEIGHT 1/len(group).
    """
    placeholders = []
    for _, group in detail.groupby(group_keys):
        if 'Used' not in group['REG_TYPE'].unique():
            used_rows = group.copy()
            used_rows['REG_TYPE'] = 'Used'
            used_rows['SALES'] = np.nan
            placeholders.append(used_rows)
    # Concatenate once instead of once per group (the per-group concat was quadratic).
    if placeholders:
        detail = pd.concat([detail, *placeholders], axis=0)

    padded = detail.sort_values('SALES').drop_duplicates(unique_keys).assign(WEIGHT=0)

    shares = [
        group.assign(WEIGHT=1 / len(group))
        for _, group in padded[padded['SALES'].isna()].groupby(group_keys)
    ]
    if shares:
        padded = pd.concat([padded, *shares], axis=0)

    # The weighted copies appended above replace the original zero-weight placeholders.
    zero_weight_placeholder = (
        (padded['REG_TYPE'] == 'Used') & (padded['WEIGHT'] == 0) & padded['SALES'].isna()
    )
    return padded[~zero_weight_placeholder]


def _v5_reassign_sales(weighted, reassign_df, group_keys):
    """Rescale each group's SALES by the dealer/year/make totals found in `reassign_df`.

    New rows become SALES / NEW_SALES * NEW_SALES_FINAL and observed Used rows become
    SALES / USED_SALES * USED_SALES_FINAL_ADJUSTED. A missing match (and, for Used, a
    zero or NaN USED_SALES) falls back to a factor of 1. Placeholder Used rows and
    non-finite results end up as 0.
    """
    def matching_totals(key):
        # One filter per group; the first matching row supplies both totals.
        return reassign_df[
            (reassign_df['DEALER_NAME'] == key['DEALER_NAME'])
            & (reassign_df['DEALER_ADDRESS_FULL'] == key['DEALER_ADDRESS_FULL'])
            & (reassign_df['YEAR'] == key['YEAR'])
            & (reassign_df['MAKE'] == key['MAKE'])
        ]

    pieces = []
    for name, group in weighted.groupby(group_keys):
        key = dict(zip(group_keys, name))
        group = group.copy()

        if key['REG_TYPE'] == 'New':
            totals = matching_totals(key)
            observed = totals['NEW_SALES'].iloc[0] if not totals.empty else 1
            target = totals['NEW_SALES_FINAL'].iloc[0] if not totals.empty else 1
            group['WEIGHT'] = group['SALES'] / observed
            group['NEW_SALES'] = group['WEIGHT'] * target

        elif key['REG_TYPE'] == 'Used':
            # Only rows with real sales (WEIGHT still 0) are rescaled; placeholders are
            # left without a NEW_SALES value and are zero-filled below.
            if group['WEIGHT'].iloc[0] == 0 and not pd.isna(group['SALES'].iloc[0]):
                totals = matching_totals(key)
                observed = totals['USED_SALES'].iloc[0] if not totals.empty else 1
                if observed == 0 or pd.isna(observed):
                    observed = 1
                target = totals['USED_SALES_FINAL_ADJUSTED'].iloc[0] if not totals.empty else 1
                group['WEIGHT'] = group['SALES'] / observed
                group['NEW_SALES'] = group['WEIGHT'] * target

        pieces.append(group)

    result = pd.concat(pieces, axis=0)
    result = result.drop(['SALES', 'WEIGHT'], axis=1).rename(columns={'NEW_SALES': 'SALES'})
    # Assign back rather than chaining inplace=True, which is a no-op under Copy-on-Write.
    result['SALES'] = result['SALES'].replace([np.inf, -np.inf], np.nan).fillna(0)
    return result


def _v5_transaction_records(sales, period_cols):
    """Flatten rolled-up sales into transaction record dicts, one list for all dealers/makes.

    `period_cols` is ['YEAR'] for yearly records or ['MONTH', 'YEAR'] for monthly ones.
    Records are emitted in four passes: retail New/Used, Commercial, Wholesale and
    New/Used Ratio.
    """
    dealer_keys = ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE']
    slice_keys = list(period_cols) + ['FUEL', 'SEGMENT']
    period_labels = {'YEAR': 'Year', 'MONTH': 'Month'}

    def record(name, segment, fuel, kind, period_values, value):
        entry = {
            'DEALER_NAME': name[0],
            'DEALER_ADDRESS_FULL': name[1],
            'Vehicle Type': segment,
            'Fuel Type': fuel,
            'Name': kind,
        }
        for col, period_value in zip(period_cols, period_values):
            entry[period_labels[col]] = int(period_value)
        entry['Value'] = value
        entry['MAKE'] = name[2]
        return entry

    records = []
    retail = sales[sales['REGISTRATION_TYPE'] == 'RETAIL']

    # RETAIL New | RETAIL Used
    for name, group in retail.groupby(dealer_keys):
        for _, row in group.iterrows():
            records.append(record(
                name, row['SEGMENT'], row['FUEL'], row['REG_TYPE'],
                [row[col] for col in period_cols], int(row['SALES']),
            ))

    # Commercial (FLEET New + FLEET Used)
    fleet = sales[sales['REGISTRATION_TYPE'] == 'FLEET']
    for name, group in fleet.groupby(dealer_keys):
        for sub_name, sub_group in group.groupby(slice_keys):
            *period_values, fuel, segment = sub_name
            records.append(record(
                name, segment, fuel, 'Commercial', period_values,
                int(sub_group['SALES'].sum()),
            ))

    # Wholesale: 15% of the Used sales in the REGISTRATION_TYPE == 'All' rollup
    combined = sales[sales['REGISTRATION_TYPE'] == 'All']
    for name, group in combined.groupby(dealer_keys):
        for sub_name, sub_group in group.groupby(slice_keys):
            used_sales = sub_group[sub_group['REG_TYPE'] == 'Used']['SALES']
            if used_sales.empty:
                continue
            *period_values, fuel, segment = sub_name
            # Take the scalar explicitly: int() on a Series is deprecated and fails
            # outright when the selection holds more than one row.
            records.append(record(
                name, segment, fuel, 'Wholesale', period_values,
                int(0.15 * used_sales.iloc[0]),
            ))

    # New/Used Ratio - calculated only for retail sales
    for name, group in retail.groupby(dealer_keys):
        for sub_name, sub_group in group.groupby(slice_keys):
            reg_types = sub_group['REG_TYPE'].unique()
            if len(sub_group) == 2:
                used = int(sub_group[sub_group['REG_TYPE'] == 'Used']['SALES'].iloc[0])
                ratio = (
                    int(sub_group[sub_group['REG_TYPE'] == 'New']['SALES'].iloc[0]) / used
                ) if used != 0 else 0
            elif len(sub_group) == 1 and 'New' in reg_types:
                # New sales only: the ratio is undefined.
                ratio = None
            elif len(sub_group) == 1 and 'Used' in reg_types:
                ratio = 0
            else:
                continue
            *period_values, fuel, segment = sub_name
            records.append(record(name, segment, fuel, 'New/Used Ratio', period_values, ratio))

    return records


def dealer_transactions_single_brand_v5(dealers, polk_df, reassign_df, end):
    """Build yearly and trailing-twelve-month transaction records per dealer and make.

    Registrations are aggregated from `polk_df`, restricted to makes listed in each
    dealer's MAKE_LIST_NEW_RETAIL, rescaled with the totals in `reassign_df`, rolled
    up over fuel / segment / registration type and flattened into record lists.
    Yearly records cover only dealers whose MAKE_LIST_NEW_RETAIL has exactly one
    entry; the trailing-twelve-month records cover every dealer.

    Args:
        dealers: Spark DataFrame with DEALER_NAME, DEALER_ADDRESS_FULL, MAKE, SALES,
            MAKE_LIST_NEW_RETAIL, check_n_retail and MAKE_SALES_NEW_RETAIL columns.
        polk_df: Spark DataFrame with DEALER_NAME, DEALER_ADDRESS_FULL,
            REPORT_YEAR_MONTH, MAKE, MODEL, SEGMENT, FUEL, REG_TYPE ('N'/'U'),
            REGISTRATION_TYPE ('RETAIL'/'FLEET') and COUNT columns.
        reassign_df: pandas DataFrame with DEALER_NAME, DEALER_ADDRESS_FULL, YEAR,
            MAKE, NEW_SALES, USED_SALES, NEW_SALES_FINAL and
            USED_SALES_FINAL_ADJUSTED columns. It is read only; the caller's frame
            is not modified.
        end: Last month of the trailing window as a 'YYYY-MM-DD' string; the window
            starts 11 months earlier.

    Returns:
        pandas DataFrame with DEALER_NAME, DEALER_ADDRESS_FULL, MAKE, 'Transactions'
        (list of yearly record dicts) and 'Transactions - TTM' (list of monthly
        record dicts), outer-merged on dealer and make.
    """
    end_date = datetime.strptime(end, '%Y-%m-%d')
    start_date = end_date - relativedelta(months=11)

    select_polk_df = polk_df.join(
        dealers.select('DEALER_NAME', 'DEALER_ADDRESS_FULL'),
        on=['DEALER_NAME', 'DEALER_ADDRESS_FULL'],
    )

    df = select_polk_df.groupBy(
        'DEALER_NAME', 'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MAKE', 'MODEL',
        'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE',
    ).agg(F.sum('COUNT').alias('SALES')).toPandas()
    df['REPORT_YEAR_MONTH'] = pd.to_datetime(df['REPORT_YEAR_MONTH'])

    # Mapping Fuel Types
    fuel_rename_dict = {"Gasoline": "ICE", "Electric": "BEV", "Both gas and electric": "Hybrid"}
    df['FUEL'] = df['FUEL'].apply(
        lambda x: x if x in ['Gasoline', 'Electric', 'Both gas and electric'] else 'Other'
    ).replace(fuel_rename_dict)

    # Mapping Segment Types: anything that is not a listed car segment counts as a truck
    car_segments = {
        "Compact Car", "Compact Luxury Car", "Full Size Car", "Full Size Luxury Car",
        "Mid Size Car", "Mid Size Luxury Car", "Subcompact Car", "Subcompact Luxury Car",
    }
    df['SEGMENT'] = df['SEGMENT'].apply(lambda x: 'Car' if x in car_segments else 'Truck')

    aggregation_dict = {'SALES': 'sum'}

    # Calculating All data
    monthly_keys = [
        'DEALER_NAME', 'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MAKE', 'MODEL',
        'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE',
    ]
    df = df.groupby(monthly_keys).agg(aggregation_dict).reset_index()

    # Fuel = All (ICE + BEV + Hybrid + Other), Segment = All (Car + Truck),
    # Registration = All (Retail + Fleet); each rollup includes the earlier ones.
    for column, label in (('FUEL', 'All Types'), ('SEGMENT', 'All'), ('REGISTRATION_TYPE', 'All')):
        df = _v5_append_rollup(df, [key for key in monthly_keys if key != column], column, label)

    # Collapse MODEL
    df = df.groupby([
        'DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE', 'REPORT_YEAR_MONTH',
        'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE',
    ]).agg(aggregation_dict).reset_index()

    df['YEAR'] = df['REPORT_YEAR_MONTH'].dt.year
    df['MONTH'] = df['REPORT_YEAR_MONTH'].dt.month
    df['REG_TYPE'] = df['REG_TYPE'].replace({'N': 'New', 'U': 'Used'})

    df = df[[
        'DEALER_NAME', 'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MONTH', 'YEAR',
        'MAKE', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE', 'SALES',
    ]]

    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    # Single-brand dealers (exactly one make in MAKE_LIST_NEW_RETAIL) feed the annual
    # calculations; all dealers (single + multi) feed the trailing-twelve-month ones.
    dealers_pd = dealers.toPandas().drop(['MAKE', 'SALES'], axis=1)
    dealers_pd['#_make'] = dealers_pd['MAKE_LIST_NEW_RETAIL'].apply(len)
    single_brand_dealers = dealers_pd[dealers_pd['#_make'] == 1]

    single_brand_sales = _v5_keep_new_retail_makes(df, single_brand_dealers)
    all_dealer_sales = _v5_keep_new_retail_makes(df, dealers_pd)
    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    # Trailing 12 months, both window ends inclusive
    ttm_df = all_dealer_sales[
        (all_dealer_sales['REPORT_YEAR_MONTH'] >= start_date)
        & (all_dealer_sales['REPORT_YEAR_MONTH'] <= end_date)
    ]

    # Yearly Data
    yr_df = single_brand_sales.groupby([
        'DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'MAKE',
        'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE',
    ]).agg(aggregation_dict).reset_index()

    # ----------------------------------------------------------------------
    # Reassignment totals. `assign` builds a new frame so the caller's reassign_df is
    # left untouched.
    reassign_df = reassign_df.assign(
        YEAR=reassign_df['YEAR'].astype(int),
        SEGMENT='All',
        FUEL='All Types',
    )[[
        'DEALER_NAME', 'DEALER_ADDRESS_FULL', 'NEW_SALES', 'USED_SALES',
        'NEW_SALES_FINAL', 'USED_SALES_FINAL_ADJUSTED',
        'YEAR', 'SEGMENT', 'FUEL', 'MAKE',
    ]]

    # -------------------------------------------------------------------------
    # Yearly reassignment works on the finest-grained rows; rollups are rebuilt afterwards.
    yr_detail = yr_df[
        (yr_df['SEGMENT'] != 'All') & (yr_df['FUEL'] != 'All Types') & (yr_df['REGISTRATION_TYPE'] != 'All')
    ]
    yr_weighted = _v5_pad_missing_used(
        yr_detail,
        ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'MAKE'],
        ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'SEGMENT', 'FUEL',
         'REG_TYPE', 'REGISTRATION_TYPE', 'MAKE'],
    )

    print('reassinging sales for yr_df')

    new_df = _v5_reassign_sales(
        yr_weighted, reassign_df,
        ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'REG_TYPE', 'SEGMENT',
         'MAKE', 'FUEL', 'REGISTRATION_TYPE'],
    )
    # Yearly values are rounded before the rollups are summed.
    new_df['SALES'] = new_df['SALES'].round()

    filtered_df = new_df[~(
        (new_df['SEGMENT'] == 'All') & (new_df['FUEL'] == 'All Types') & (new_df['REGISTRATION_TYPE'] == 'All')
    )]

    # Fuel = All (ICE + BEV + Hybrid + Other)
    filtered_df = _v5_append_rollup(
        filtered_df,
        ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'SEGMENT', 'YEAR', 'MAKE', 'REG_TYPE', 'REGISTRATION_TYPE'],
        'FUEL', 'All Types',
    )
    # Segment = All (Car + Truck)
    filtered_df = _v5_append_rollup(
        filtered_df,
        ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'FUEL', 'YEAR', 'MAKE', 'REG_TYPE', 'REGISTRATION_TYPE'],
        'SEGMENT', 'All',
    )
    # Registration = All (Retail + Fleet)
    filtered_df = _v5_append_rollup(
        filtered_df,
        ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'SEGMENT', 'MAKE', 'FUEL', 'REG_TYPE'],
        'REGISTRATION_TYPE', 'All',
    )

    print('resassigned sales for yr_df')

    # --------------------------------------------------------------------------
    # TTM Data
    print('reassigning sales for ttm_df')

    # grouping by 'month' to make ttm
    ttm_df = ttm_df.groupby([
        'DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MONTH', 'YEAR', 'MAKE',
        'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE',
    ]).agg(aggregation_dict).reset_index()

    ttm_detail = ttm_df[
        (ttm_df['SEGMENT'] != 'All') & (ttm_df['FUEL'] != 'All Types') & (ttm_df['REGISTRATION_TYPE'] != 'All')
    ]
    ttm_weighted = _v5_pad_missing_used(
        ttm_detail,
        ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'MONTH', 'MAKE'],
        ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'SEGMENT', 'FUEL',
         'REG_TYPE', 'REGISTRATION_TYPE', 'MONTH', 'MAKE'],
    )

    # Monthly rows are scaled with the same per-year totals as the yearly rows.
    new_df = _v5_reassign_sales(
        ttm_weighted, reassign_df,
        ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'REG_TYPE', 'REGISTRATION_TYPE',
         'MAKE', 'SEGMENT', 'FUEL', 'MONTH'],
    )

    filtered_df_ttm = new_df[~((new_df['SEGMENT'] == 'All') & (new_df['FUEL'] == 'All Types'))]

    # Fuel = All (ICE + BEV + Hybrid + Other)
    filtered_df_ttm = _v5_append_rollup(
        filtered_df_ttm,
        ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'SEGMENT', 'YEAR', 'MAKE', 'MONTH', 'REG_TYPE', 'REGISTRATION_TYPE'],
        'FUEL', 'All Types',
    )
    # Segment = All (Car + Truck)
    filtered_df_ttm = _v5_append_rollup(
        filtered_df_ttm,
        ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'FUEL', 'YEAR', 'MONTH', 'MAKE', 'REG_TYPE', 'REGISTRATION_TYPE'],
        'SEGMENT', 'All',
    )
    # Registration = All (Retail + Fleet)
    filtered_df_ttm = _v5_append_rollup(
        filtered_df_ttm,
        ['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'MONTH', 'MAKE', 'SEGMENT', 'FUEL', 'REG_TYPE'],
        'REGISTRATION_TYPE', 'All',
    )
    # Monthly values are rounded only after the rollups have been summed.
    filtered_df_ttm['SALES'] = filtered_df_ttm['SALES'].round()

    print('reassigned sales for ttm_df')
    # ----------------------------------------------------------------------------------------------------------

    # Yearly Transactions Calculations
    # ==========================================================================================
    print('calculating annual trans')
    yr_out = _v5_transaction_records(filtered_df, ['YEAR'])
    print("calculated yearly data")

    # Monthly Transactions Calculations
    # ==========================================================================================
    print('calculating ttm trans')
    ttm_out = _v5_transaction_records(filtered_df_ttm, ['MONTH', 'YEAR'])
    print("calculated ttm data")

    yr_data = pd.DataFrame(yr_out)
    ttm_data = pd.DataFrame(ttm_out)

    yr_data = yr_data.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE']).apply(
        lambda x: x[['Year', 'Vehicle Type', 'Fuel Type', 'Name', 'Value']].to_dict('records')
    ).reset_index(name='Transactions')

    ttm_data = ttm_data.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE']).apply(
        lambda x: x[['Month', 'Year', 'Vehicle Type', 'Fuel Type', 'Name', 'Value']].to_dict('records')
    ).reset_index(name='Transactions - TTM')

    print("len of yr_data:", len(yr_data))
    print("len of ttm_data:", len(ttm_data))

    data = pd.merge(yr_data, ttm_data, on=['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE'], how='outer')

    return data

def dealer_transactions_single_brand_v5_new(dealers, polk_df, reassign_df, end):
    end_date = datetime.strptime(end, '%Y-%m-%d')
    start_date = end_date - relativedelta(months=11)
    start = start_date.strftime('%Y-%m-%d')

    select_polk_df = polk_df.join(dealers.select('DEALER_NAME', 'DEALER_ADDRESS_FULL'), on=['DEALER_NAME', 'DEALER_ADDRESS_FULL'])
   
    df = select_polk_df.groupBy('DEALER_NAME', 'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MAKE', 'MODEL', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE').agg(F.sum('COUNT').alias('SALES')).toPandas()
    df['REPORT_YEAR_MONTH'] = pd.to_datetime(df['REPORT_YEAR_MONTH'])

    # Mapping Fuel Types
    fuel_rename_dict = {"Gasoline": "ICE", "Electric": "BEV", "Both gas and electric": "Hybrid"}
    df['FUEL'] = df['FUEL'].apply(lambda x: x if x in ['Gasoline', 'Electric', 'Both gas and electric'] else 'Other').replace(fuel_rename_dict)
    
    # Mapping Segment Types
    cars_dict = {"Compact Car", "Compact Luxury Car", "Full Size Car", "Full Size Luxury Car", "Mid Size Car", "Mid Size Luxury Car", "Subcompact Car", "Subcompact Luxury Car"}
    df['SEGMENT'] = df['SEGMENT'].apply(lambda x: 'Car' if x in cars_dict else 'Truck')

    aggregation_dict = {'SALES': 'sum'}
    # Calculating All data
    df = df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MAKE', 'MODEL', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()

    # Fuel = All (ICE + BEV + Hybrid + Other)
    sum_df = df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MAKE', 'MODEL', 'SEGMENT', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()
    sum_df['FUEL'] = 'All Types'
    df = pd.concat([df, sum_df], ignore_index=True).reset_index(drop=True)

    # Segment = All (Car + Truck)
    sum_df = df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MAKE', 'MODEL', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()
    sum_df['SEGMENT'] = 'All'
    df = pd.concat([df, sum_df], ignore_index=True).reset_index(drop=True)

    # Registration = All (Retail + Fleet)
    sum_df = df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MAKE', 'MODEL', 'SEGMENT', 'FUEL', 'REG_TYPE']).agg(aggregation_dict).reset_index()
    sum_df['REGISTRATION_TYPE'] = 'All'
    df = pd.concat([df, sum_df], ignore_index=True).reset_index(drop=True)

    df = df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE', 'REPORT_YEAR_MONTH', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()

    df['YEAR'] = df['REPORT_YEAR_MONTH'].dt.year
    df['MONTH'] = df['REPORT_YEAR_MONTH'].dt.month
    df['REG_TYPE'] = df['REG_TYPE'].replace({'N': 'New', 'U': 'Used'})

    df = df[[
        'DEALER_NAME', 'DEALER_ADDRESS_FULL', 'REPORT_YEAR_MONTH', 'MONTH', 'YEAR', 'MAKE', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE', 'SALES'
    ]]
    
    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    #  filtering out all the single brand dealers for annual transactions calculations
    
    # based on dealer_registry_make_sales
    dealers_pd = dealers.toPandas().drop(['MAKE', 'SALES'], axis=1)
    dealers_pd['#_make'] = dealers_pd['MAKE_LIST_NEW_RETAIL'].apply(lambda row: len(row))
    df_single = dealers_pd[dealers_pd['#_make']==1]
    df_single_brand = df.merge(df_single, on=['DEALER_NAME', 'DEALER_ADDRESS_FULL'], how='inner')
    df_001 = df_single_brand[df_single_brand.apply(lambda row: row['MAKE'] in row['MAKE_LIST_NEW_RETAIL'], axis=1)].drop(['MAKE_LIST_NEW_RETAIL', 'check_n_retail', 'MAKE_SALES_NEW_RETAIL', '#_make'], axis=1)

    # filtering out only those makes which have new sales for all the dealers(single + multi)
    df_all = dealers_pd
    df_all= df.merge(df_all, on=['DEALER_NAME', 'DEALER_ADDRESS_FULL'], how='inner')
    df_all = df_all[df_all.apply(lambda row: row['MAKE'] in row['MAKE_LIST_NEW_RETAIL'], axis=1)].drop(['MAKE_LIST_NEW_RETAIL', 'check_n_retail', 'MAKE_SALES_NEW_RETAIL', '#_make'], axis=1)
    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    # making ttm_df by filtering out the data only for trailing 12 months (for all the dealers [single + multi])
    ttm_df = df_all[
    (df_all['REPORT_YEAR_MONTH'] >= start_date) & (df_all['REPORT_YEAR_MONTH'] <= end_date)]
    # ------------------------------

    # Yearly Data
    df = df_001
    yr_df = df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'MAKE', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()

    # ----------------------------------------------------------------------
    # reassinging the sales using to our dealers 
        
    reassign_df['YEAR'] = reassign_df['YEAR'].astype(int)

    reassign_df = reassign_df[['DEALER_NAME',
    'DEALER_ADDRESS_FULL', 'NEW_SALES', 'USED_SALES',
    'NEW_SALES_FINAL',
    'USED_SALES_FINAL_ADJUSTED',
    'YEAR',
    # 'SEGMENT', 'FUEL', 
    'MAKE']]

    # -------------------------------------------------------------------------
    df_fil = yr_df[(yr_df['SEGMENT']!='All') & (yr_df['FUEL']!='All Types') & (yr_df['REGISTRATION_TYPE']!='All')]

    used_dfs = []

    for name, group in df_fil.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'MAKE']):
        if not 'Used' in group['REG_TYPE'].unique():
            df_used = deepcopy(group)
            df_used['REG_TYPE'] = 'Used'
            df_used['SALES'] = np.NAN
            used_dfs.append(df_used)

    if used_dfs:
        df_fil = pd.concat([df_fil, *used_dfs], axis=0).sort_values(['SALES'], ascending=False)

    df_merge = df_fil
    df_merge = df_merge.sort_values('SALES')
    df_merge = df_merge.drop_duplicates(['DEALER_NAME', 'DEALER_ADDRESS_FULL',
        'YEAR', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE', 'MAKE'])
    df_merge['WEIGHT'] = int(0)
    for name, group in df_merge[df_merge['SALES'].isna()].groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'MAKE']):  
        group['WEIGHT'] = 1/len(group) 
        df_merge = pd.concat([df_merge, group], axis=0)
    df_merge = df_merge[~((df_merge['REG_TYPE']=='Used') & (df_merge['WEIGHT']==0) & (pd.isna(df_merge['SALES'])))]

    print('reassinging sales for yr_df')

    new_df = pd.DataFrame()

    new_groups = []
    df_merge = pd.merge(df_merge, reassign_df, on=['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'MAKE'], how='left')

    for name, group in df_merge.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'REG_TYPE', 'SEGMENT', 'MAKE', 'FUEL', 'REGISTRATION_TYPE']):
        if name[3] == 'New':
            group['WEIGHT'] = group['SALES']/group['NEW_SALES']
            group['REASSIGNED_SALES'] = group['WEIGHT'] * group['NEW_SALES_FINAL']

        if name[3] == 'Used':
            if ((group['WEIGHT'].iloc[0]==0) & (~pd.isna(group['SALES'].iloc[0]))):
                group['WEIGHT'] = group['SALES']/group['USED_SALES']
                group['REASSIGNED_SALES'] = group['WEIGHT'] * group['USED_SALES_FINAL_ADJUSTED']
            
        new_groups.append(group)

    new_df = pd.concat([new_df, *new_groups], axis=0)
    new_df = new_df.drop(['SALES', 'WEIGHT', 'NEW_SALES', 'USED_SALES', 'NEW_SALES_FINAL', 'USED_SALES_FINAL_ADJUSTED'], axis=1).rename(columns={'REASSIGNED_SALES':'SALES'})
    new_df['SALES'].replace([np.inf, -np.inf], np.nan, inplace=True)
    new_df['SALES'] = new_df['SALES'].fillna(0)
    new_df['SALES'] = new_df['SALES'].round()
    
    filtered_df = new_df[~((new_df['SEGMENT'] == 'All') & (new_df['FUEL'] == 'All Types') & (new_df['REGISTRATION_TYPE'] == 'All'))]
    
    aggregation_dict = {'SALES': 'sum'}

    # Fuel = All (ICE + BEV + Hybrid + Other)
    sum_df = filtered_df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'SEGMENT', 'YEAR', 'MAKE', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()
    sum_df['FUEL'] = 'All Types'
    filtered_df = pd.concat([filtered_df, sum_df], ignore_index=True).reset_index(drop=True)

    # Segment = All (Car + Truck)
    sum_df = filtered_df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'FUEL', 'YEAR', 'MAKE', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()
    sum_df['SEGMENT'] = 'All'
    filtered_df = pd.concat([filtered_df, sum_df], ignore_index=True).reset_index(drop=True)

    # Registration = All (Retail + Fleet)
    sum_df = filtered_df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'SEGMENT', 'MAKE', 'FUEL', 'REG_TYPE']).agg(aggregation_dict).reset_index()
    sum_df['REGISTRATION_TYPE'] = 'All'
    filtered_df = pd.concat([filtered_df, sum_df], ignore_index=True).reset_index(drop=True)

    print('resassigned sales for yr_df')

    # --------------------------------------------------------------------------
    # TTM Data
    print('reassigning sales for ttm_df')

    # grouping by 'month' to make ttm
    ttm_df = ttm_df.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MONTH', 'YEAR', 'MAKE', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index() 

    df_fil_ttm = ttm_df[(ttm_df['SEGMENT']!='All') & (ttm_df['FUEL']!='All Types') & (ttm_df['REGISTRATION_TYPE']!='All')]

    used_dfs = []
    
    for name, group in df_fil_ttm.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'MONTH', 'MAKE']):
        if not 'Used' in group['REG_TYPE'].unique():
            df_used = deepcopy(group)
            df_used['REG_TYPE'] = 'Used'
            df_used['SALES'] = np.NAN
            used_dfs.append(df_used)
    
    if used_dfs:
        df_fil_ttm = pd.concat([df_fil_ttm, *used_dfs], axis=0).sort_values(['SALES'], ascending=False)

    df_merge = df_fil_ttm
    df_merge = df_merge.sort_values('SALES')
    df_merge = df_merge.drop_duplicates(['DEALER_NAME', 'DEALER_ADDRESS_FULL',
        'YEAR', 'SEGMENT', 'FUEL', 'REG_TYPE', 'REGISTRATION_TYPE', 'MONTH', 'MAKE'])
    df_merge['WEIGHT'] = int(0)
    for name, group in df_merge[df_merge['SALES'].isna()].groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'MONTH', 'MAKE']):  
        group['WEIGHT'] = 1/len(group) 
        df_merge = pd.concat([df_merge, group], axis=0)
    df_merge = df_merge[~((df_merge['REG_TYPE']=='Used') & (df_merge['WEIGHT']==0) & (pd.isna(df_merge['SALES'])))]

    new_df = pd.DataFrame()
    new_groups = []
    df_merge = pd.merge(df_merge, reassign_df, on=['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'MAKE'], how='left')
    for name, group in df_merge.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'REG_TYPE', 'REGISTRATION_TYPE', 'MAKE', 'SEGMENT', 'FUEL', 'MONTH']):
        if name[3] == 'New':
            group['WEIGHT'] = group['SALES']/group['NEW_SALES']
            group['REASSIGNED_SALES'] = group['WEIGHT'] * group['NEW_SALES_FINAL']

        if name[3] == 'Used':
            if ((group['WEIGHT'].iloc[0]==0) & (~pd.isna(group['SALES'].iloc[0]))):
                group['WEIGHT'] = group['SALES']/group['USED_SALES']
                group['REASSIGNED_SALES'] = group['WEIGHT'] * group['USED_SALES_FINAL_ADJUSTED']

        new_groups.append(group)

    new_df = pd.concat([new_df, *new_groups], axis=0)
    new_df = new_df.drop(['SALES', 'WEIGHT', 'NEW_SALES', 'USED_SALES', 'NEW_SALES_FINAL', 'USED_SALES_FINAL_ADJUSTED'], axis=1).rename(columns={'REASSIGNED_SALES':'SALES'})
    new_df['SALES'].replace([np.inf, -np.inf], np.nan, inplace=True)
    new_df['SALES'] = new_df['SALES'].fillna(0)
    
    filtered_df_ttm = new_df[~((new_df['SEGMENT'] == 'All') & (new_df['FUEL'] == 'All Types'))]
    
    aggregation_dict = {'SALES': 'sum'}

    # Fuel = All (ICE + BEV + Hybrid + Other)
    sum_df = filtered_df_ttm.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'SEGMENT', 'YEAR', 'MAKE', 'MONTH', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()
    sum_df['FUEL'] = 'All Types'
    filtered_df_ttm = pd.concat([filtered_df_ttm, sum_df], ignore_index=True).reset_index(drop=True)

    # Segment = All (Car + Truck)
    sum_df = filtered_df_ttm.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'FUEL', 'YEAR', 'MONTH', 'MAKE', 'REG_TYPE', 'REGISTRATION_TYPE']).agg(aggregation_dict).reset_index()
    sum_df['SEGMENT'] = 'All'
    filtered_df_ttm = pd.concat([filtered_df_ttm, sum_df], ignore_index=True).reset_index(drop=True)

    # Registration = All (Retail + Fleet)
    sum_df = filtered_df_ttm.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'YEAR', 'MONTH', 'MAKE', 'SEGMENT', 'FUEL', 'REG_TYPE']).agg(aggregation_dict).reset_index()
    sum_df['REGISTRATION_TYPE'] = 'All'
    filtered_df_ttm = pd.concat([filtered_df_ttm, sum_df], ignore_index=True).reset_index(drop=True)
    filtered_df_ttm['SALES'] = filtered_df_ttm['SALES'].round()

    print('reassigned sales for ttm_df')
    # ----------------------------------------------------------------------------------------------------------

    # Yearly Transactions Calculations
    # ==========================================================================================
    print('calculating annual trans')
    yr_out = []
    yr_df = filtered_df

    # RETAIL New | RETAIL Used
    grouped_df = yr_df[yr_df['REGISTRATION_TYPE']=='RETAIL'].groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE'])
    for name, group in grouped_df:
        for i, row in group.iterrows():
            yr_out.append({
                'DEALER_NAME': name[0],
                'DEALER_ADDRESS_FULL': name[1],
                'Vehicle Type': row['SEGMENT'],
                'Fuel Type': row['FUEL'],
                'Name': row['REG_TYPE'],
                'Year': int(row['YEAR']),
                'Value': int(row['SALES']),
                'MAKE': name[2],
            })
    
    # Commercial (FLEET New + FLEET Used)
    grouped_df = yr_df[yr_df['REGISTRATION_TYPE']=='FLEET'].groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE'])
    for name, group in grouped_df:
        for sub_name, sub_group in group.groupby(['YEAR', 'FUEL', 'SEGMENT']):
            yr_out.append({
                'DEALER_NAME': name[0],
                'DEALER_ADDRESS_FULL': name[1],
                'Vehicle Type': sub_name[2],
                'Fuel Type': sub_name[1],
                'Name': 'Commercial',
                'Year': int(sub_name[0]),
                'Value': int(sub_group['SALES'].sum()),
                'MAKE': name[2],
            })
    
    # Wholesale (RETAIL Used + FLEET Used)
    grouped_df = yr_df[yr_df['REGISTRATION_TYPE']=='All'].groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE'])
    for name, group in grouped_df:
        for sub_name, sub_group in group.groupby(['YEAR', 'FUEL', 'SEGMENT']):
            if 'Used' in sub_group['REG_TYPE'].unique():
                yr_out.append({
                    'DEALER_NAME': name[0],
                    'DEALER_ADDRESS_FULL': name[1],
                    'Vehicle Type': sub_name[2],
                    'Fuel Type': sub_name[1],
                    'Name': 'Wholesale',
                    'Year': int(sub_name[0]),
                    'Value': int(0.15 * sub_group[sub_group['REG_TYPE']=='Used']['SALES']),
                    'MAKE': name[2],
                })

    # New/Used Ratio - calculated only for retail sales
    grouped_df = yr_df[yr_df['REGISTRATION_TYPE']=='RETAIL'].groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE'])
    for name, group in grouped_df:
        for sub_name, sub_group in group.groupby(['YEAR', 'FUEL', 'SEGMENT']):
            if len(sub_group) == 2:
                used = int(sub_group[sub_group['REG_TYPE']=='Used']['SALES'].iloc[0])
                yr_out.append({
                    'DEALER_NAME': name[0],
                    'DEALER_ADDRESS_FULL': name[1],
                    'Vehicle Type': sub_name[2],
                    'Fuel Type': sub_name[1],
                    'Name': 'New/Used Ratio',
                    'Year': int(sub_name[0]),
                    'Value': (int(sub_group[sub_group['REG_TYPE']=='New']['SALES'].iloc[0]) / used) if used != 0 else 0,
                    'MAKE': name[2],
                })
                
            if len(sub_group) == 1:
                if 'New' in sub_group['REG_TYPE'].unique():
                    yr_out.append({
                        'DEALER_NAME': name[0],
                        'DEALER_ADDRESS_FULL': name[1],
                        'Vehicle Type': sub_name[2],
                        'Fuel Type': sub_name[1],
                        'Name': 'New/Used Ratio',
                        'Year': int(sub_name[0]),
                        'Value': None,
                        'MAKE': name[2],
                    })
                
                elif 'Used' in sub_group['REG_TYPE'].unique():
                    yr_out.append({
                        'DEALER_NAME': name[0],
                        'DEALER_ADDRESS_FULL': name[1],
                        'Vehicle Type': sub_name[2],
                        'Fuel Type': sub_name[1],
                        'Name': 'New/Used Ratio',
                        'Year': int(sub_name[0]),
                        'Value': 0,
                        'MAKE': name[2],
                    })

    print("calculated yearly data")

    # Monthly Transactions Calculations
    # ==========================================================================================
    print('calculating ttm trans')
    ttm_out = []
    ttm_df = filtered_df_ttm
    # RETAIL New | RETAIL Used
    grouped_df = ttm_df[ttm_df['REGISTRATION_TYPE']=='RETAIL'].groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE'])
    for name, group in grouped_df:
        for i, row in group.iterrows():
            ttm_out.append({
                'DEALER_NAME': name[0],
                'DEALER_ADDRESS_FULL': name[1],
                'Vehicle Type': row['SEGMENT'],
                'Fuel Type': row['FUEL'],
                'Name': row['REG_TYPE'],
                'Month': int(row['MONTH']),
                'Year': int(row['YEAR']),
                'Value': int(row['SALES']),
                'MAKE': name[2],
            })

    # Commercial (FLEET New + FLEET Used)
    grouped_df = ttm_df[ttm_df['REGISTRATION_TYPE']=='FLEET'].groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE'])
    for name, group in grouped_df:
        for sub_name, sub_group in group.groupby(['MONTH', 'YEAR', 'FUEL', 'SEGMENT']):
            ttm_out.append({
                'DEALER_NAME': name[0],
                'DEALER_ADDRESS_FULL': name[1],
                'Vehicle Type': sub_name[3],
                'Fuel Type': sub_name[2],
                'Name': 'Commercial',
                'Month': int(sub_name[0]),
                'Year': int(sub_name[1]),
                'Value': int(sub_group['SALES'].sum()),
                'MAKE': name[2],
            })

    # Wholesale (RETAIL Used + FLEET Used)
    grouped_df = ttm_df[ttm_df['REGISTRATION_TYPE']=='All'].groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE'])
    for name, group in grouped_df:
        for sub_name, sub_group in group.groupby(['MONTH', 'YEAR', 'FUEL', 'SEGMENT']):
            if 'Used' in sub_group['REG_TYPE'].unique():
                ttm_out.append({
                    'DEALER_NAME': name[0],
                    'DEALER_ADDRESS_FULL': name[1],
                    'Vehicle Type': sub_name[3],
                    'Fuel Type': sub_name[2],
                    'Name': 'Wholesale',
                    'Month': int(sub_name[0]),
                    'Year': int(sub_name[1]),
                    'Value': int(0.15 * sub_group[sub_group['REG_TYPE']=='Used']['SALES'].iloc[0]),
                    'MAKE': name[2],
                })

    # New/Used Ratio - calculated only for retail sales
    grouped_df = ttm_df[ttm_df['REGISTRATION_TYPE']=='RETAIL'].groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE'])
    for name, group in grouped_df:
        for sub_name, sub_group in group.groupby(['MONTH', 'YEAR', 'FUEL', 'SEGMENT']):
            if len(sub_group) == 2:
                used = int(sub_group[sub_group['REG_TYPE']=='Used']['SALES'].iloc[0])
                ttm_out.append({
                    'DEALER_NAME': name[0],
                    'DEALER_ADDRESS_FULL': name[1],
                    'Vehicle Type': sub_name[3],
                    'Fuel Type': sub_name[2],
                    'Name': 'New/Used Ratio',
                    'Month': int(sub_name[0]),
                    'Year': int(sub_name[1]),
                    'Value': (int(sub_group[sub_group['REG_TYPE']=='New']['SALES'].iloc[0]) / used) if used != 0 else 0,
                    'MAKE': name[2],
                })

            if len(sub_group) == 1:
                if 'New' in sub_group['REG_TYPE'].unique():
                    ttm_out.append({
                        'DEALER_NAME': name[0],
                        'DEALER_ADDRESS_FULL': name[1],
                        'Vehicle Type': sub_name[3],
                        'Fuel Type': sub_name[2],
                        'Name': 'New/Used Ratio',
                        'Month': int(sub_name[0]),
                        'Year': int(sub_name[1]),
                        'Value': None,
                        'MAKE': name[2],
                    })

                elif 'Used' in sub_group['REG_TYPE'].unique():
                    ttm_out.append({
                        'DEALER_NAME': name[0],
                        'DEALER_ADDRESS_FULL': name[1],
                        'Vehicle Type': sub_name[3],
                        'Fuel Type': sub_name[2],
                        'Name': 'New/Used Ratio',
                        'Month': int(sub_name[0]),
                        'Year': int(sub_name[1]),
                        'Value': 0,
                        'MAKE': name[2],
                    })
    print("calculated ttm data")

    yr_data = pd.DataFrame(yr_out)
    ttm_data = pd.DataFrame(ttm_out)

    yr_data = yr_data.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE']).apply(
        lambda x: x[['Year', 'Vehicle Type', 'Fuel Type', 'Name', 'Value']].to_dict('records')
    ).reset_index(name='Transactions')

    ttm_data = ttm_data.groupby(['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE']).apply(
        lambda x: x[['Month', 'Year', 'Vehicle Type', 'Fuel Type', 'Name', 'Value']].to_dict('records')
    ).reset_index(name='Transactions - TTM')
    
    print("len of yr_data:", len(yr_data))
    print("len of ttm_data:", len(ttm_data))

    data = pd.merge(yr_data, ttm_data, on=['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE'], how='outer')

    return data

################################################################################################
# Imputation Functions
################################################################################################
def _nu_ratio_is_implausible(transactions):
    """Return 1 when the overall New/Used ratio rows look implausible, else 0.

    Only rows with Vehicle Type 'All', Fuel Type 'All Types' and Name
    'New/Used Ratio' are inspected. The result is 1 when their mean is above 5
    or missing, when any value is above 20, or when any value is below 0.5.
    """
    overall_ratio = transactions[
        (transactions["Vehicle Type"] == 'All')
        & (transactions["Fuel Type"] == 'All Types')
        & (transactions["Name"] == 'New/Used Ratio')
    ]['Value']
    mean_ratio = overall_ratio.mean()
    if mean_ratio > 5 or pd.isna(mean_ratio) or overall_ratio.max() > 20 or overall_ratio.min() < 0.5:
        return 1
    return 0


def _rebuild_used_from_nu_ratio(transactions, highlighted_dealer, nu_ratio, group_cols):
    """Replace 'Used' rows with values derived from 'New' and a reference ratio.

    `transactions` is modified in place while the reference ratios are written,
    and a new frame is returned. `group_cols` are the columns identifying one
    New / Used / New/Used Ratio triple.
    """
    reference = nu_ratio[
        (nu_ratio['BRAND'] == highlighted_dealer['MAKE'])
        & (nu_ratio['YEAR'].isin(transactions['Year']))
        & (nu_ratio['STATECODE'] == highlighted_dealer['DEALER_STATE_ABBRV'])
    ]
    is_ratio = transactions['Name'] == 'New/Used Ratio'

    if not reference.empty:
        # Overwrite the ratio of every year that has a brand/state reference.
        for _, ref_row in reference.iterrows():
            transactions.loc[(transactions['Year'] == ref_row['YEAR']) & is_ratio, 'Value'] = ref_row['NEW_USED_RATIO']
    else:
        # No reference for this brand/state/years: fall back to a fixed ratio.
        transactions.loc[is_ratio, 'Value'] = 1.25

    kept = transactions[transactions['Name'] != 'Used']
    # Explicit copy: the derived rows are edited below and must not alias `kept`.
    derived = kept[kept['Name'] == 'New'].copy()
    for idx, new_row in derived.iterrows():
        # NOTE(review): the lookup matches Fuel Type / Year / Vehicle Type only,
        # so for monthly data the first ratio row of that year is used for every
        # month. Kept as in the original; confirm whether Month should match too.
        matching_ratio = kept[
            (kept['Fuel Type'] == new_row['Fuel Type'])
            & (kept['Year'] == new_row['Year'])
            & (kept["Vehicle Type"] == new_row['Vehicle Type'])
            & (kept["Name"] == 'New/Used Ratio')
        ]['Value'].iloc[0]
        derived.loc[idx, 'Value'] = new_row['Value'] / matching_ratio
    derived['Value'] = derived['Value'].astype(int)
    derived['Name'] = 'Used'
    combined = pd.concat([kept, derived])

    # Recompute the ratio per group. Groups are iterated explicitly (instead of
    # groupby.apply) so the grouping columns stay in the output regardless of
    # how the installed pandas treats grouping columns inside apply.
    pieces = []
    for _, group in combined.groupby(group_cols):
        group = group.copy()
        names = set(group['Name'])
        ratio_rows = group['Name'] == 'New/Used Ratio'
        if 'New' in names and 'Used' in names:
            new_value = group.loc[group['Name'] == 'New', 'Value'].values[0]
            used_value = group.loc[group['Name'] == 'Used', 'Value'].values[0]
            # NOTE(review): a derived Used of 0 yields inf/NaN here, as before.
            group.loc[ratio_rows, 'Value'] = new_value / used_value
        else:
            group.loc[ratio_rows, 'Value'] = None
        pieces.append(group)

    if not pieces:
        return combined.iloc[0:0]
    return pd.concat(pieces)


def dealer_transactions_single_imputation(highlighted_dealer, start, nu_ratio):
    """Impute 'Used' sales for one dealer when its New/Used ratio is implausible.

    Parameters
    ----------
    highlighted_dealer : mapping-like (dict or row)
        Must provide "Transactions" and "Transactions - TTM" (each either None
        or an iterable of record dicts with 'Year', 'Vehicle Type',
        'Fuel Type', 'Name', 'Value', plus 'Month' for TTM). 'MAKE' and
        'DEALER_STATE_ABBRV' are read only when imputation is triggered.
    start : str
        Date in '%Y-%m-%d' format. It is parsed (so a malformed value raises
        ValueError) but is otherwise unused.
    nu_ratio : pandas.DataFrame
        Reference ratios with columns 'BRAND', 'YEAR', 'STATECODE' and
        'NEW_USED_RATIO'.

    Returns
    -------
    tuple
        (annual_records, ttm_records, flag_1, flag_2). Each records entry is a
        list of dicts, or None when the corresponding input was None. flag_1 /
        flag_2 are 1 when the annual / TTM data was imputed, else 0.
    """
    # Parsed only to fail fast on a malformed `start`; the value is not used.
    datetime.strptime(start, '%Y-%m-%d')

    annual_raw = highlighted_dealer["Transactions"]
    ttm_raw = highlighted_dealer["Transactions - TTM"]
    tr = None
    tr_ttm = None
    flag_1 = 0
    flag_2 = 0

    if annual_raw is not None:
        tr = pd.DataFrame(list(annual_raw))
    if ttm_raw is not None:
        tr_ttm = pd.DataFrame(list(ttm_raw))

    if tr is not None:
        flag_1 = _nu_ratio_is_implausible(tr)
    if tr_ttm is not None:
        flag_2 = _nu_ratio_is_implausible(tr_ttm)

    # imputing for annual transactions
    if flag_1 == 1:
        tr = _rebuild_used_from_nu_ratio(tr, highlighted_dealer, nu_ratio, ['Fuel Type', 'Vehicle Type', 'Year'])

    # imputing for ttm transactions
    if flag_2 == 1:
        tr_ttm = _rebuild_used_from_nu_ratio(tr_ttm, highlighted_dealer, nu_ratio, ['Fuel Type', 'Vehicle Type', 'Month', 'Year'])

    # A missing input maps to None on output; this also covers the case where
    # both inputs are None, which previously raised UnboundLocalError.
    annual_out = None if tr is None else tr.to_dict(orient='records')
    ttm_out = None if tr_ttm is None else tr_ttm.to_dict(orient='records')
    return annual_out, ttm_out, flag_1, flag_2

################################################################################################
# Sanity checks
################################################################################################
def sanity_check_single(df, target_year=2024, valid_years=(2019, 2020, 2021, 2022, 2023, 2024)):
    """Flag each dealer row by comparing annual and TTM overall 'New' sales.

    For every row, the overall (Fuel Type 'All Types', Vehicle Type 'All')
    'New' value of `target_year` in "Transactions" is compared with the sum of
    the matching 'New' values in "Transactions - TTM".

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'tr-imputed', 'ttm-imputed', 'Transactions', 'Transactions - TTM',
        'MAKE' and the DEALER_* columns used below.
    target_year : int, optional
        Year whose totals are compared (default 2024).
    valid_years : iterable of int, optional
        Records outside these years are ignored (default 2019-2024).

    Returns
    -------
    pandas.DataFrame
        `df` without the two '*-imputed' columns, plus 'flag' and 'diff':
        None - row has no "Transactions";
        0 - totals are equal, or neither side has target-year overall New sales;
        1 - totals differ by at most 20;
        2 - annual total exists but "Transactions - TTM" is null;
        3 - exactly one side has a target-year total;
        4 - totals differ by more than 20;
        5 - TTM has no usable 'New' rows, or TTM is null and there is no
            annual target-year total.
        Flags are written to every row sharing the same DEALER_NAME and
        DEALER_ADDRESS_FULL.
    """
    transaction_cols = ['Transactions', 'Transactions - TTM']
    ttm_group_cols = ['DEALER_NAME', 'DEALER_ADDRESS', 'DEALER_TOWN_NAME', 'DEALER_COUNTY',
                      'DEALER_STATE_ABBRV', 'DEALER_ZIP', 'DEALER_ADDRESS_FULL', 'MAKE', 'Fuel Type',
                      'Name', 'Vehicle Type', 'Year']

    def _new_sales_rows(row_frame, column):
        # Flatten the records stored in `column` and keep complete 'New' rows.
        exploded = row_frame.explode(column).reset_index(drop=True)
        records = pd.json_normalize(exploded[column])
        # Both transaction columns are removed before dropna(): otherwise a null
        # "Transactions - TTM" cell would discard every annual row and flag 2
        # could never be produced.
        flat = pd.concat([exploded.drop(columns=transaction_cols), records], axis=1)
        flat = flat.dropna(subset=['Year'])
        flat = flat.assign(Year=flat['Year'].astype(int))
        flat = flat[flat['Year'].isin(list(valid_years))]
        new_rows = flat[flat['Name'] == 'New'].copy()
        # Plain assignment instead of a chained inplace replace on a slice.
        new_rows['Value'] = new_rows['Value'].replace([np.inf, -np.inf], np.nan)
        # Drops rows with a missing value in any remaining column.
        return new_rows.dropna()

    def _overall_for_target_year(frame):
        return frame[(frame['Fuel Type'] == 'All Types') & (frame['Vehicle Type'] == 'All') & (frame['Year'] == target_year)]

    data1 = df.drop(['tr-imputed', 'ttm-imputed'], axis=1)
    for _, row in data1.iterrows():
        row_frame = pd.DataFrame([row], columns=row.index)
        same_dealer = (data1['DEALER_NAME'] == row['DEALER_NAME']) & (data1['DEALER_ADDRESS_FULL'] == row['DEALER_ADDRESS_FULL'])
        outcome = None  # [flag, diff]; stays None when nothing should be written

        if not row_frame['Transactions'].notnull().any():
            outcome = [None, None]
        else:
            annual = _overall_for_target_year(_new_sales_rows(row_frame, 'Transactions'))

            if row_frame['Transactions - TTM'].notnull().any():
                ttm = _new_sales_rows(row_frame, 'Transactions - TTM')
                if ttm.empty:
                    # TTM present but without any usable 'New' row.
                    outcome = [5, None]
                else:
                    ttm = _overall_for_target_year(ttm)
                    # Sum the monthly values (numeric columns only).
                    ttm = ttm.groupby(ttm_group_cols).sum(numeric_only=True).reset_index()

                    if annual.empty and ttm.empty:
                        # No difference can be computed; `diff` is left empty
                        # instead of reusing a value from an earlier row.
                        outcome = [0, None]
                    elif annual.empty or ttm.empty:
                        outcome = [3, None]
                    else:
                        diff = annual['Value'].iloc[0] - ttm['Value'].iloc[0]
                        if diff == 0:
                            outcome = [0, diff]
                        elif -20 <= diff <= 20:
                            outcome = [1, diff]
                        elif diff > 20 or diff < -20:
                            outcome = [4, diff]
            elif annual.empty:
                # No TTM and no annual target-year total.
                outcome = [5, None]
            else:
                outcome = [2, None]

        if outcome is not None:
            data1.loc[same_dealer, ['flag', 'diff']] = outcome

    return data1

def sanity_check_multi(df_multi, df_single, target_year=2024, valid_years=(2019, 2020, 2021, 2022, 2023, 2024)):
    """Flag dealers by comparing annual totals (df_multi) with TTM totals (df_single).

    Overall (Fuel Type 'All Types', Vehicle Type 'All') 'New' values of
    `target_year` are summed per dealer from `df_multi["Transactions"]` and
    from `df_single["Transactions - TTM"]`, outer-joined, and compared.

    Parameters
    ----------
    df_multi : pandas.DataFrame
        Needs 'imputed', 'Transactions' and the DEALER_* columns used below.
    df_single : pandas.DataFrame
        Needs 'Transactions', 'tr-imputed', 'ttm-imputed',
        'Transactions - TTM' and the DEALER_* columns used below.
    target_year : int, optional
        Year whose totals are compared (default 2024).
    valid_years : iterable of int, optional
        Records outside these years are ignored (default 2019-2024).

    Returns
    -------
    pandas.DataFrame
        `df_single` without 'Transactions', 'tr-imputed' and 'ttm-imputed',
        plus 'flag' and 'diff' (annual minus TTM):
        0 - totals are equal, or the dealer was not compared at all;
        1 - totals differ by at most 20;
        3 - one of the two totals is missing;
        4 - totals differ by more than 20.
        Flags are written to every row sharing the same DEALER_NAME and
        DEALER_ADDRESS_FULL.
    """
    dealer_keys = ['DEALER_NAME', 'DEALER_ADDRESS', 'DEALER_TOWN_NAME', 'DEALER_COUNTY',
                   'DEALER_STATE_ABBRV', 'DEALER_ZIP', 'DEALER_ADDRESS_FULL',
                   'Fuel Type', 'Name', 'Vehicle Type', 'Year']

    def _overall_new_totals(frame, column, value_name):
        # Flatten the records in `column` and sum overall 'New' sales per dealer.
        exploded = frame.explode(column).reset_index(drop=True)
        records = pd.json_normalize(exploded[column])
        flat = pd.concat([exploded.drop(columns=[column]), records], axis=1)
        flat = flat.dropna(subset=['Year'])
        flat = flat.assign(Year=flat['Year'].astype(int))
        flat = flat[flat['Year'].isin(list(valid_years))]

        new_rows = flat[flat['Name'] == 'New'].copy()
        # Plain assignment instead of a chained inplace replace on a slice.
        new_rows['Value'] = new_rows['Value'].replace([np.inf, -np.inf], np.nan)
        # Drops rows with a missing value in any column.
        new_rows = new_rows.dropna()
        new_rows = new_rows[(new_rows['Fuel Type'] == 'All Types') & (new_rows['Vehicle Type'] == 'All') & (new_rows['Year'] == target_year)]
        totals = new_rows.groupby(dealer_keys).sum(numeric_only=True).reset_index()
        return totals.rename(columns={'Value': value_name})

    data1 = df_multi.drop(['imputed'], axis=1)
    data2 = df_single.drop(['Transactions', 'tr-imputed', 'ttm-imputed'], axis=1)

    annual = _overall_new_totals(data1, 'Transactions', 'annual')
    # The summed 'Month' column carries no meaning once months are aggregated.
    ttm = _overall_new_totals(data2, 'Transactions - TTM', 'ttm').drop('Month', axis=1)

    merged = annual.merge(ttm, on=dealer_keys, how='outer')

    for _, row in merged.iterrows():
        same_dealer = (data2['DEALER_NAME'] == row['DEALER_NAME']) & (data2['DEALER_ADDRESS_FULL'] == row['DEALER_ADDRESS_FULL'])
        # pd.isna is required here: `x == np.nan` is always False, which made
        # the "missing side" flag unreachable.
        if pd.isna(row['annual']) or pd.isna(row['ttm']):
            outcome = [3, None]
        else:
            diff = row['annual'] - row['ttm']
            if diff == 0:
                outcome = [0, diff]
            elif -20 <= diff <= 20:
                outcome = [1, diff]
            else:
                outcome = [4, diff]
        data2.loc[same_dealer, ['flag', 'diff']] = outcome

    # Nothing was compared (e.g. no target-year data at all): still return the
    # documented columns instead of failing on a missing 'flag' column.
    if 'flag' not in data2.columns:
        data2['flag'] = np.nan
        data2['diff'] = np.nan

    data2['flag'] = data2['flag'].fillna(0)

    return data2

def sanity_check_single_v2(df, target_year=2024, valid_years=(2019, 2020, 2021, 2022, 2023, 2024)):
    """Flag each dealer/make row by comparing annual and TTM overall 'New' sales.

    For every row, the overall (Fuel Type 'All Types', Vehicle Type 'All')
    'New' value of `target_year` in "Transactions" is compared with the sum of
    the matching 'New' values in "Transactions - TTM".

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'tr-imputed', 'ttm-imputed', 'Transactions', 'Transactions - TTM',
        'MAKE' and the DEALER_* columns used below.
    target_year : int, optional
        Year whose totals are compared (default 2024).
    valid_years : iterable of int, optional
        Records outside these years are ignored (default 2019-2024).

    Returns
    -------
    pandas.DataFrame
        `df` without the two '*-imputed' columns, plus 'flag' and 'diff':
        None - row has no "Transactions";
        0 - totals are equal, or neither side has target-year overall New sales;
        1 - totals differ by at most 20;
        2 - annual total exists but "Transactions - TTM" is null;
        3 - exactly one side has a target-year total;
        4 - totals differ by more than 20;
        5 - TTM has no usable 'New' rows, or TTM is null and there is no
            annual target-year total.
        Flags are written to every row sharing the same DEALER_NAME,
        DEALER_ADDRESS_FULL and MAKE.
    """
    transaction_cols = ['Transactions', 'Transactions - TTM']
    ttm_group_cols = ['DEALER_NAME', 'DEALER_ADDRESS', 'DEALER_TOWN_NAME', 'DEALER_COUNTY',
                      'DEALER_STATE_ABBRV', 'DEALER_ZIP', 'DEALER_ADDRESS_FULL', 'MAKE', 'Fuel Type',
                      'Name', 'Vehicle Type', 'Year']

    def _new_sales_rows(row_frame, column):
        # Flatten the records stored in `column` and keep complete 'New' rows.
        exploded = row_frame.explode(column).reset_index(drop=True)
        records = pd.json_normalize(exploded[column])
        # Both transaction columns are removed before dropna(): otherwise a null
        # "Transactions - TTM" cell would discard every annual row and flag 2
        # could never be produced.
        flat = pd.concat([exploded.drop(columns=transaction_cols), records], axis=1)
        flat = flat.dropna(subset=['Year'])
        flat = flat.assign(Year=flat['Year'].astype(int))
        flat = flat[flat['Year'].isin(list(valid_years))]
        new_rows = flat[flat['Name'] == 'New'].copy()
        # Plain assignment instead of a chained inplace replace on a slice.
        new_rows['Value'] = new_rows['Value'].replace([np.inf, -np.inf], np.nan)
        # Drops rows with a missing value in any remaining column.
        return new_rows.dropna()

    def _overall_for_target_year(frame):
        return frame[(frame['Fuel Type'] == 'All Types') & (frame['Vehicle Type'] == 'All') & (frame['Year'] == target_year)]

    data1 = df.drop(['tr-imputed', 'ttm-imputed'], axis=1)
    for _, row in data1.iterrows():
        row_frame = pd.DataFrame([row], columns=row.index)
        same_dealer_make = (
            (data1['DEALER_NAME'] == row['DEALER_NAME'])
            & (data1['DEALER_ADDRESS_FULL'] == row['DEALER_ADDRESS_FULL'])
            & (data1['MAKE'] == row['MAKE'])
        )
        outcome = None  # [flag, diff]; stays None when nothing should be written

        if not row_frame['Transactions'].notnull().any():
            outcome = [None, None]
        else:
            annual = _overall_for_target_year(_new_sales_rows(row_frame, 'Transactions'))

            if row_frame['Transactions - TTM'].notnull().any():
                ttm = _new_sales_rows(row_frame, 'Transactions - TTM')
                if ttm.empty:
                    # TTM present but without any usable 'New' row.
                    outcome = [5, None]
                else:
                    ttm = _overall_for_target_year(ttm)
                    # Sum the monthly values (numeric columns only).
                    ttm = ttm.groupby(ttm_group_cols).sum(numeric_only=True).reset_index()

                    if annual.empty and ttm.empty:
                        # No difference can be computed; `diff` is left empty
                        # instead of reusing a value from an earlier row.
                        outcome = [0, None]
                    elif annual.empty or ttm.empty:
                        outcome = [3, None]
                    else:
                        diff = annual['Value'].iloc[0] - ttm['Value'].iloc[0]
                        if diff == 0:
                            outcome = [0, diff]
                        elif -20 <= diff <= 20:
                            outcome = [1, diff]
                        elif diff > 20 or diff < -20:
                            outcome = [4, diff]
            elif annual.empty:
                # No TTM and no annual target-year total.
                outcome = [5, None]
            else:
                outcome = [2, None]

        if outcome is not None:
            data1.loc[same_dealer_make, ['flag', 'diff']] = outcome

    return data1

# Columns that identify one dealer/make/segment total in both the annual and
# the TTM aggregates; the two aggregates are merged on exactly these keys.
_SANITY_GROUP_KEYS = [
    'DEALER_NAME', 'DEALER_ADDRESS', 'DEALER_TOWN_NAME', 'DEALER_COUNTY',
    'DEALER_STATE_ABBRV', 'DEALER_ZIP', 'DEALER_ADDRESS_FULL',
    'Fuel Type', 'Name', 'Vehicle Type', 'Year', 'MAKE',
]


def _total_new_sales_for_year(frame, records_column, total_name, year):
    """Flatten `records_column` and total the 'New' / 'All Types' / 'All' values for `year`.

    Each cell of `records_column` is exploded into one row per record, the
    records are normalised into columns and the matching 'Value' entries are
    summed per `_SANITY_GROUP_KEYS`. The summed column is named `total_name`.
    """
    exploded = frame.explode(records_column).reset_index(drop=True)
    records = pd.json_normalize(exploded[records_column])
    flat = pd.concat([exploded.drop(columns=[records_column]), records], axis=1)
    flat = flat.dropna(subset=['Year']).astype({'Year': int})

    new_sales = flat[flat['Name'] == 'New'].copy()
    # Assign the result back instead of using a chained `inplace=True`, which
    # is not guaranteed to touch the frame it was selected from.
    new_sales['Value'] = new_sales['Value'].replace([np.inf, -np.inf], np.nan)
    new_sales = new_sales.dropna()
    new_sales = new_sales[
        (new_sales['Fuel Type'] == 'All Types')
        & (new_sales['Vehicle Type'] == 'All')
        & (new_sales['Year'] == year)
    ]

    totals = new_sales.groupby(_SANITY_GROUP_KEYS)['Value'].sum().reset_index()
    return totals.rename(columns={'Value': total_name})


def sanity_check_multi_v2(df_multi, df_single, *, year=2024, tolerance=20):
    """Compare annual totals from `df_multi` with TTM totals from `df_single`.

    For every dealer/make, the summed 'New' value for `year` taken from the
    'Transactions' records of `df_multi` ("annual") is compared with the one
    taken from the 'Transactions - TTM' records of `df_single` ("ttm").

    Parameters
    ----------
    df_multi : pandas.DataFrame
        Must contain the dealer key columns, 'Transactions' and 'imputed'.
    df_single : pandas.DataFrame
        Must contain the dealer key columns, 'Transactions',
        'Transactions - TTM', 'tr-imputed' and 'ttm-imputed'.
    year : int, keyword-only
        Year whose totals are compared (default 2024).
    tolerance : number, keyword-only
        Largest absolute difference that still counts as a small mismatch
        (default 20).

    Returns
    -------
    pandas.DataFrame
        `df_single` without 'Transactions', 'tr-imputed' and 'ttm-imputed',
        plus two columns:

        * 'diff' - annual minus ttm (NaN when not comparable)
        * 'flag' - 0: totals equal, or the dealer has no comparable total
          1: |diff| <= tolerance; 4: |diff| > tolerance;
          3: only one of annual / ttm is available.
    """
    data1 = df_multi.drop(['imputed'], axis=1)
    data2 = df_single.drop(['Transactions', 'tr-imputed', 'ttm-imputed'], axis=1)

    annual_totals = _total_new_sales_for_year(data1, 'Transactions', 'annual', year)
    ttm_totals = _total_new_sales_for_year(data2, 'Transactions - TTM', 'ttm', year)
    merged = annual_totals.merge(ttm_totals, on=_SANITY_GROUP_KEYS, how='outer')

    # Create the output columns up front so the final fillna also works when
    # the merge produced no rows at all.
    for column in ('flag', 'diff'):
        if column not in data2.columns:
            data2[column] = np.nan

    rows = zip(
        merged['DEALER_NAME'], merged['DEALER_ADDRESS_FULL'], merged['MAKE'],
        merged['annual'], merged['ttm'],
    )
    for dealer_name, address_full, make, annual, ttm in rows:
        # NaN never compares equal to anything (including itself), so missing
        # totals have to be detected with pd.isna rather than `== np.nan`.
        if pd.isna(annual) or pd.isna(ttm):
            flag, diff = 3, np.nan
        else:
            diff = annual - ttm
            if diff == 0:
                flag = 0
            elif abs(diff) <= tolerance:
                flag = 1
            else:
                flag = 4

        mask = (
            (data2['DEALER_NAME'] == dealer_name)
            & (data2['DEALER_ADDRESS_FULL'] == address_full)
            & (data2['MAKE'] == make)
        )
        if not mask.any():
            # The total came from df_multi for a dealer absent from df_single.
            continue
        data2.loc[mask, 'flag'] = flag
        data2.loc[mask, 'diff'] = diff

    # Dealers without any comparable total are treated as "no issue".
    data2['flag'] = data2['flag'].fillna(0)

    return data2



In [0]:

calc_type = 'transactions'
print(f'Calculating {calc_type}...')

# Only the 'transactions' calculation is implemented in this cell. Fail fast
# instead of reaching the save step with `out` undefined or left over from an
# earlier cell run.
if calc_type != 'transactions':
    raise ValueError(f'Unsupported calc_type: {calc_type!r}')

# YYMM tag of the reporting month, shared by every file name below.
period_yymm = end_date.strftime("%y%m")

all_dealers = pd.read_parquet(f'/dbfs/jump-datasets/pipeline_data/base_files/dealers_{start_date.strftime("%y%m")}_{period_yymm}_make_sales.parquet')

filepath = f'dbfs:/jump-datasets/datasets/polk_2019-01-01_{end}.parquet'
polk_df = spark.read.parquet(filepath)
polk_data = polk_df.filter(F.col('REPORT_YEAR_MONTH') <= end)

reassign_df_all = pd.read_parquet(f'/dbfs/jump-datasets/pipeline_data/base_files/multi_brand_dealer_sales_readjusted_{period_yymm}.parquet').sort_values(['DEALER_NAME', 'YEAR'])

# fill np.nan values in 'NEW_SALES_FINAL' and 'USED_SALES_FINAL_ADJUSTED' with 'NEW_SALES' and 'USED_SALES' of the respective rows
reassign_df_all = reassign_df_all.progress_apply(reassign_fillna, axis=1)
reassign_df_all = reassign_df_all.progress_apply(check_sales, axis=1)

start_time = perf_counter()
# One output file per state group; the group list drives the loop so adding or
# removing a group in `state_names` / `states` needs no change here.
for state_name, state_group in zip(state_names, states):
    savepath = f'/dbfs/jump-datasets/pipeline_data/transactions/dealer_{calc_type}_{period_yymm}_{state_name}_v5_single_annual_and_ttm_both_reassigned.parquet'

    dealers = all_dealers[all_dealers['DEALER_STATE_ABBRV'].isin(state_group)].reset_index(drop=True)
    print(f'Dealers: {len(dealers):,}')
    print(f"{len(dealers):,} - {list(dealers['DEALER_STATE_ABBRV'].sort_values().unique())}")

    polk_dataframe = polk_data.filter(F.col('DEALER_STATE_ABBRV').isin(state_group))

    reassign_df = reassign_df_all[reassign_df_all['STATE'].isin(state_group)].reset_index(drop=True)
    out = dealer_transactions_single_brand_v5_new(spark.createDataFrame(dealers), polk_dataframe, reassign_df, end)
    # Re-attach the dealer attributes; the dealers' own MAKE is renamed so it
    # does not clash with the MAKE column selected from `out` below.
    out = pd.merge(out, dealers.rename(columns={'MAKE': 'PRIMARY_MAKE'}), on=['DEALER_NAME', 'DEALER_ADDRESS_FULL'])
    out = out[[
        'DEALER_NAME', 'DEALER_ADDRESS', 'DEALER_TOWN_NAME', 'DEALER_COUNTY', 'DEALER_STATE_ABBRV', 'DEALER_ZIP', 'DEALER_ADDRESS_FULL', 'MAKE',
        'Transactions', 'Transactions - TTM'
    ]]

    out.to_parquet(savepath, index=False)
    print(f'Saved: {savepath}')
    spark.catalog.clearCache()

end_time = perf_counter()

# The state groups are processed one after another, not in parallel.
print(f"Execution took {end_time - start_time:.4f} seconds")



In [0]:

calc_type = 'transactions'

# YYMM tag of the reporting month, shared by every file name below.
period_yymm = end_date.strftime("%y%m")

nu_ratio = pd.read_parquet(f'/dbfs/jump-datasets/pipeline_data/base_files/nu_ratio_state_brand_year_{period_yymm}.parquet')
results = []
for state_name in state_names:
    # Input and output differ only by the "_imputed" suffix.
    source_stem = f'/dbfs/jump-datasets/pipeline_data/transactions/dealer_{calc_type}_{period_yymm}_{state_name}_v5_single_annual_and_ttm_both_reassigned'
    savepath = f'{source_stem}_imputed.parquet'
    data = pd.read_parquet(f'{source_stem}.parquet')

    print(f"{state_name}", "-----", len(data))
    out = []
    failed_rows = 0
    for i, row in tqdm(data.iterrows(), total=len(data), desc='Processing'):
        highlighted_dealer = row.to_dict()
        try:
            (
                highlighted_dealer['Transactions'],
                highlighted_dealer['Transactions - TTM'],
                highlighted_dealer['tr-imputed'],
                highlighted_dealer['ttm-imputed'],
            ) = dealer_transactions_single_imputation(highlighted_dealer, past_3_months, nu_ratio)
        except Exception as e:
            # Best-effort: a failing dealer must not abort the whole state.
            # The row is still written, with both imputed markers set to 0
            # (its transactions stay as loaded unless the helper changed the
            # dict before failing).
            failed_rows += 1
            print(f"Error processing row {i}: {e}")
            highlighted_dealer['tr-imputed'] = 0
            highlighted_dealer['ttm-imputed'] = 0

        out.append(highlighted_dealer)

    if failed_rows:
        print(f"{state_name}: {failed_rows} of {len(data)} rows failed imputation")

    result = pd.DataFrame(out)
    results.append(result)
    result.to_parquet(savepath, index=False)
    print("saved: ", savepath)



In [0]:

# -----------------------------------------------------------------------------
# sanity check for single brand dealers

calc_type = 'transactions'
# YYMM tag of the reporting month, shared by every file name below.
period_yymm = end_date.strftime("%y%m")
# Flag values (including "no flag") that mark a dealer for manual review.
review_flags = [np.nan, 2, 3, 4]

# Collect the per-state results and concatenate once, instead of growing a
# DataFrame inside the loop.
single_results = []
for state_id, state_name in enumerate(state_names):
    bad_dealers_list = bad_dealers[state_id]
    data = pd.read_parquet(f'/dbfs/jump-datasets/pipeline_data/transactions/dealer_{calc_type}_{period_yymm}_{state_name}_v5_single_annual_and_ttm_both_reassigned_imputed.parquet')
    data['MAKE'] = data['MAKE'].replace(brand_rename_dict)
    data = data[~data['DEALER_ADDRESS_FULL'].isin(bad_dealers_list)]

    result_single = sanity_check_single_v2(data)
    single_results.append(result_single)

data_sanity_single_all = pd.concat(single_results, axis=0) if single_results else pd.DataFrame()
data_sanity_single = data_sanity_single_all[data_sanity_single_all['flag'].isin(review_flags)]

# ------------------------------------------------------------------------------
# sanity check for multi brand dealers

multi_results = []
error_parts = []

for state_id, state_name in enumerate(state_names):
    bad_dealers_list = bad_dealers[state_id]

    data_m = pd.read_parquet(f'/dbfs/jump-datasets/pipeline_data/transactions/dealer_{calc_type}_{period_yymm}_{state_name}_v3_multi_annual_reassigned_imputed.parquet')
    data_m['MAKE'] = data_m['MAKE'].replace(brand_rename_dict)
    data_m = data_m[~data_m['DEALER_ADDRESS_FULL'].isin(bad_dealers_list)]

    data_s = pd.read_parquet(f'/dbfs/jump-datasets/pipeline_data/transactions/dealer_{calc_type}_{period_yymm}_{state_name}_v5_single_annual_and_ttm_both_reassigned_imputed.parquet')
    data_s['MAKE'] = data_s['MAKE'].replace(brand_rename_dict)
    data_s = data_s[~data_s['DEALER_ADDRESS_FULL'].isin(bad_dealers_list)]

    # Keep only the single-brand rows whose dealer/make also appears in the
    # multi-brand file; right-side rows without a single-brand match have no
    # DEALER_ADDRESS and are dropped.
    multi_keys = data_m[['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE']].drop_duplicates(subset=['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE'])
    data_s = data_s.merge(multi_keys, on=['DEALER_NAME', 'DEALER_ADDRESS_FULL', 'MAKE'], how='right').dropna(subset=['DEALER_ADDRESS'])

    # Rows are collected when their name, address and make are NOT all present
    # in the flagged single-brand list (same selection as checking row by row).
    # NOTE(review): each column is matched independently, so a row counts as
    # present when its three values occur anywhere in the flagged list, not
    # necessarily on the same row - confirm this is intended.
    # NOTE(review): this adds rows that are absent from the flagged list;
    # confirm the direction of the test matches the intent of `dealers_error`.
    in_flagged_single = (
        data_s['DEALER_NAME'].isin(data_sanity_single['DEALER_NAME'])
        & data_s['DEALER_ADDRESS_FULL'].isin(data_sanity_single['DEALER_ADDRESS_FULL'])
        & data_s['MAKE'].isin(data_sanity_single['MAKE'])
    )
    unmatched = data_s[~in_flagged_single]
    if not unmatched.empty:
        error_parts.append(unmatched)

    result_multi = sanity_check_multi_v2(data_m, data_s)
    multi_results.append(result_multi)

data_sanity_multi_all = pd.concat(multi_results, axis=0) if multi_results else pd.DataFrame()
data_sanity_multi = data_sanity_multi_all[data_sanity_multi_all['flag'].isin(review_flags)]
dealers_error = pd.concat([*error_parts, data_sanity_multi], axis=0)

# # ----------------------------------------------------------------------------
# saving the final list of all the dealers that need to be checked again

dealers_error.to_parquet(f'/dbfs/jump-datasets/pipeline_data/sanity_checks/transactions/all_dealers_transactions_error_v2_{period_yymm}.parquet')



In [0]:
# Bare expression: evaluates `dealers_error` so the notebook can render it as
# this cell's output (presumably for a quick visual check - no data is changed).
dealers_error