In [1]:
import pandas as pd
import numpy as np           
import plotly.express as px
import os

In [2]:
# Module-level scratch lists filled in by the aggregation loop:
#   groups        - one aggregated frame per yearly file
#   subgroups_reg - frames collected at the region level
#   subgroups_emb - frames collected at the embassy level
groups, subgroups_reg, subgroups_emb = [], [], []
def cases(s):
    """Count the entries of ``s`` that are greater than or equal to 1.

    Used as a groupby aggregation: each status column is expected to hold a
    value >= 1 for rows that represent a real case.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to test. Missing values (NaN / pd.NA) are never counted.

    Returns
    -------
    int
        Number of entries with a value >= 1 (0 for an empty series).
    """
    # Vectorised sum instead of the built-in sum(): no Python-level iteration,
    # and a missing value in a nullable column is skipped rather than turning
    # the whole count into pd.NA.
    return int((s >= 1).sum())

def holes(s):
    """Return how many entries of the series ``s`` are missing (NaN/None)."""
    missing_mask = s.isna()
    return int(missing_mask.sum())
# Aggregation spec for the area-plot totals: missing 'status' values are counted
# as holes, and every listed status column is counted with cases().
# The 'InTransit' and 'Transfer' columns are deliberately left out.
agg_dict = {'status': [holes]}
agg_dict.update(
    {
        status_column: [cases]
        for status_column in ('Issued', 'Refused', 'Refused221g', 'Ready', 'NVC', 'AP')
    }
)
def restructure_df(df):
    """Flatten an aggregated frame into plain columns with short names.

    The index levels are moved into regular columns and all eleven columns are
    then renamed: four key columns followed by seven count columns. ``df`` is
    modified in place and the same object is returned.
    """
    key_columns = ['year', 'region', 'consulate', 'Case_ranges']
    count_columns = ['Hole_C', 'Iss_C', 'Ref_C', '221g_C', 'Rdy_C', 'NVC_C', 'AP_C']
    df.reset_index(inplace=True)
    df.columns = key_columns + count_columns
    return df

In [None]:
# Loop through the cleaned pkl files (one per year). For every file, the case
# numbers of each embassy are cut into bins, the binned embassy frames are
# gathered per region and per file, and the file is then aggregated by
# year / region / consulate / case range. One aggregated frame per file ends up
# in `groups`.
#
# NOTE(review): pd.read_pickle can execute arbitrary code while loading; only
# point this at pkl files produced by your own cleaning step.

# Start from an empty result list so that re-running this cell does not append
# the same years a second time.
groups = []

# os.scandir yields entries in arbitrary order; sorting by name makes the order
# of `groups` (and therefore groups[-1]) reproducible.
for entry in sorted(os.scandir("ceac_pkl"), key=lambda e: e.name):
    if not (entry.name.endswith('.pkl') and not entry.name.startswith('.') and entry.is_file()):
        continue

    ceac = pd.read_pickle(entry.path)
    # The four characters before ".pkl" in the file name are used as the year label.
    ceac.insert(0, 'year', entry.name[-8:-4])

    # Reset per file: otherwise frames from files processed earlier would be
    # concatenated and aggregated again together with this one.
    subgroups_reg = []
    for region in ceac['region'].dropna().unique():
        sub_ceac_reg = ceac[ceac['region'] == region]

        # Reset per region: otherwise every region would also carry the
        # embassies of all regions handled before it, duplicating their rows.
        subgroups_emb = []
        for emb in sub_ceac_reg['consulate'].dropna().unique():
            # Explicit copy so that adding a column does not touch a slice of `ceac`.
            sub_ceac_emb = sub_ceac_reg[sub_ceac_reg['consulate'] == emb].copy()

            # select and cut case numbers into bins
            case_num = sub_ceac_emb['caseNumber']
            if not case_num.notna().any():
                # No usable case number: the bin count cannot be computed, and
                # rows without a Case_ranges key would be dropped by the
                # groupby below anyway.
                continue
            # Highest case number rounded to the nearest thousand, then one bin
            # per thousand, with at least one bin.
            maxcase_emb = round(case_num.max(), -3)
            num_bins_emb = max(1, int(round(maxcase_emb / 1000)))
            case_ranges_emb = pd.cut(case_num, num_bins_emb, precision=0)
            sub_ceac_emb.insert(1, 'Case_ranges', case_ranges_emb)
            subgroups_emb.append(sub_ceac_emb)

        # append all embassies for a given region (pd.concat rejects an empty list)
        if subgroups_emb:
            subgroups_reg.append(pd.concat(subgroups_emb))

    if not subgroups_reg:
        # Nothing to aggregate in this file.
        continue

    # append all regions of this file and aggregate them
    subgroups_reg_concat = pd.concat(subgroups_reg)
    grouped = subgroups_reg_concat.groupby(
        ['year', 'region', 'consulate', 'Case_ranges'], observed=True
    ).agg(agg_dict)
    grouped = grouped.dropna()
    groups.append(restructure_df(grouped))

In [None]:
# Spot-check the frame that was appended to `groups` last; only the first rows
# are shown so the cell output stays small.
groups[-1].head(10)

In [None]:
# Concatenate all the years into one table.
# ignore_index=True gives the result a fresh 0..n-1 index: every per-year frame
# has its own index starting at 0, so keeping those would leave duplicate
# index labels in the combined table.
ceac_concat = pd.concat(groups, ignore_index=True)
ceac_concat.head(10)

In [None]:
# Convert Case_ranges to str so the bins can be used as axis labels when plotting
case_range_labels = ceac_concat['Case_ranges'].astype(str)
ceac_concat['Case_ranges'] = case_range_labels

In [None]:
# Narrow the combined table down to a single year, region and embassy for plotting
year_mask = ceac_concat['year'] == "2024"
region_mask = ceac_concat['region'] == "AF"
consulate_mask = ceac_concat['consulate'] == "ACC"
in_year_and_region_and_emb = year_mask & region_mask & consulate_mask
area_data = ceac_concat[in_year_and_region_and_emb]
area_data

In [None]:
# Area chart of the count columns over the case-number bins of the selected embassy.
# NOTE(review): the slice [4:-1] skips the four leading columns and also the very
# last column - confirm that leaving the last count column out of the plot is intended.
stacked_columns = area_data.columns[4:-1]
fig_area_emb = px.area(area_data, x=area_data['Case_ranges'], y=stacked_columns)
fig_area_emb.show()

In [None]:
# Save the combined table to pickle
directory = 'ceac_pkl/aggregations/'
# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, so a single write covers both situations.
os.makedirs(directory, exist_ok=True)
ceac_concat.to_pickle(directory + 'consulate_case_ranges_input.pkl')