## Kernel to load: vax_inc_general

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import binomtest
import pycountry
from functools import reduce
import random
import os
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [2]:
# Shared inputs live in "Common Source Data", under the parent of the
# directory the notebook is run from.
notebook_dir = os.path.split(os.getcwd())[0]
source_data_path = os.path.join(notebook_dir, "Common Source Data")

In [3]:
# Load the swine vaccination reports and leave out the Jul-Dec 2024 semester.
df_start = pd.read_csv(
    os.path.join(source_data_path, 'swine/AllRegions_swine_MoreVaccinationData.csv')
)
df_start = df_start.loc[df_start['Semester'].ne('Jul-Dec 2024')]
df_start

Unnamed: 0,Year,Semester,Region,Country,Disease,Animal category,Species,Vaccine type,Number of vaccinated
0,2005,Jul-Dec 2005,Asia,Afghanistan,Foot and mouth disease virus (Inf. with),Domestic,Swine,-,-
1,2005,Jul-Dec 2005,Asia,Armenia,Anthrax,Domestic,Swine,-,26613
2,2005,Jul-Dec 2005,Asia,Armenia,Classical swine fever virus (Inf. with),Domestic,Swine,-,262994
3,2005,Jul-Dec 2005,Asia,Armenia,African swine fever virus (Inf. with),Domestic,Swine,-,-
4,2005,Jul-Dec 2005,Asia,Armenia,Rabies virus (Inf. with),Domestic,Swine,-,-
...,...,...,...,...,...,...,...,...,...
4453,2024,Jan-Jun 2024,Asia,Cambodia,Classical swine fever virus (Inf. with),Domestic,Swine,-,-
4454,2024,Jan-Jun 2024,Asia,Cambodia,Foot and mouth disease virus (Inf. with),Domestic,Swine,-,-
4455,2024,Jan-Jun 2024,Europe,Latvia,Porcine reproductive and respiratory syndrome ...,Domestic,Swine,-,-
4456,2024,Jan-Jun 2024,Africa,Mozambique,Brucella abortus (Inf. with),Domestic,Swine,-,-


In [4]:
# Diseases for which no vaccine exists: any vaccination record reported
# against one of these names is removed in the next cell.
diseases_no_vaccine = [
    'African cattle fever virus (Inf. with)',
    'Avian tuberculosis (-2005)',
    'Bovine spongiform encephalopathy',
    'Crimean Congo haemorrhagic fever (2006-)',
    'Maedi-visna',
    'Malignant catarrhal fever (wildebeest only)(2006-2008)',
    'New world screwworm (Cochliomyia hominivorax)',
    'Nipah virus encephalitis',
    'Scrapie',
    'Surra (Trypanosoma evansi)',
    'Theileria equi and Babesia caballi (Inf. with) (Equine piroplasmosis)',
    'Tularemia',
]


In [5]:
# Remove every report filed against a disease in diseases_no_vaccine and keep
# an audit trail (one frame per disease) of what was dropped.
records_edited = 0
diseases_found = []
removed_records = []

for disease in diseases_no_vaccine:
    if disease not in df_start['Disease'].unique():
        continue

    is_disease = df_start['Disease'] == disease
    dropped = df_start[is_disease]

    diseases_found.append(disease)
    removed_records.append(dropped)
    records_edited += len(dropped)

    df_start = df_start[~is_disease]

print(f"Total records edited (removed): {records_edited}")

if not diseases_found:
    print("\nNone of the specified diseases were found in the dataframe.")
else:
    print("\nDiseases with removed vaccination records:")
    for d in diseases_found:
        print(f" - {d}")

if not removed_records:
    print("\nNo records were removed.")
else:
    print("\nDetails of removed records:")
    # diseases_found and removed_records are appended to together, so they pair up.
    for found, records in zip(diseases_found, removed_records):
        print(f"\nRemoved records for disease: {found}")
        print(records)

df_start.reset_index(drop=True, inplace=True)
print("\nFinal dataframe shape:", df_start.shape)


Total records edited (removed): 0

None of the specified diseases were found in the dataframe.

No records were removed.

Final dataframe shape: (4458, 9)


In [6]:
# Coverage estimates labelled further down as coming from "WAHIS administrative
# division reports". The path is relative, so the file is read from the current
# working directory. The yearly loop below uses these rows only for
# country/disease pairs that have no country-level estimate.
df_part_full=pd.read_csv('2005-2024_part_swine_vaccine_coverage_by_country.csv')

In [7]:
# Map every semester label to the last day of that semester
# (Jan-Jun -> 30 June, Jul-Dec -> 31 December). Jul-Dec 2024 has no entry.
dict_dates = {}
for report_year in range(2005, 2025):
    dict_dates[f'Jan-Jun {report_year}'] = f'{report_year}-06-30'
    if report_year != 2024:
        dict_dates[f'Jul-Dec {report_year}'] = f'{report_year}-12-31'

df_start = df_start.copy()

# A semester label that is not in dict_dates raises KeyError here.
semester_end = [dict_dates[label] for label in df_start['Semester']]
df_start['time'] = semester_end
df_start['Semester'] = ['June' if '06-30' in stamp else 'December' for stamp in semester_end]
df_start['time'] = pd.to_datetime(df_start['time'])

In [8]:
# '-' marks a missing count: it becomes a missing value, anything else a float.
df_start['Number of vaccinated'] = [
    None if count == '-' else float(count) for count in df_start['Number of vaccinated']
]

# Keep rows with a non-negative count; missing values fail the comparison and are dropped.
has_count = df_start['Number of vaccinated'] >= 0
df_start = df_start[has_count]
df_start['Year Range'] = df_start['Year']

In [9]:
start_countries = df_start['Country']

# Base mapping: pycountry country name -> ISO 3166-1 alpha-3 code.
countries = {country.name: country.alpha_3 for country in pycountry.countries}

# Spellings used by the data sources that pycountry does not list under that
# exact name (plus a few historical entities). Applied once, after the base
# mapping, so an alias always wins over a pycountry entry with the same key.
# (These assignments used to sit inside the loop over pycountry.countries and
# were therefore re-executed for every country.)
countries.update({
    'USA': 'USA',
    'UK': 'GBR',
    'Taiwan': 'TWN',
    'South Korea': 'KOR',
    'Czech Republic': 'CZE',
    'Brunei': 'BRN',
    'Russia': 'RUS',
    'Iran': 'IRN',
    'United States of America': 'USA',
    'Venezuela': 'VEN',
    'China (Hong Kong SAR)': 'HKG',
    "Cote d'Ivoire": 'CIV',
    'DR Congo': 'COD',
    'Guinea Bissau': 'GNB',
    'Lao PDR': 'LAO',
    'Micronesia (Federated States of)': 'FSM',
    'North Korea': 'PRK',
    'Occupied Palestinian Territory': 'PSE',
    'Swaziland': 'SWZ',
    'Tanzania': 'TZA',
    'Bolivia': 'BOL',
    'Macedonia (TFYR)': 'MKD',
    'Moldova': 'MDA',
    'Bolivia (Plurinational State of)': 'BOL',
    'China, Hong Kong SAR': 'HKG',
    'China, Taiwan Province of': 'TWN',
    'China, mainland': 'CHN',
    'Czechoslovakia': 'CSK',
    "Democratic People's Republic of Korea": 'PRK',
    'Democratic Republic of the Congo': 'COD',
    'French Guyana': 'GUF',
    'Micronesia': 'FSM',
    'Palestine': 'PSE',
    'Polynesia': 'PYF',
    'Republic of Korea': 'KOR',
    'Serbia and Montenegro': 'SCG',
    'Sudan (former)': 'SDN',
    'Türkiye': 'TUR',
    'USSR': 'SUN',
    'Iran (Islamic Republic of)': 'IRN',
    'Republic of Moldova': 'MDA',
    'United Kingdom of Great Britain and Northern Ireland': 'GBR',
    'United Republic of Tanzania': 'TZA',
    'Venezuela (Bolivarian Republic of)': 'VEN',
    'Yugoslav SFR': 'YUG',
    'Ethiopia PDR': 'ETH',
    'Central African (Rep.)': 'CAF',
    "China (People's Rep. of)": 'CHN',
    'Chinese Taipei': 'TWN',
    'Congo (Dem. Rep. of the)': 'COD',
    'Congo (Rep. of the)': 'COG',
    "Cote D'Ivoire": 'CIV',
    'Dominican (Rep.)': 'DOM',
    "Korea (Dem People's Rep. of)": 'PRK',
    'Korea (Rep. of)': 'KOR',
    'Laos': 'LAO',
    'South Sudan (Rep. of)': 'SSD',
    'Syria': 'SYR',
    'St. Vincent and the Grenadines': 'VCT',
    'Vietnam': 'VNM',
    'Reunion': 'REU',
    'Guadaloupe': 'GLP',
    'China, Macao SAR': 'MAC',
    'Netherlands (Kingdom of the)': 'NLD',
    'Türkiye (Rep. of)': 'TUR',
    'Belgium-Luxembourg': 'BLX',
    'Faeroe Islands': 'FRO',
    'Cabo verde': 'CPV',
    'St. Helena': 'SHN',
})

# Report every row whose country has no known code. The f-string keeps this
# working when a country value is not a string (e.g. a missing value).
codes_start = [countries.get(country, f'Unknown code:{country}') for country in start_countries]

for code in codes_start:
    if "Unknown" in code:
        print("FIX THIS:", code)

# Unknown countries get ISO3 = None. dict.get replaces the former bare
# `except:`, which also hid every other kind of error.
iso3s_start = [countries.get(country) for country in start_countries]

df_start['ISO3'] = iso3s_start

In [10]:
 # Aggregate rows by 'Country', 'Disease', and 'Year' to sum 'Number of vaccinated'
# Collapse the reports to one row per Year/Country/Disease: the vaccinated
# counts of the rows in a group are summed, every other listed column keeps
# its first value. Only the columns named in the mapping are aggregated.
df_start = df_start.groupby(['Year', 'Country', 'Disease'], as_index=False).agg({
    'ISO3': 'first',
    'Country': 'first',
    'Year': 'first',
    'Region': 'first',
    'Disease': 'first',
    'Animal category': 'first',
    'Species': 'first',
    'Vaccine type': 'first',
    'Number of vaccinated': 'sum',  # Aggregate by summing
    'time': 'first',  # Keep the first occurrence (adjust if needed)
})


# Swine population table. For each ISO3/Year/Item only the row with the
# largest Value is kept (ascending sort, keep='last').
pop_swine_df = pd.read_csv(os.path.join(source_data_path, 'swine','swine_pop_2024.csv')).loc[:,['Area','Unit','Value','Year','Item','ISO3']]
pop_swine_df = pop_swine_df.sort_values('Value').drop_duplicates(subset=['ISO3','Year','Item'], keep='last')
pop_swine_df.rename(columns={'Value':'TOTAL Population'},inplace=True)

# Sum the remaining rows (one per Item) into a single total per Area/Year/ISO3.
# NOTE(review): 'Unit' is carried along with 'first' and never checked before
# summing — confirm every row uses the same unit.
pop_swine_df = (
    pop_swine_df.groupby(['Area', 'Year','ISO3'], as_index=False)
    .agg({
        'ISO3':'first',
        'Area': 'first',
        'Year': 'first',
        'Unit': 'first',
        'Item': 'first',
        'TOTAL Population': 'sum',  # Aggregate the 'Value' by summing
    })
)
pop_swine_df.drop(columns=['Item'],inplace=True)


# Slaughtered-swine table, prepared the same way as the population table
# (here 'Item' is kept and dropped later, right before the merge).
killed_pop_swine_df = pd.read_csv(os.path.join(source_data_path, 'swine','killed_swine_pop_2024.csv')).loc[:,['Area','Unit','Value','Year','Item','ISO3']]
killed_pop_swine_df = killed_pop_swine_df.sort_values('Value').drop_duplicates(subset=['ISO3','Year','Item'], keep='last')
killed_pop_swine_df.rename(columns={'Value':'TOTAL Slaughtered Population'},inplace=True)
killed_pop_swine_df = (
    killed_pop_swine_df.groupby(['Area', 'Year','ISO3'], as_index=False)
    .agg({
        'ISO3':'first',
        'Area': 'first',
        'Year': 'first',
        'Unit': 'first',
        'Item': 'first',
        'TOTAL Slaughtered Population': 'sum',  # Aggregate the 'Value' by summing
    })
)


# If several Areas share an ISO3 in the same year, keep the one with the
# largest total so the merges below cannot duplicate rows of df_start.
pop_swine_df = pop_swine_df.sort_values('TOTAL Population').drop_duplicates(subset=['ISO3','Year'], keep='last')
killed_pop_swine_df = killed_pop_swine_df.sort_values('TOTAL Slaughtered Population').drop_duplicates(subset=['ISO3','Year'], keep='last')


# Left-merge both totals onto the reports by ISO3 and Year; reports without a
# match keep missing totals.
df_start=reduce(lambda  left,right: pd.merge(left,right,on=['ISO3','Year'],
                                                how='left'), [df_start,
                                                             pop_swine_df.drop(columns=['Unit','Area']),
                                                              killed_pop_swine_df.drop(columns=['Unit','Area','Item'])])

# Coverage = vaccinated / (population + slaughtered). It is missing whenever
# either total is missing.
# NOTE(review): the two totals are added without comparing their 'Unit' — confirm they are on the same scale.
df_start['Vaccine Coverage Intermediate']=df_start['Number of vaccinated']/(df_start['TOTAL Population']+df_start['TOTAL Slaughtered Population'])


In [None]:
# Add the years missing between the first and last reported year of each (ISO3, Disease) pair
def add_missing_years(df):
    """Insert one row for every year without a report inside each pair's year span.

    For every (ISO3, Disease) pair the function builds the full range of years
    from the pair's minimum to its maximum 'Year' and left-merges the original
    rows onto it. In the inserted rows:

    * 'Vaccine Coverage Intermediate' and 'Number of vaccinated' stay missing;
    * every other column (except 'Year' and the two keys) is forward-filled
      from the previous year of the SAME pair. The fill is grouped, so a value
      can never leak in from a different country or disease.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'ISO3', 'Disease', 'Year', 'Vaccine Coverage Intermediate'
        and 'Number of vaccinated'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year', 'ISO3', 'Disease', then the remaining columns of `df`,
        plus 'Derived_Vaccinated_Method' set to the string "None" on every row.
        Rows are ordered by first appearance of the pair, then by year. Rows of
        `df` with a missing ISO3 or Disease cannot be grouped and are not
        carried over. Empty input gives an empty frame with the extra column.
    """
    group_columns = ["ISO3", "Disease"]
    none_columns = ["Vaccine Coverage Intermediate", "Number of vaccinated"]

    all_years = []
    if not df.empty:
        # First/last reported year per pair, in order of first appearance.
        year_bounds = (
            df.groupby(group_columns, sort=False)['Year']
            .agg(['min', 'max'])
            .reset_index()
        )
        for *keys, first_year, last_year in year_bounds.itertuples(index=False, name=None):
            years = pd.DataFrame({'Year': range(int(first_year), int(last_year) + 1)})
            for col, key in zip(group_columns, keys):
                years[col] = key
            all_years.append(years)

    if not all_years:
        # Nothing to expand (pd.concat would raise on an empty list).
        expanded_df = df.iloc[0:0].copy()
        expanded_df['Derived_Vaccinated_Method'] = "None"
        return expanded_df

    all_years_df = pd.concat(all_years, ignore_index=True)

    # The left merge keeps the order of all_years_df, so the rows of each pair
    # stay contiguous and ascending by year.
    expanded_df = pd.merge(all_years_df, df, on=group_columns + ['Year'], how='left')

    # Measured columns are left missing in inserted rows.
    for col in none_columns:
        expanded_df[col] = expanded_df[col].where(expanded_df[col].notna(), None)

    # Descriptive columns are carried forward within each pair only.
    ffill_columns = list(
        expanded_df.columns.difference(none_columns + ['Year'] + group_columns)
    )
    if ffill_columns:
        expanded_df[ffill_columns] = (
            expanded_df.groupby(group_columns, sort=False)[ffill_columns].ffill()
        )

    expanded_df['Derived_Vaccinated_Method'] = "None"

    return expanded_df




# Fill gaps in 'Vaccine Coverage Intermediate' by linear interpolation and rebuild 'Number of vaccinated'
def interpolate_adjusted_and_cases(df):
    """Interpolate missing coverage within each (ISO3, Disease) pair.

    Each pair is sorted by 'Year' and its 'Vaccine Coverage Intermediate' is
    interpolated linearly. Rows that were missing and received a value are
    flagged with 'Derived_Vaccinated_Method' = "Adjusted_Vaccinated".
    'Number of vaccinated' is then recomputed on every row as coverage times
    ('TOTAL Population' + 'TOTAL Slaughtered Population').

    Returns the per-pair frames concatenated by ``groupby(...).apply``; each
    pair's rows carry a fresh 0..k index.
    """
    coverage_col = 'Vaccine Coverage Intermediate'

    def interpolate_group(group):
        ordered = group.sort_values('Year').reset_index(drop=True)

        reported = ordered[coverage_col]
        filled = reported.fillna(value=np.nan).interpolate(method='linear')

        # Flag only rows that had no value before and have one now.
        newly_filled = filled.notna() & reported.isna()
        ordered.loc[newly_filled, 'Derived_Vaccinated_Method'] = "Adjusted_Vaccinated"

        ordered[coverage_col] = filled

        denominator = ordered['TOTAL Population'] + ordered['TOTAL Slaughtered Population']
        ordered['Number of vaccinated'] = ordered[coverage_col] * denominator

        return ordered

    pairs = df.groupby(["ISO3", "Disease"], group_keys=False)
    return pairs.apply(interpolate_group)

# Record, for every interpolated row, the reported years that bracket it
def update_interpolated_upper_year(df):
    """Attach the bracketing reported years to each interpolated row.

    Within each (ISO3, Disease) pair, sorted by 'Year', every row whose
    'Derived_Vaccinated_Method' is "Adjusted_Vaccinated" receives:

    * 'interpolated_upper_year': 'Year' of the next row whose method is "None";
    * 'interpolated_lower_year': 'Year' of the previous row whose method is "None".

    A bound is missing (NaN) when no such row exists, and on every row that is
    not interpolated.

    Both columns are always present in the result, even when nothing was
    interpolated or `df` is empty. Previously they were only created once an
    interpolated row was found, which made the later lookup of these columns
    fail with KeyError.

    Returns the per-pair frames concatenated in sorted key order; each pair's
    rows carry a fresh 0..k index. Rows with a missing ISO3 or Disease are
    dropped by the grouping.
    """
    group_columns = ["ISO3", "Disease"]
    bound_columns = ['interpolated_upper_year', 'interpolated_lower_year']

    def assign_upper_year(group):
        group = group.sort_values('Year').reset_index(drop=True)

        method = group['Derived_Vaccinated_Method']
        is_interpolated = method == "Adjusted_Vaccinated"

        # Year on reported rows, NaN elsewhere; filling backward / forward then
        # gives each row its next / previous reported year.
        reported_years = group['Year'].where(method == 'None').astype(float)

        group['interpolated_upper_year'] = reported_years.bfill().where(is_interpolated)
        group['interpolated_lower_year'] = reported_years.ffill().where(is_interpolated)
        return group

    # Iterating the groups (instead of groupby.apply) always hands the helper
    # the grouping columns too, whatever the pandas version does in apply.
    pieces = [assign_upper_year(group) for _, group in df.groupby(group_columns)]

    if not pieces:
        result = df.iloc[0:0].copy()
        for col in bound_columns:
            result[col] = np.nan
        return result

    return pd.concat(pieces)


def process_dataframe(df):
    """Run the gap-filling pipeline on the yearly country/disease table.

    Steps, in order:
    1. insert the missing years of every (ISO3, Disease) pair;
    2. replace the two population totals with fresh per-year values, so the
       inserted years get their own totals;
    3. interpolate the coverage and rebuild the vaccinated counts;
    4. record the reported years that bracket each interpolated row.
    """
    expanded = add_missing_years(df)
    expanded = expanded.drop(columns=['TOTAL Population', 'TOTAL Slaughtered Population'])

    population = pop_swine_df.drop(columns=['Unit', 'Area'])
    slaughtered = killed_pop_swine_df.drop(columns=['Unit', 'Area', 'Item'])
    for yearly_totals in (population, slaughtered):
        expanded = pd.merge(expanded, yearly_totals, on=['ISO3', 'Year'], how='left')

    expanded = interpolate_adjusted_and_cases(expanded)
    return update_interpolated_upper_year(expanded)


df_result = process_dataframe(df_start)


In [12]:
def generate_year_range(row):
    """Return the label describing which years a row's value is based on.

    Parameters
    ----------
    row : mapping-like (pandas.Series or dict)
        Must provide 'Year'; may provide 'interpolated_lower_year' and
        'interpolated_upper_year'.

    Returns
    -------
    str
        "<lower>-<upper>" when both bounds are present and not missing,
        otherwise the row's own year as a string.
    """
    # .get() tolerates rows without the bound columns, which happens when no
    # row was interpolated upstream.
    lower = row.get('interpolated_lower_year')
    upper = row.get('interpolated_upper_year')

    if pd.notna(lower) and pd.notna(upper):
        return f"{int(lower)}-{int(upper)}"
    return str(int(row['Year']))

# Label every row with the years its value is based on: "lower-upper" when
# both interpolation bounds are present, otherwise the row's own year.
df_result['Year Range'] = df_result.apply(generate_year_range, axis=1)


In [13]:
# Build one coverage snapshot per year (2005-2024). For a given year, each
# Country/Disease pair is represented by its most recent row with Year <= year.
# The snapshots are collected in final_dfs.
final_dfs=[]

for year in range(2005,2025):

    df2=df_result.copy()
    df2=df2[df2['Year']<=year]

    df_sorted = df2.sort_values(['Country', 'Disease', 'Year'], ascending=[True, True, False])

    # Identify the latest available year for each 'Country' and 'Disease'
    latest_years = df_sorted.groupby(['Country', 'Disease'], as_index=False)['Year'].max()

    # Filter rows corresponding to the latest year for each 'Country' and 'Disease'
    # NOTE(review): the latest row is picked before rows without a usable
    # coverage are filtered out below, so a pair whose latest row has no
    # population data disappears from this snapshot instead of falling back to
    # an earlier year — confirm this is intended.
    df_latest = df_sorted.merge(latest_years, on=['Country', 'Disease', 'Year'])

    cols_keep=['ISO3','Country','Year','Region','Disease','Animal category','Species','Vaccine type','Number of vaccinated','time',
               'Year Range','TOTAL Population','TOTAL Slaughtered Population','Derived_Vaccinated_Method']

    df_coverage=df_latest.loc[:,cols_keep]
    df_coverage.reset_index(drop=True, inplace=True)

    # 'Year' now holds the snapshot year; the years the value is based on
    # remain available in 'Year Range'.
    df_coverage['Year']=[year]*len(df_coverage)

    # Coverage = vaccinated / (population + slaughtered). Values above 1 are
    # capped at 1; missing values stay missing.
    df_coverage['Vaccine Coverage']=df_coverage['Number of vaccinated']/(df_coverage['TOTAL Population']+df_coverage['TOTAL Slaughtered Population'])
    df_coverage['Vaccine Coverage']=[i if i<=1 else 1 if i==i else np.nan for i in df_coverage['Vaccine Coverage']]
    # Any method other than the string 'None' marks an interpolated value.
    df_coverage['Source'] = df_coverage['Derived_Vaccinated_Method'].apply(
        lambda x: 'WAHIS country-level report; FAOSTAT (linear interpolation between years)' if x != 'None' else 'WAHIS country-level report; FAOSTAT'
    )
    df_coverage.drop(columns=['Derived_Vaccinated_Method'],inplace=True)


    # Drops rows whose coverage is missing or negative (a missing value fails the comparison).
    df_coverage=df_coverage[df_coverage['Vaccine Coverage']>=0] #making sure properly formatted after df_start

    VC_lower=[]
    VC_upper=[]

    # Confidence interval of the coverage, from scipy's binomtest(k, n).proportion_ci()
    # with its default settings, where n = population + slaughtered (rounded)
    # and k = coverage * n (rounded).
    for row in df_coverage.iterrows():
            l,u=binomtest(int(round(row[1]['Vaccine Coverage']*(row[1]['TOTAL Population']+row[1]['TOTAL Slaughtered Population']))),int(round((row[1]['TOTAL Population']+row[1]['TOTAL Slaughtered Population'])))).proportion_ci()
            VC_lower+=[l]
            VC_upper+=[u]

    df_coverage['Vaccine Coverage Lower']=VC_lower
    df_coverage['Vaccine Coverage Upper']=VC_upper

    # Administrative-division estimates: rows up to the snapshot year, keeping
    # the last row per Disease/Country in file order.
    # NOTE(review): "last" is the most recent year only if the file is ordered by year — confirm.
    df_part=df_part_full.copy()

    df_part=df_part[df_part['Year']<=year]

    df_part=df_part.drop_duplicates(subset=['Disease', 'Country'], keep='last')


    # Same country -> ISO3 lookup as for df_start: unknown names are printed
    # and receive ISO3 = None.
    codes_part = [countries.get(country, 'Unknown code:'+country) for country in df_part['Country']]

    for code in codes_part:
        if "Unknown" in code:
            print("FIX THIS:",code)

    iso3s_part=[]

    for i in df_part['Country']:
        try:
            iso3s_part+=[countries[i]]
        except:
            iso3s_part+=[None]

    df_part['ISO3']=iso3s_part

    #print(np.unique(codes_part))

    index_keep=[]

    # Keep an administrative-division row only when the country-level table
    # has no row for the same ISO3 and Disease in this snapshot.
    for row in df_part.iterrows():
        if (row[1]['Disease'] not in df_coverage[df_coverage['ISO3']==row[1]['ISO3']]['Disease'].values):
            index_keep+=[row[0]]
    df_part=df_part.loc[index_keep] 
    # Same capping and filtering as for the country-level coverage above.
    df_part['Vaccine Coverage']=[i if i<=1 else 1 if i==i else np.nan for i in df_part['Vaccine Coverage']]
    df_part=df_part[df_part['Vaccine Coverage']>=0]

    VC_lower=[]
    VC_upper=[]

    # Same interval computation, with n taken from 'Adjusted_Susceptible'.
    for row in df_part.iterrows():
            l,u=binomtest(int(round(row[1]['Vaccine Coverage']*(row[1]['Adjusted_Susceptible']))),int(round(row[1]['Adjusted_Susceptible']))).proportion_ci()
            VC_lower+=[l]
            VC_upper+=[u]

    df_part['Vaccine Coverage Lower']=VC_lower
    df_part['Vaccine Coverage Upper']=VC_upper

    add_to_coverage_df=[]

    # Build the extra rows positionally, in the column order of df_coverage:
    # ISO3, Country, Year, Region, Disease, Animal category, Species,
    # Vaccine type, Number of vaccinated, time, Year Range, TOTAL Population,
    # TOTAL Slaughtered Population, Vaccine Coverage, Source, Lower, Upper.
    # The two branches differ only in the Source text.
    for row in df_part.iterrows():

        if row[1]['Derived_Vaccinated_Method']!='None':
            add_to_coverage_df+=[[row[1]['ISO3'],row[1]['Country'],year,None,
                             row[1]['Disease'], None, None,None,
                     None, None,  row[1]['Year Range'], None, None,
                             row[1]['Vaccine Coverage'],
                     'WAHIS administrative division reports (includes linear interpolation)',row[1]['Vaccine Coverage Lower'],
                     row[1]['Vaccine Coverage Upper']]]    

        else:
            add_to_coverage_df+=[[row[1]['ISO3'],row[1]['Country'],year,None,
                             row[1]['Disease'], None, None,None,
                     None, None,  row[1]['Year Range'], None, None,
                             row[1]['Vaccine Coverage'],
                     'WAHIS administrative division reports',row[1]['Vaccine Coverage Lower'],
                     row[1]['Vaccine Coverage Upper']]]    

    add_to_coverage_df = pd.DataFrame(data=add_to_coverage_df,columns=df_coverage.columns)

    df_coverage = pd.concat([df_coverage, add_to_coverage_df], ignore_index=True)




    final_dfs+=[df_coverage]

    print(year, 'finished analysis')

2005 finished analysis
2006 finished analysis
2007 finished analysis
2008 finished analysis
2009 finished analysis
2010 finished analysis
2011 finished analysis
2012 finished analysis
2013 finished analysis
2014 finished analysis
2015 finished analysis
2016 finished analysis
2017 finished analysis
2018 finished analysis
2019 finished analysis
2020 finished analysis
2021 finished analysis
2022 finished analysis
2023 finished analysis
2024 finished analysis


In [14]:
# Stack the yearly snapshots, tidy the disease label, and write the result.
years_data = pd.concat(final_dfs).sort_values(by=['ISO3', 'Year', 'Disease', 'Vaccine Coverage'])

disease_renames = {'Newcastle disease virus (Inf. with)': 'Newcastle disease (velogenic)'}
years_data['Disease'] = [disease_renames.get(name, name) for name in years_data['Disease']]

# Leave out the ISO3 code 'SCG' for every snapshot year after 2006.
scg_after_2006 = (years_data['ISO3'] == 'SCG') & (years_data['Year'] > 2006)
years_data = years_data[~scg_after_2006]

years_data.to_csv('2005-2024_full_swine_vaccine_coverage_by_country.csv', index=False)
