In [1]:
import pandas as pd
import glob
import re

In [2]:
# Input glob, output path and the regular expressions used while parsing.
raw_country_religion_files = '../data/raw/country_religion_files/*.csv'
parsed_country_religion_file = '../data/parsed/country_religion_files/parsed_country_religion.csv'
# Captures the affiliation token found between "Worldwide" and "Population2010" in a file path.
raw_country_religion_search_regex = 'Worldwide(.+?)Population2010'
# Matches every character that is not a digit, so it can be stripped from population strings.
raw_country_religion_population_regex = '[^0-9]'
# glob.glob returns paths in arbitrary order; sorting makes the dictionary built below
# (and therefore the row order of anything derived from it) reproducible across runs.
raw_country_religion_file_paths = sorted(glob.glob(raw_country_religion_files))

# Maps the raw CSV column headers onto the column names used in this notebook.
# NOTE(review): the leading spaces in ' name' and ' value' are part of the keys -
# presumably the raw headers carry a space after each comma; confirm against the raw files.
country_religion_rename_dict = {
    'iso_code': 'ID',
    ' name': 'Country',
    ' value': 'Population'
}
# Maps the affiliation token captured from the file name onto a short affiliation label.
country_religion_affiliation_fix_dict = {
    'Buddhist': 'Buddhist',
    'Christian': 'Christian',
    'FolkReligion': 'Folk',
    'Hindu': 'Hindu',
    'Jewish': 'Jewish',
    'Muslim': 'Muslim',
    'OtherReligion': 'Other',
    'Unaffiliated': 'Atheist'
}
# Maps the short affiliation label onto the name of the corresponding religion.
affiliation_religion_correspondence_dict = {
    'Buddhist': 'Buddhism',
    'Christian': 'Christianity',
    'Folk': 'Folk',
    'Hindu': 'Hinduism',
    'Jewish': 'Judaism',
    'Muslim': 'Islam',
    'Other': 'Other',
    'Atheist': 'Atheism'
}


def extract_religious_affiliation(file_path):
    """Return the short affiliation label encoded in a raw file path.

    The token between "Worldwide" and "Population2010" is captured with
    ``raw_country_religion_search_regex`` and translated through
    ``country_religion_affiliation_fix_dict``.

    Raises:
        ValueError: if the path does not match the expected naming pattern.
        KeyError: if the captured token is not a known affiliation.
    """
    match = re.search(raw_country_religion_search_regex, file_path)
    if match is None:
        # Without this guard the failure would be an opaque AttributeError on None.
        raise ValueError(
            'Cannot determine religious affiliation from file path: %r' % (file_path,)
        )
    raw_affiliation = match.group(1)
    if raw_affiliation not in country_religion_affiliation_fix_dict:
        raise KeyError(
            'Unknown religious affiliation %r in file path: %r' % (raw_affiliation, file_path)
        )
    return country_religion_affiliation_fix_dict[raw_affiliation]


def parse_population_value(raw_value):
    """Convert one raw population cell to an int.

    Strings have every non-digit character removed before conversion. Values
    that pandas already parsed as numbers are converted directly, because
    ``re.sub`` would raise a TypeError on them.
    """
    if isinstance(raw_value, str):
        return int(re.sub(raw_country_religion_population_regex, '', raw_value))
    return int(raw_value)


# One parsed dataframe (columns: ID, Country, Population) per affiliation label.
country_religion_df_dict = {}
for file_path in raw_country_religion_file_paths:
    religious_affiliation = extract_religious_affiliation(file_path)
    with open(file_path) as country_religion_file:
        # Read the dataframe and discard every row that contains a NaN
        country_religion_df = pd.read_csv(country_religion_file, index_col=False).dropna()
    country_religion_df = country_religion_df.rename(columns=country_religion_rename_dict)

    # Parse the Population column into integers
    country_religion_df['Population'] = (
        country_religion_df['Population'].map(parse_population_value).astype(int)
    )

    # Force unique IDs: rows sharing the same (ID, Country) pair are collapsed to their mean
    country_religion_df = country_religion_df.groupby(['ID', 'Country'])['Population'].mean()
    country_religion_df = country_religion_df.reset_index()

    # Add to dictionary
    country_religion_df_dict[religious_affiliation] = country_religion_df

In [3]:
# Affiliations that are removed from the table once the ratios have been computed
dropable_religious_affiliations = ['Folk', 'Other']

# Combine the per-affiliation frames into one wide frame keyed by (ID, Country).
# Each frame's Population column is renamed to its affiliation label before joining;
# the inner join keeps only the countries that are present in every frame.
merge_keys = ['ID', 'Country']
country_religion_df = None
for religious_affiliation, religion_df in country_religion_df_dict.items():
    labelled_df = religion_df.rename(columns={'Population': religious_affiliation})
    if country_religion_df is None:
        country_religion_df = labelled_df
    else:
        country_religion_df = country_religion_df.merge(labelled_df, on=merge_keys, how='inner')

# Discard the Country names and pivot: one row per affiliation, one column per country ID
country_religion_df = (
    country_religion_df
    .drop(columns=['Country'])
    .set_index('ID')
    .T
)

# Total population of each affiliation, summed over all merged countries
total_population_series = country_religion_df.sum(axis=1)

# Divide each row by its own total, so every value becomes that country's share of the
# affiliation's total; then remove the unwanted affiliations and expose the affiliation
# label as a regular column named 'Affiliation'.
country_religion_df = (
    country_religion_df
    .div(total_population_series, axis=0)
    .drop(index=dropable_religious_affiliations)
    .reset_index()
    .rename(columns={'index': 'Affiliation'})
)

# Prepend the religion name that corresponds to each affiliation label
religion_names = [
    affiliation_religion_correspondence_dict[affiliation]
    for affiliation in country_religion_df['Affiliation']
]
country_religion_df.insert(0, 'Religion', religion_names)

# Sample
country_religion_df

ID,Religion,Affiliation,AD,AE,AF,AG,AI,AL,AM,AO,...,VI,VN,VU,WF,WS,YE,YT,ZA,ZM,ZW
0,Buddhism,Buddhist,2e-05,0.000307,2e-05,2e-05,2e-05,2e-05,2e-05,2e-05,...,2e-05,0.029393,2e-05,2e-05,2e-05,2e-05,2e-05,0.000204,2e-05,2e-05
1,Christianity,Christian,3.9e-05,0.000438,2.1e-05,4.2e-05,1.2e-05,0.00027,0.001407,0.007967,...,5.1e-05,0.003308,0.000106,1.2e-05,8.8e-05,2.3e-05,1.2e-05,0.018778,0.005891,0.005042
2,Hinduism,Hindu,1e-05,0.000474,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,...,1e-05,1e-05,1e-05,1e-05,1e-05,0.000145,1e-05,0.000551,1e-05,1e-05
3,Judaism,Jewish,0.000637,0.000637,0.000637,0.000637,0.000637,0.000637,0.000637,0.000637,...,0.000637,0.000637,0.000637,0.000637,0.000637,0.000637,0.000637,0.004461,0.000637,0.000637
4,Islam,Muslim,6e-06,0.003611,0.019575,6e-06,6e-06,0.001606,6e-06,2.5e-05,...,6e-06,0.0001,6e-06,6e-06,6e-06,0.014889,0.000125,0.000537,4.4e-05,6.9e-05
5,Atheism,Atheist,9e-06,8e-05,9e-06,9e-06,9e-06,4.4e-05,3.5e-05,0.000866,...,9e-06,0.023011,9e-06,9e-06,9e-06,1.8e-05,9e-06,0.006583,6.2e-05,0.000875


In [4]:
# Persist the ratio table without the positional index.
# NOTE: the output is gzip-compressed although the target path ends in '.csv',
# so readers must request gzip decompression explicitly (e.g. compression='gzip').
country_religion_df.to_csv(
    parsed_country_religion_file,
    index=False,
    encoding='utf-8',
    compression='gzip',
)