In [None]:
import pickle
import pandas as pd
from pandas import DataFrame
import numpy as np
import re
import urllib
import bs4
from scipy import stats
import matplotlib.pyplot as plt

In [None]:
# Converting wiki data into dataframe
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('/Users/Macbook/Downloads/wiki_country_data.pkl', 'rb') as wiki_file:
    wikidata = pickle.load(wiki_file)

# One row per dictionary key, with the dictionary value as the single column
wikiframe = DataFrame.from_dict(wikidata, 'index')
wikiframe.columns = ['Content']

# Identifying redirect pages by length of content (vectorised, no row loop)
wikiframe['len_Content'] = wikiframe['Content'].str.len()

# Identifying unique strings in the redirect pages
redirects1 = wikiframe['Content'].str.contains('#REDIRECT', na=False)
redirects2 = wikiframe['Content'].str.contains('#redirect', na=False)

# Dropping the redirect page countries from dataframe using the masks above
wikiframe = wikiframe[~(redirects1 | redirects2)]

In [None]:
# Converting bank data into dataframe: read the workbook, cast the index
# labels to strings and shorten the 'Country Name' column to 'Country'
bankframe = pd.read_excel(
    '/Users/Macbook/Downloads/world_bank_country_data.xlsx'
).rename(index=str, columns={'Country Name': 'Country'})

In [None]:
# Identifying the country names that are unique to each dataframe
bankcountryset = set(bankframe['Country'])
wikicountryset = set(wikiframe.index)

# Names present on one side only (these need mapping or dropping)
onlyinwiki = wikicountryset.difference(bankcountryset)
onlyinbank = bankcountryset.difference(wikicountryset)

for unmatched_names in (onlyinwiki, onlyinbank):
    print(unmatched_names)

In [None]:
# Homogenising the country names across both datasets

# Maps a World Bank country name (key) to the name used as the wiki index
# (value). Only names that differ between the two sources are listed.
bankmapper = {
    "Russian Federation": "Russia",
    "Syrian Arab Republic": "Syria",
    "Brunei Darussalam": "Brunei",
    "Timor-Leste": "East Timor",
    "Slovak Republic": "Slovakia",
    "Yemen, Rep.": "Yemen",
    "Sao Tome and Principe" : "São Tomé and Príncipe",
    "St. Kitts and Nevis" : "Saint Kitts and Nevis",
    "Gambia, The" : "Gambia",
    "Micronesia, Fed. Sts." : "Micronesia",
    "Korea, Rep." : "Korea South",
    "Macedonia, FYR" : "Macedonia",
    # "Congo, Rep." is the Republic of the Congo (was swapped with the DRC)
    "Congo, Rep." : "Congo, Republic of the",
    "St. Lucia" : "Saint Lucia",
    "Cabo Verde" : "Cape Verde",
    "Iran, Islamic Rep." : "Iran",
    "St. Vincent and the Grenadines" : "Saint Vincent and the Grenadines",
    "Lao PDR" : "Laos",
    "Bahamas, The" : "Bahamas",
    "Egypt, Arab Rep." : "Egypt",
    # "Congo, Dem. Rep." is the Democratic Republic of the Congo
    "Congo, Dem. Rep.": "Congo, Democratic Republic of the",
    "Venezuela, RB" : "Venezuela",
    "Cote d'Ivoire" : "Ivory Coast"
}


# Rename every country that has an entry in bankmapper; names without an
# entry (including missing values) are left untouched. This replaces the
# Series.iteritems() loop, which no longer exists in pandas >= 2.0.
bankframe['Country'] = bankframe['Country'].map(
    lambda name: bankmapper.get(name, name)
)
        
# Dropping countries that are not in bankframe at all from wikiframe:

wiki_droplist = [
    'South Ossetia', 'Palestine', 'Cook Islands',
    'Transnistria', 'Somaliland', 'Kyrgyzstan',
    'Abkhazia', 'Taiwan', 'Vatican City',
    'Northern Cyprus', 'Niue', 'Nagorno-Karabakh',
]

# A single alternation pattern removes every index label that contains any
# of the listed names, instead of filtering once per name.
drop_pattern = '|'.join(wiki_droplist)
wikiframe = wikiframe[~wikiframe.index.str.contains(drop_pattern)]

In [None]:
# Creating a master dataframe with relevant statistics

# Inner join: keeps only countries present in both sources. The wiki page
# text is matched on the wiki frame's index (the country name).
df = bankframe.merge(wikiframe, left_on="Country", right_index=True, how='inner')

# Replacing ".." cells with NaNs
df = df.replace(r'^(\.\.)', np.nan, regex=True)

# Replacing year names (one rename instead of five)
year_renames = {
    '2013 [YR2013]': '2013',
    '2014 [YR2014]': '2014',
    '2015 [YR2015]': '2015',
    '2016 [YR2016]': '2016',
    '2017 [YR2017]': '2017',
}
df = df.rename(index=str, columns=year_renames)

# The year columns held a mix of numbers and ".." strings, so they are still
# object dtype after the replacement above. Convert them to real numeric
# columns so later averages do not depend on implicit coercion.
year_columns = [year for year in year_renames.values() if year in df.columns]
df[year_columns] = df[year_columns].apply(pd.to_numeric, errors='coerce')

# Isolating statistics of interest
series_list = ['SG.VAW.REAS.ZS','SG.DMK.SRCR.FN.ZS','SH.STA.FGMS.ZS','SP.DYN.CONU.ZS']
df = df[df['Series Code'].isin(series_list)]

In [None]:
# Creating an African subframe
afroframe = pd.read_excel('/Users/Macbook/Downloads/list-african-countries.xlsx')
survey_years = ['2013', '2014', '2015', '2016', '2017']

# Keep only rows whose country appears in the African list; the page-length
# helper column is not needed from here on.
is_african = df['Country'].isin(afroframe['Country'])
afro_df = df[is_african].drop('len_Content', axis=1)

# Row-wise mean over the five yearly values
afro_df['5year_avg'] = afro_df[survey_years].mean(axis=1)

# Dropping countries with no data and year columns
afro_df2 = afro_df.dropna(subset=survey_years, how='all')
afro_df2 = afro_df2.drop(survey_years, axis=1)

In [None]:
#Creating a pivot table from African subframe
# `values` is given explicitly (as a list, so the result keeps its two-level
# columns). Without it, pivot_table tries to aggregate every remaining column
# and depends on pandas silently discarding the text columns.
afro_pivot = pd.pivot_table(
    afro_df2,
    values=['5year_avg'],
    index=["Country"],
    columns=["Series Name"],
)

#Cleaning up pivot table
column_mapper = {'Contraceptive prevalence, any methods (% of women ages 15-49)':'contra',
                'Female genital mutilation prevalence (%)':'FGM',
                'Women making their own informed decisions regarding sexual relations, contraceptive use and reproductive health care  (% of women age 15-49)':'decisions',
                'Women who believe a husband is justified in beating his wife (any of five reasons) (%)':'beating'}

afro_pivot = afro_pivot.rename(index=str, columns=column_mapper)

# After reset_index the country column sits under an empty second-level
# label; dropping the first level leaves '' as its name, which is then
# renamed back to 'Country'.
afro_pivot = afro_pivot.reset_index()
afro_pivot.columns = afro_pivot.columns.droplevel()
afro_pivot = afro_pivot.rename(index=str, columns={'':'Country'})

#Adding wikidata from previous dataframe
# One row per country is enough to carry the page text across
afro_df3 = afro_df2.drop_duplicates('Country')

afro_pivot = afro_pivot.merge(afro_df3, left_on="Country", right_on="Country")

# Remove the per-series columns that came along with the merge
afro_pivot = afro_pivot.drop(
    ['Country Code', 'Series Name', 'Series Code', '5year_avg'], axis=1
)

In [None]:
#Creating additional column with lists of all inner links present in wikipedia pages

# Non-greedy match of everything between a pair of double square brackets
inner_links = re.compile(r'\[\[.*?\]\]')

# Apply the pattern to each page text; every cell becomes a list of matches
afro_pivot['link_list'] = afro_pivot['Content'].map(inner_links.findall)
display(afro_pivot)

In [None]:
# Counting links between countries depending on level for each statistic

def link_counter(x, frame=None):
    """Count, for every row, the links that mention a country from ``x``.

    Parameters
    ----------
    x : list of str
        Country names to look for inside each row's links.
    frame : DataFrame, optional
        Frame with a 'Country' column and a 'link_list' column (a list of
        link strings per row). Defaults to the notebook-level ``afro_pivot``.

    Returns
    -------
    list of int
        One count per row of ``frame``, in row order.

    Notes
    -----
    Rows are iterated directly, so the result does not depend on the frame
    having a 0..n-1 integer index.

    NOTE(review): matching is by substring, exactly as before. A name in
    ``x`` is skipped when it is contained in the row's own country name, and
    counted when it is contained anywhere in a link string. Short names that
    are part of longer ones (e.g. "Niger" / "Nigeria") therefore interact;
    confirm this is intended.
    """
    if frame is None:
        frame = afro_pivot

    count_list = []
    for own_name, links in zip(frame['Country'], frame['link_list']):
        # Names contained in the row's own country name are never counted
        candidates = [country for country in x if country not in own_name]
        # A link mentioning several candidates is counted once per candidate
        count_list.append(
            sum(1 for link in links for country in candidates if country in link)
        )

    return count_list

# Statistics are percentages; a country is "high" above this value and "low"
# at or below it. Countries with a missing value fall in neither group.
LEVEL_THRESHOLD = 50


def _split_and_count(stat, threshold=LEVEL_THRESHOLD):
    """Split afro_pivot on one statistic and add its link-count columns.

    Adds 'high_<stat>_links' and 'low_<stat>_links' to ``afro_pivot`` and
    returns ``(high_frame, low_frame, high_names, low_names)``. The two
    frames are sliced after the columns are added, so they carry the link
    counts for this statistic.
    """
    is_high = afro_pivot[stat] > threshold
    is_low = afro_pivot[stat] <= threshold

    high_names = list(afro_pivot.loc[is_high, 'Country'])
    low_names = list(afro_pivot.loc[is_low, 'Country'])

    afro_pivot['high_' + stat + '_links'] = link_counter(high_names)
    afro_pivot['low_' + stat + '_links'] = link_counter(low_names)

    return afro_pivot[is_high], afro_pivot[is_low], high_names, low_names


#Contraception
high_contra, low_contra, high_contra_list, low_contra_list = _split_and_count('contra')

#FGM
high_FGM, low_FGM, high_FGM_list, low_FGM_list = _split_and_count('FGM')

#decisions
# The country lists now come from the decisions split (they were previously
# built from the FGM split by mistake).
high_decisions, low_decisions, high_decisions_list, low_decisions_list = _split_and_count('decisions')

#beating
high_beating, low_beating, high_beating_list, low_beating_list = _split_and_count('beating')

In [None]:
# Identifying extreme countries for each statistic

# For each statistic, print the country (or countries) holding the maximum
# value, then those holding the minimum value.
for stat_name in ('contra', 'FGM', 'decisions', 'beating'):
    stat_values = afro_pivot[stat_name]
    for extreme_value in (stat_values.max(), stat_values.min()):
        print(afro_pivot.loc[stat_values == extreme_value, 'Country'])

In [None]:
# Plotting extreme country link counts
# The numbers below are entered by hand for the two extreme countries of
# each statistic; each frame is drawn as its own grouped bar chart.

top_contra_stats = pd.DataFrame(
    {'high_links': [2.0, 0.0], 'low_links': [8.0, 19.0]},
    index=['Zimbabwe', 'Chad'],
)
top_contra_stats.plot.bar()

top_FGM_stats = pd.DataFrame(
    {'high_links': [1.0, 7.0], 'low_links': [7.0, 6.0]},
    index=['Togo', 'Mali'],
)
top_FGM_stats.plot.bar()

top_decisions_stats = pd.DataFrame(
    {'high_links': [0.0, 9.0], 'low_links': [0.0, 3.0]},
    index=['Namibia', 'Senegal'],
)
top_decisions_stats.plot.bar()

top_beating_stats = pd.DataFrame(
    {'high_links': [8.0, 2.0], 'low_links': [12.0, 4.0]},
    index=['Mali', 'Malawi'],
)
top_beating_stats.plot.bar()

In [None]:
# Creating a stats dataframe and barplots from overall link means

# NOTE(review): the name `stats` shadows `from scipy import stats` from the
# import cell. It is kept so that any code relying on this frame still works;
# consider renaming it.
stats = pd.DataFrame(
    index=['contra', 'FGM', 'decisions', 'beating'],
    columns=['high-high', 'high-low', 'low-low', 'low-high'],
    dtype=float,
)

# The groups are selected from afro_pivot here rather than from the earlier
# high_*/low_* frames: those were sliced before the link-count columns were
# added, so on a fresh run they do not contain them.

# Statistics reported for the high group: mean links to high / low countries
for stat_name in ('contra', 'decisions'):
    in_high_group = afro_pivot[stat_name] > 50
    stats.loc[stat_name, 'high-high'] = afro_pivot.loc[in_high_group, 'high_' + stat_name + '_links'].mean()
    stats.loc[stat_name, 'high-low'] = afro_pivot.loc[in_high_group, 'low_' + stat_name + '_links'].mean()

# Statistics reported for the low group: mean links to low / high countries
for stat_name in ('FGM', 'beating'):
    in_low_group = afro_pivot[stat_name] <= 50
    stats.loc[stat_name, 'low-low'] = afro_pivot.loc[in_low_group, 'low_' + stat_name + '_links'].mean()
    stats.loc[stat_name, 'low-high'] = afro_pivot.loc[in_low_group, 'high_' + stat_name + '_links'].mean()

stats.plot.bar()