In [1]:
# Import libraries
from io import StringIO
import re

import requests
from bs4 import BeautifulSoup
import pandas as pd



#Pull the Global Peace Index (GPI) table from Wikipedia into df_peace:
#one row per country (the index), with the most recent rank and score columns.

#set the url to scrape
url = 'https://en.wikipedia.org/wiki/Global_Peace_Index'

#package and send the request, store the page text as r
#  the timeout keeps the cell from hanging forever on a dead connection and
#  raise_for_status() reports an HTTP error here instead of letting an error
#  page fall through to a confusing "no tables found" failure further down
response = requests.get(url, timeout=30)
response.raise_for_status()
r = response.text

#soup it
soup = BeautifulSoup(r, features='lxml')

#find the table
MyTable = soup.find('table',{'class':'wikitable sortable'})
if MyTable is None:
    #soup.find returns None when nothing matches; fail with a clear message
    raise ValueError("No 'wikitable sortable' table found at " + url)

#Convert back to string for pandas
soup_string = str(MyTable)

#read the table html into pandas as a list of DataFrames
#  wrapped in StringIO because newer pandas deprecates passing literal html
dfl = pd.read_html(StringIO(soup_string))

#the GPI table is the only entry in the list
df_peace = dfl[0]


#Make it pretty
#sort by country name
df_peace = df_peace.sort_values(by=['Country'])

#we only want most recent data, drop the rest
#	Create a max cap variable no matter what year it is
colmax = len(df_peace.columns)
#	Drop all of the columns except the first 3: 'country' 'most recent rank' 'most recent score'
df_peace = df_peace.drop(df_peace.columns[3:colmax], axis=1)

#clean the column names, wikipedia puts a citation in the score column
#  keeping the first 10 characters leaves e.g. '2019 score'
df_peace = df_peace.rename(columns={df_peace.columns[2] : df_peace.columns[2][0:10]})

#wikipedia puts an '=' in the rank column when two or more countries are tied, let's remove that
#  astype(str) first so this also works if pandas parsed the column as numbers
#  (no ties on the page); regex=False because '=' is meant literally
rank_col = df_peace.columns[1]
df_peace[rank_col] = df_peace[rank_col].astype(str).str.replace('=', '', regex=False)

#index by country name
df_peace = df_peace.set_index(df_peace.columns[0])

#the rank column is a string, make it an int
df_peace[rank_col] = df_peace[rank_col].astype(int)

print(df_peace.info())


<class 'pandas.core.frame.DataFrame'>
Index: 163 entries, Afghanistan to Zimbabwe
Data columns (total 2 columns):
2019 rank     163 non-null int64
2019 score    163 non-null float64
dtypes: float64(1), int64(1)
memory usage: 3.8+ KB
None


In [7]:
#Pull the Religion table (Pew Research religious composition by country, 2010, percent)
#set the url to scrape
url = 'https://www.pewforum.org/2015/04/02/religious-projection-table/2010/percent/all/'

#package and send the request, store the page text as r
#  the timeout keeps the cell from hanging forever on a dead connection and
#  raise_for_status() reports an HTTP error here rather than letting an error
#  page be parsed as if it were the data page
response = requests.get(url, timeout=30)
response.raise_for_status()
r = response.text

#soup it; the next cell looks the table up in this soup
soup = BeautifulSoup(r, features='lxml')


In [8]:
#Build df_rel from the Pew page: one row per country (the index), one column per religion.

#find the table
MyTable = soup.find('table',{'class':'adaptive sortable stickyHeader highlight'})
if MyTable is None:
    #soup.find returns None when nothing matches; fail with a clear message
    raise ValueError("Pew religion table not found in the downloaded page")

#Convert back to string for pandas
soup_string = str(MyTable)

#read the table html into pandas as a list of DataFrames
#  wrapped in StringIO because newer pandas deprecates passing literal html
dfl = pd.read_html(StringIO(soup_string))

#Make it pretty
#  We don't want continents or whole world data, so skip the leading summary rows.
#  iloc[8:] skips the first 8 rows (positions 0-7).
#  NOTE(review): the original comment said 7 rows but the slice skips 8 - confirm against the live table.
#  .copy() gives an independent frame, so the column assignment below no longer
#  raises "A value is trying to be set on a copy of a slice from a DataFrame".
df_rel = dfl[0].iloc[8:,].copy()

#manual fix for different names discovered later in the code
#  keys are regex patterns for the Pew spelling, values are the GPI spelling
#  the Burma key is a raw string so '\(' is a regex escape, not an invalid Python escape
name_rep_dict = {'Bosnia-Herzegovina': 'Bosnia and Herzegovina',
                 'Ivory Coast': 'Cote d\'Ivoire',
                 'Czech Republic': 'Czechia',
                 'Timor-Leste': 'East Timor',
                 'Swaziland': 'Eswatini',
                 'Kyrgyzstan': 'Kyrgyz Republic',
                 r'Burma \(Myanmar\)': 'Myanmar',
                 'Republic of Macedonia': 'North Macedonia',
                 'Palestinian territories': 'Palestine',
                 'Gambia': 'The Gambia'
                }
#  regex=True is spelled out because the keys are written as patterns and the
#  default of str.replace differs between pandas versions
for pew_name, gpi_name in name_rep_dict.items():
    df_rel['Country'] = df_rel['Country'].str.replace(pew_name, gpi_name, regex=True)

#GPI rows that had no Pew match before the renames above (country, rank, score, then NaN):
##Bosnia and Herzegovina        81       2.019         NaN      NaN   
##Cote d'Ivoire                107       2.203         NaN      NaN   
##Czechia                       11       1.383         NaN      NaN   
##East Timor                    63       1.914         NaN      NaN   
##Eswatini                      72       1.986         NaN      NaN   
##Kyrgyz Republic               95       2.105         NaN      NaN   
##Myanmar                      125       2.393         NaN      NaN   
##North Macedonia               65       1.933         NaN      NaN   
##Palestine                    142       2.608         NaN      NaN   
##The Gambia                    62       1.908         NaN      NaN   

#sort by country name
df_rel = df_rel.sort_values(by=['Country'])

#drop the 'Unnamed: 0' column that read_html produced; it is not used
df_rel = df_rel.drop('Unnamed: 0', axis=1)

#make country name the new index value
df_rel = df_rel.set_index('Country')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
#Every Pew data column arrives as text because the values carry '%', '<' or '>' markers.
#Strip each marker from every column in turn, then convert the columns to numbers.
for symbol in ('%', '<', '>'):
    df_rel = df_rel.apply(lambda column: column.str.replace(symbol, ''))
df_rel = df_rel.apply(pd.to_numeric)


In [10]:
#Join the GPI and Pew frames side by side, aligned on the country-name index
df = pd.concat([df_peace, df_rel], axis=1, sort=True)

#The join leaves NaN wherever a country appears in only one source.
#Rows with NaN in rank/score are cleared in a later cell.
#Before that, check for countries that exist in both sources under different
#spellings; those were found and fixed by hand (see the rename dictionary used
#when building the Pew frame).

#Sanity check: list every row that has no Pew value.
#  Column 5 is an arbitrary Pew column; the comparison is False for NaN,
#  so rows with Pew data are marked True and then removed from the view.
sancheck = df.iloc[:, 5].ge(0)
dropthese = df.loc[sancheck].index
dfsancheck = df.drop(dropthese)

#Expected output is an empty DataFrame.  If rows appear, check names for errors
print(dfsancheck)

#First run of this check (before the renames) listed these GPI countries,
#which are in the Pew data under a different name:
##Bosnia and Herzegovina        81       2.019         NaN      NaN   
##Cote d'Ivoire                107       2.203         NaN      NaN   
##Czechia                       11       1.383         NaN      NaN   
##East Timor                    63       1.914         NaN      NaN   
##Eswatini                      72       1.986         NaN      NaN   
##Kyrgyz Republic               95       2.105         NaN      NaN   
##Myanmar                      125       2.393         NaN      NaN   
##North Macedonia               65       1.933         NaN      NaN   
##Palestine                    142       2.608         NaN      NaN   
##The Gambia                    62       1.908         NaN      NaN   

#With the names fixed upstream, the check above should now list nothing.

print(df.info())

Empty DataFrame
Columns: [2019 rank, 2019 score, Christians, Muslims, Unaffiliated, Hindus, Buddhists, Folk Religions, Other, Jews, All]
Index: []
<class 'pandas.core.frame.DataFrame'>
Index: 234 entries, Afghanistan to Zimbabwe
Data columns (total 11 columns):
2019 rank         163 non-null float64
2019 score        163 non-null float64
Christians        234 non-null float64
Muslims           234 non-null float64
Unaffiliated      234 non-null float64
Hindus            234 non-null float64
Buddhists         234 non-null float64
Folk Religions    234 non-null float64
Other             234 non-null float64
Jews              234 non-null float64
All               234 non-null float64
dtypes: float64(11)
memory usage: 31.9+ KB
None


In [11]:
#The Pew data covers more entries than the GPI, so rank and score are NaN for
#the extras.  Keep only the rows that are complete in every column.
df = df.dropna()

#The rank column became float during the concat (the NaN rows introduced there
#cannot be stored in an int column), so cast it back to int now they are gone.
df = df.astype({df.columns[0]: int})
print(df.info())

#Now we have a clean 163 entries again, the same as the original GPI data
#  every cell must be populated and non-negative
assert df.notnull().all().all()
assert (df >= 0).all().all()

<class 'pandas.core.frame.DataFrame'>
Index: 163 entries, Afghanistan to Zimbabwe
Data columns (total 11 columns):
2019 rank         163 non-null int64
2019 score        163 non-null float64
Christians        163 non-null float64
Muslims           163 non-null float64
Unaffiliated      163 non-null float64
Hindus            163 non-null float64
Buddhists         163 non-null float64
Folk Religions    163 non-null float64
Other             163 non-null float64
Jews              163 non-null float64
All               163 non-null float64
dtypes: float64(10), int64(1)
memory usage: 15.3+ KB
None


In [12]:
#Write the merged table to disk; the analysis is done by a separate program that imports this CSV
df.to_csv(path_or_buf='gpi_by_rel.csv')