In [1]:
import pandas as pd
import re
import requests
from geopy.geocoders import Nominatim
import ast
import country_converter as coco
from pathlib import Path 
import geopandas as gpd

# Turn off pandas' chained-assignment check for the whole notebook.
pd.set_option("mode.chained_assignment", None)

In [2]:
import os

# All inputs and outputs are resolved relative to the kernel's current working directory.
directory = cur_path = os.getcwd()
directory


'/home/amber/projects/religious_history_project_work'

In [3]:
# Load the lines of api.txt; the first line is used as the geocoding API key.
with open(directory + '/api.txt') as key_file:
    lines = list(key_file)


In [4]:

# readlines() keeps the line terminator, so strip surrounding whitespace;
# otherwise the key would be sent to the API with a trailing newline.
API_KEY = lines[0].strip()


Here I am loading the csv file which was generated by the scraping script. In this dataset, there are three columns:
- region, representing the geographical location of the religious group. This can be a country, city or a continent.
- Group, is the name of the religious group (or religious text, place). 
- details, containing the year range over which the religious group has existed (up to 2022 CE), the source reporting the group, and the designation (whether it is a religious group, a religious place, or a religious text).

All three columns need some cleaning and updating before this data is ready for EDA. Below is a summary of all the cleaning work:
 
1. Extracted the source from details, if present into a new column. 
2. Extracted the year range into a separate column, further split the range into a start and end date for easier tracing while plotting.
3. Removed leading and trailing whitespaces if present.
4. Made two new binary (0/1) columns indicating whether a 'Group' is a religious group, a religious place, or both.
5. The region information was ambiguous, as region can be city, country or a continent. This proved to be the most difficult cleaning task because:
    - some of the region names were old region names, that are not used any more, for example al-andalus (now Spain). I tried a few different packages, the google geocoding API worked best, as it was able to pick up regions using their former names. The api didn't work for all such regions, and I manually mapped the missing ones, using the nearest modern country name. 
    - The region information was used to create a new continent column using the country_converter library.
    - There is no current-country column; the latitude and longitude coordinates will be sufficient for mapping the groups.

In [5]:
# Load the scraped dataset (columns: region, Group, details).
scraped_csv_path = directory + '/scraped_religion_data.csv'
raw_data = pd.DataFrame(pd.read_csv(scraped_csv_path))
raw_data

Unnamed: 0,region,Group,details
0,afghanistan,Chishti Sufis,"['Supervised by: Stephen Christopher', '1200 C..."
1,africa,African Initiated Churches,"['1900 CE - 2022 CE', 'Religious Group, pentec..."
2,africa,Ancient Egypt - Early Dynastic Period,"['2950 BCE - 2670 BCE', 'Religious Group, Afri..."
3,africa,Ancient Egypt - First Intermediate Period,"['2168 BCE - 2010 BCE', 'Religious Group, Afri..."
4,africa,Ancient Egypt - Old Kingdom,"['2670 BCE - 2168 BCE', 'Religious Group, Afri..."
...,...,...,...
2779,al-andalus,Qiṣṣat Dhī al-Qarnayn,"['850 CE - 1250 CE', 'Religious Group, Text, A..."
2780,al-andalus,"كتاب"" المقدّمة ""لابن خلدون livre Mou qaddîma d...","['1332 CE - 1406 CE', ""Religious Group, Text, ..."
2781,southern-levant,Religion in the Early Bronze Age / Canaan,"['3600 BCE - 2400 BCE', 'Religious Group, Syro..."
2782,southern-levant,Synagogue at Gamla,"['20 BCE - 67 CE', 'Religious Group, Religious..."


In [6]:
# Split every `details` cell into source, timeframe and attributes, which become
# separate columns in the cleaned dataframe. Each cell is the string form of a
# list, and the list length determines its layout:
#   4 items -> [source, <ignored>, timeframe, attributes]
#   3 items -> [source, timeframe, attributes]
#   2 items -> [timeframe, attributes]   (no source reported)
extract_components = raw_data['details']
source = []
timeframe = []
attributes = []
for position, raw_details in enumerate(extract_components):
    details = ast.literal_eval(raw_details)
    if len(details) == 4:
        source.append(details[0])
        timeframe.append(details[2])
        attributes.append(details[3])
    elif len(details) == 3:
        source.append(details[0])
        timeframe.append(details[1])
        attributes.append(details[2])
    elif len(details) == 2:
        source.append("No Source")
        timeframe.append(details[0])
        attributes.append(details[1])
    else:
        # Skipping the row would leave the three lists shorter than raw_data and
        # shift every later value onto the wrong row, so fail loudly instead.
        raise ValueError(
            f"Unexpected 'details' layout with {len(details)} items "
            f"at position {position}: {details!r}"
        )

In [7]:
# Assemble the first cleaned frame from the raw columns and the extracted lists.
data_clean_1 = pd.DataFrame(
    {
        'Region': raw_data['region'],
        'Name': raw_data['Group'],
        'Source': source,
        'TimeFrame': timeframe,
        'Attributes': attributes,
    }
)

In [8]:
# Trim the attribute string, then drop any whitespace around the commas that
# separate the individual attributes.
data_clean_1['Attributes'] = (
    data_clean_1['Attributes']
    .str.strip()
    .str.replace(r'\s*,\s*', ',', regex=True)
)

In [9]:
# Add 0/1 flag columns telling whether the attribute string mentions
# 'Religious Group' and/or 'Religious Place'.
for flag_column, label in (('ReligiousGroup', 'Religious Group'),
                           ('ReligiousPlace', 'Religious Place')):
    data_clean_1[flag_column] = data_clean_1['Attributes'].apply(
        lambda attrs, label=label: int(label in attrs)
    )
data_clean_1

Unnamed: 0,Region,Name,Source,TimeFrame,Attributes,ReligiousGroup,ReligiousPlace
0,afghanistan,Chishti Sufis,Supervised by: Stephen Christopher,1200 CE - 2020 CE,"Religious Group,Sufi,Islamic Traditions,South ...",1,0
1,africa,African Initiated Churches,No Source,1900 CE - 2022 CE,"Religious Group,pentecostal,African Religions,...",1,0
2,africa,Ancient Egypt - Early Dynastic Period,No Source,2950 BCE - 2670 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0
3,africa,Ancient Egypt - First Intermediate Period,No Source,2168 BCE - 2010 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0
4,africa,Ancient Egypt - Old Kingdom,No Source,2670 BCE - 2168 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0
...,...,...,...,...,...,...,...
2779,al-andalus,Qiṣṣat Dhī al-Qarnayn,No Source,850 CE - 1250 CE,"Religious Group,Text,Apocalyptic Literature an...",1,0
2780,al-andalus,"كتاب"" المقدّمة ""لابن خلدون livre Mou qaddîma d...",No Source,1332 CE - 1406 CE,"Religious Group,Text,فقه,Islamic Theology,Rule...",1,0
2781,southern-levant,Religion in the Early Bronze Age / Canaan,No Source,3600 BCE - 2400 BCE,"Religious Group,Syro-Palestinian Religion,Anci...",1,0
2782,southern-levant,Synagogue at Gamla,No Source,20 BCE - 67 CE,"Religious Group,Religious Place,Levantine Reli...",1,1


In [10]:
# Attribute lists without the designations that now live in their own columns.
# Keys are row positions. A row only gets an entry when at least one of its
# attributes does not mention 'Religious Group'; rows without an entry show up
# as NaN once the dictionary is turned into a column.
attributes_n = []
value_to_remove = 'Religious Place'
cleaned_attributes_dict = {}

for idx, attr in enumerate(data_clean_1['Attributes']):
    kept = [part for part in attr.split(",") if 'Religious Group' not in part]
    if kept:
        # Exact matches of 'Religious Place' are dropped as well; this can
        # leave an empty list behind.
        cleaned_attributes_dict[idx] = [part for part in kept if part != value_to_remove]

In [11]:
# Index-aligned assignment: positions missing from the dictionary become NaN.
data_clean_1['CleanedAttributes'] = cleaned_attributes = pd.Series(cleaned_attributes_dict)

In [12]:
# Quick NaN check: NaN is a float, so any float entry is printed together with
# its position; list entries are flattened into all_attrs. Swap the column name
# to check a different column.
all_attrs = []
for idx, x_lst in enumerate(data_clean_1['CleanedAttributes']):
    if not isinstance(x_lst, float):
        all_attrs.extend(x_lst)
        continue
    print(x_lst)
    print(idx)

nan
1308
nan
1467


In [13]:
# Keep only the rows that still have an attribute list (drops the NaN rows found above).
has_attributes = data_clean_1['CleanedAttributes'].notna()
data_clean_1 = data_clean_1.loc[has_attributes]
data_clean_1

Unnamed: 0,Region,Name,Source,TimeFrame,Attributes,ReligiousGroup,ReligiousPlace,CleanedAttributes
0,afghanistan,Chishti Sufis,Supervised by: Stephen Christopher,1200 CE - 2020 CE,"Religious Group,Sufi,Islamic Traditions,South ...",1,0,"[Sufi, Islamic Traditions, South Asian Religio..."
1,africa,African Initiated Churches,No Source,1900 CE - 2022 CE,"Religious Group,pentecostal,African Religions,...",1,0,"[pentecostal, African Religions, Christian Tra..."
2,africa,Ancient Egypt - Early Dynastic Period,No Source,2950 BCE - 2670 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0,"[African Religions, Egyptian Religions]"
3,africa,Ancient Egypt - First Intermediate Period,No Source,2168 BCE - 2010 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0,"[African Religions, Egyptian Religions]"
4,africa,Ancient Egypt - Old Kingdom,No Source,2670 BCE - 2168 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0,"[African Religions, Egyptian Religions]"
...,...,...,...,...,...,...,...,...
2779,al-andalus,Qiṣṣat Dhī al-Qarnayn,No Source,850 CE - 1250 CE,"Religious Group,Text,Apocalyptic Literature an...",1,0,"[Text, Apocalyptic Literature and 6 more]"
2780,al-andalus,"كتاب"" المقدّمة ""لابن خلدون livre Mou qaddîma d...",No Source,1332 CE - 1406 CE,"Religious Group,Text,فقه,Islamic Theology,Rule...",1,0,"[Text, فقه, Islamic Theology, Rule Text, Relig..."
2781,southern-levant,Religion in the Early Bronze Age / Canaan,No Source,3600 BCE - 2400 BCE,"Religious Group,Syro-Palestinian Religion,Anci...",1,0,"[Syro-Palestinian Religion, Ancient Mediterran..."
2782,southern-levant,Synagogue at Gamla,No Source,20 BCE - 67 CE,"Religious Group,Religious Place,Levantine Reli...",1,1,"[Levantine Religion, Archaeological Site, Syna..."


In [14]:
# Split the TimeFrame column ("<start> - <end>") into start and end strings.
# Only years are reported, so they are later stored as signed integers
# (CE positive, BCE negative) to build one timeline for plotting.
#
# The range is split once per row and the two halves are picked from the
# resulting list; wrapping each value in pd.Series is not needed to fill a
# single column.
year_range_parts = data_clean_1['TimeFrame'].map(lambda span: re.split(' - ', span))
data_clean_1['Start_year'] = year_range_parts.map(lambda parts: parts[0])
data_clean_1['End_year'] = year_range_parts.map(lambda parts: parts[1])

def year_by_era(year_with_era):
    """Convert a year string such as '2950 BCE' or '67 CE' to a signed integer.

    The first run of digits in the string is the year; it is negated when the
    string contains 'BCE'. Raises IndexError when the string has no digits.
    """
    magnitude = int(re.findall('[0-9]+', year_with_era)[0])
    return -magnitude if 'BCE' in year_with_era else magnitude

In [15]:
# Replace the era-tagged year strings with signed integers (BCE -> negative).
for year_column in ('Start_year', 'End_year'):
    data_clean_1[year_column] = data_clean_1[year_column].apply(year_by_era)
data_clean_1

Unnamed: 0,Region,Name,Source,TimeFrame,Attributes,ReligiousGroup,ReligiousPlace,CleanedAttributes,Start_year,End_year
0,afghanistan,Chishti Sufis,Supervised by: Stephen Christopher,1200 CE - 2020 CE,"Religious Group,Sufi,Islamic Traditions,South ...",1,0,"[Sufi, Islamic Traditions, South Asian Religio...",1200,2020
1,africa,African Initiated Churches,No Source,1900 CE - 2022 CE,"Religious Group,pentecostal,African Religions,...",1,0,"[pentecostal, African Religions, Christian Tra...",1900,2022
2,africa,Ancient Egypt - Early Dynastic Period,No Source,2950 BCE - 2670 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0,"[African Religions, Egyptian Religions]",-2950,-2670
3,africa,Ancient Egypt - First Intermediate Period,No Source,2168 BCE - 2010 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0,"[African Religions, Egyptian Religions]",-2168,-2010
4,africa,Ancient Egypt - Old Kingdom,No Source,2670 BCE - 2168 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0,"[African Religions, Egyptian Religions]",-2670,-2168
...,...,...,...,...,...,...,...,...,...,...
2779,al-andalus,Qiṣṣat Dhī al-Qarnayn,No Source,850 CE - 1250 CE,"Religious Group,Text,Apocalyptic Literature an...",1,0,"[Text, Apocalyptic Literature and 6 more]",850,1250
2780,al-andalus,"كتاب"" المقدّمة ""لابن خلدون livre Mou qaddîma d...",No Source,1332 CE - 1406 CE,"Religious Group,Text,فقه,Islamic Theology,Rule...",1,0,"[Text, فقه, Islamic Theology, Rule Text, Relig...",1332,1406
2781,southern-levant,Religion in the Early Bronze Age / Canaan,No Source,3600 BCE - 2400 BCE,"Religious Group,Syro-Palestinian Religion,Anci...",1,0,"[Syro-Palestinian Religion, Ancient Mediterran...",-3600,-2400
2782,southern-levant,Synagogue at Gamla,No Source,20 BCE - 67 CE,"Religious Group,Religious Place,Levantine Reli...",1,1,"[Levantine Religion, Archaeological Site, Syna...",-20,67


In [30]:
# Spot check: the same entry is listed under more than one region.
name_has_al_masjed = data_clean_1['Name'].str.contains('al-Masjed')
data_clean_1.loc[name_has_al_masjed]

Unnamed: 0,Region,Name,Source,TimeFrame,Attributes,ReligiousGroup,ReligiousPlace,CleanedAttributes,Start_year,End_year
577,asia,الكعبة : المسجد الحرام al-Ka'ba : al-Masjed al...,No Source,2000 BCE - 2023 CE,"Religious Group,Religious Place,Arabia,Abraham...",1,1,"[Arabia, Abrahamic, Arabian Religions, Islamic...",-2000,2023
825,western-asia,الكعبة : المسجد الحرام al-Ka'ba : al-Masjed al...,No Source,2000 BCE - 2023 CE,"Religious Group,Religious Place,Arabia,Abraham...",1,1,"[Arabia, Abrahamic, Arabian Religions, Islamic...",-2000,2023


## Getting Latitude and Longitude

This task was slightly more complicated than a simple call to the geocoding API due to the following reasons:
- some of the region names were old region names, that are not used any more, for example al-andalus (now Spain). I tried a few different packages, the google geocoding API worked best, as it was able to pick up some regions using their former names. The api didn't work for all such regions, and I manually mapped the missing ones, using the nearest modern country name.
    - I first found all the region names for which the API returned invalid results. Then I created a dictionary mapping these regions to the modern day country names. And then I used the API to get latitude longitude information for all regions and add it to the cleaned dataframe.

In [16]:
# On to geocoding now.
# get_coordinates looks up every distinct region name once and returns a
# dictionary {region name: [latitude, longitude, country name]}.
def get_coordinates(addresses, timeout=10):
    """Geocode each distinct region name with the Google Geocoding API.

    Parameters
    ----------
    addresses : iterable of str
        Region names as they appear in the data; duplicates are queried once.
    timeout : float, optional
        Seconds to wait for each HTTP response before requests raises.

    Returns
    -------
    dict
        Maps each region name to ``[lat, lon, country_name]``. Lookups whose
        status is not 'OK' map to ``[None, None, None]``. Results are always
        keyed by the name as given in ``addresses``: 'al-andalus' is queried as
        'andalus' but stored under 'al-andalus' whether or not the lookup
        succeeds.
    """
    base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'
    visited = set()
    coordinate_dictionary = {}
    for source_name in addresses:
        if source_name in visited:
            continue
        visited.add(source_name)

        # The API does not resolve the hyphenated historical name.
        query = 'andalus' if source_name == 'al-andalus' else source_name
        params = {'key': API_KEY, 'address': query}
        response = requests.get(base_url, params=params, timeout=timeout).json()

        if response['status'] != 'OK':
            coordinate_dictionary[source_name] = [None, None, None]
            continue

        top_hit = response['results'][0]
        address_parts = [part.strip() for part in top_hit['formatted_address'].split(',')]
        # The country is the last component, unless that component is purely
        # numeric (e.g. a postal code), in which case the one before it is used.
        country_name = address_parts.pop()
        if country_name.isdigit() and address_parts:
            country_name = address_parts.pop()

        location = top_hit['geometry']['location']
        coordinate_dictionary[source_name] = [location['lat'], location['lng'], country_name]
    return coordinate_dictionary

In [17]:
def get_invalid_region_names(addresses, timeout=10):
    """Return the distinct region names the Google Geocoding API cannot resolve.

    Parameters
    ----------
    addresses : iterable of str
        Region names; each distinct name is queried once.
    timeout : float, optional
        Seconds to wait for each HTTP response before requests raises.

    Returns
    -------
    list of str
        Names whose response status was anything other than 'OK', in order of
        first appearance.
    """
    base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'
    visited = set()
    nonexistent_list = []
    for address in addresses:
        if address in visited:
            continue
        visited.add(address)

        params = {'key': API_KEY, 'address': address}
        response = requests.get(base_url, params=params, timeout=timeout).json()
        # NOTE(review): every non-'OK' status counts as "invalid", so quota or
        # key errors would be reported as unknown regions too; confirm this is
        # acceptable.
        if response['status'] != 'OK':
            nonexistent_list.append(address)
    return nonexistent_list

In [18]:
# Ask the API about every region in the data and collect the names it rejects.
all_source_region = list(data_clean_1['Region'])
invalid_region_names = get_invalid_region_names(all_source_region)
invalid_region_unique = [*set(invalid_region_names)]
invalid_region_unique

['transnational',
 'dalmatia',
 'jiankang',
 'el-asasif',
 'galilee',
 'jordan-valley',
 'gaul',
 'chaldea',
 'canaan',
 'phrygia',
 'al-andalus',
 'java',
 'mesoamerica',
 'prydain',
 'byzantium',
 'pictland',
 'boeotia',
 'worms',
 'kition',
 'global',
 'thrace',
 'persianate-world',
 'anatolia',
 'amdo',
 'iberia',
 'judean-desert',
 'amarna',
 'edom',
 'al-qadisiyah',
 'edessan-region',
 'caanan',
 'aegean',
 'asia-minor',
 'tamilakam',
 'occidente-de-mesoamérica',
 'marathon',
 'north-asia',
 'cybersect',
 'song-dynasty',
 'sichuan-basin',
 'magna-graecia',
 'lower-48',
 'international',
 'etruria',
 'babylonia']

In [31]:
# Source region name -> name that is sent to the geocoding API instead.
# Keys must match the region names exactly as they appear in the data.
# NOTE(review): 'antiloch' -> 'Greece' and the spelling 'Britian' are kept
# unchanged; confirm they are intended.
manually_map_unknown = {
    'al-andalus': 'andalus', 'al-qadisiyah': 'Iraq', 'amdo': 'China', 'anatolia': 'Turkey',
    'north-asia': 'Russia', 'asia-minor': 'Turkey', 'antiloch': 'Greece', 'attica': 'Greece',
    'laconia': 'Greece', 'aegean': 'Greece',
    'naples': 'Italy',  # was 'Greece'; Naples lies in modern Italy
    'kition': 'Cyprus',
    'byzantium': 'Turkey', 'caanan': 'Israel', 'canaan': 'Israel', 'carthage': 'Tunisia',
    'jiankang': 'China', 'song-china': 'China', 'dalmatia': 'Croatia', 'edessan-region': 'Syria',
    'amarna': 'Egypt', 'white-monastery': 'Egypt', 'el-asasif': 'Egypt', 'galicia': 'Spain',
    'galilee': 'Israel',
    'gaul': 'France',  # was 'Israel'; Gaul corresponds to modern France
    'worms': 'Germany', 'global': 'global', 'cybersect': 'global', 'boeotia': 'Greece',
    'marathon': 'Greece', 'thrace': 'Greece', 'iberia': 'Spain', 'international': 'global',
    'magna-graecia': 'Italy', 'java': 'Indonesia', 'edom': 'Israel', 'jordan-valley': 'Jordan',
    'judean-desert': 'Israel', 'mesoamerica': 'Belize', 'babylonia': 'Iraq', 'chaldea': 'Iraq',
    'occidente-de-mesoamérica': 'Belize', 'persianate-world': 'Iran', 'phrygia': 'Turkey',
    'pictland': 'Scotland', 'prydain': 'Britian', 'sichuan-basin': 'China',
    'song-dynasty': 'China', 'southwest-china': 'China', 'tamilakam': 'India',
    'transnational': 'global', 'lower-48': 'USA', 'etruria': 'Italy', 'kyushu': 'Japan',
}
# On to geocoding now.
# get_coordinates2 looks up every distinct region name once (after applying the
# manual overrides) and returns a dictionary of four parallel lists: current
# region name, latitude, longitude and the region name from the source.
def get_coordinates2(addresses, manual_add_dict, timeout=10):
    """Geocode each distinct region name, applying manual name overrides first.

    Parameters
    ----------
    addresses : iterable of str
        Region names as they appear in the data; duplicates are queried once.
    manual_add_dict : dict
        Maps a source region name to the name that should be sent to the API
        instead. Names without a (truthy) entry are sent unchanged.
    timeout : float, optional
        Seconds to wait for each HTTP response before requests raises.

    Returns
    -------
    dict
        Keys 'CurrentRegion', 'Latitude', 'Longitude' and 'Region', each a list
        with one entry per distinct source name in order of first appearance.
        The first three are None for lookups whose status is not 'OK'. All four
        keys are present even when ``addresses`` is empty, so the result always
        converts to a DataFrame with these columns.
    """
    base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'
    coordinate_dictionary = {'CurrentRegion': [], 'Latitude': [], 'Longitude': [], 'Region': []}
    visited = set()
    for source_name in addresses:
        if source_name in visited:
            continue
        visited.add(source_name)

        query = manual_add_dict.get(source_name) or source_name
        params = {'key': API_KEY, 'address': query}
        response = requests.get(base_url, params=params, timeout=timeout).json()

        region_name = lat = lon = None
        if response['status'] == 'OK':
            top_hit = response['results'][0]
            address_parts = [part.strip() for part in top_hit['formatted_address'].split(',')]
            # The region is the last component, unless that component is purely
            # numeric (e.g. a postal code); then the one before it is used.
            region_name = address_parts.pop()
            if region_name.isdigit() and address_parts:
                region_name = address_parts.pop()
            location = top_hit['geometry']['location']
            lat, lon = location['lat'], location['lng']

        coordinate_dictionary['CurrentRegion'].append(region_name)
        coordinate_dictionary['Latitude'].append(lat)
        coordinate_dictionary['Longitude'].append(lon)
        coordinate_dictionary['Region'].append(source_name)
    return coordinate_dictionary

In [32]:

# Invalid region names that still lack a manual mapping (expected to be empty).
g = [e for e in invalid_region_unique if e not in manually_map_unknown]
g

[]

In [33]:
# Geocode every region in the cleaned data, using the manual overrides.
addresses_list = data_clean_1['Region'].tolist()
coordinates_and_region_names_dict = get_coordinates2(addresses_list, manually_map_unknown)




In [34]:
# Inspect the geocoding result returned by get_coordinates2.
coordinates_and_region_names_dict

{'CurrentRegion': ['Afghanistan',
  'Africa',
  'Central African Republic',
  'East Africa',
  'Egypt',
  'Central Africa',
  'North Africa',
  'Southern Africa',
  'West Africa',
  'Iraq',
  'Algeria',
  'China',
  'Türkiye',
  'USA',
  'Persian Gulf (also known as the Arabian Gulf)',
  'Arabia',
  'Asia',
  'Caucasus',
  'Central Asia',
  'East Asia',
  'Himalayas',
  'Asia',
  'Russia',
  'South Asia',
  'South East Asia',
  'Asia',
  'Asia',
  'Türkiye',
  'India',
  'Australia',
  'Iraq',
  'Balkans',
  'Bangladesh',
  'Iraq',
  'Borneo',
  'Brazil',
  'Türkiye',
  'Türkiye',
  'Israel',
  'Israel',
  'Canada',
  'Tunisia',
  'Azerbaijan',
  'China',
  'Vietnam',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'China',
  'Cyprus',
  'Cyprus',
  

In [35]:
# One row per distinct source region; the dictionary keys become the columns.
coordinates_and_region_names_df = pd.DataFrame(coordinates_and_region_names_dict)
coordinates_and_region_names_df


Unnamed: 0,CurrentRegion,Latitude,Longitude,Region
0,Afghanistan,33.939110,67.709953,afghanistan
1,Africa,-8.783195,34.508523,africa
2,Central African Republic,6.611111,20.939444,central-africa
3,East Africa,1.957709,37.297204,eastern-africa
4,Egypt,26.820553,30.802498,egypt
...,...,...,...,...
303,UK,52.130661,-3.783712,wales
304,Asia,34.047863,100.619655,west-asia
305,India,20.593684,78.962880,western-coast-of-india
306,Yemen,15.552727,48.516388,yemen


In [36]:
# Regions for which no latitude was obtained.
missing_rows = coordinates_and_region_names_df['Latitude'].isna()
coordinates_and_region_names_df.loc[missing_rows]

Unnamed: 0,CurrentRegion,Latitude,Longitude,Region
118,,,,global
119,,,,cybersect
154,,,,international
275,,,,transnational


In [37]:
# Persist the geocoding results next to the notebook.
filepath = Path(f"{directory}/lat_lon_region.csv")
coordinates_and_region_names_df.to_csv(filepath)

In [38]:
# Reload the saved geocoding results so the notebook can be resumed from here
# without repeating the API requests. The path is relative to the current
# working directory.
cached_coordinates_csv = 'lat_lon_region.csv'
coordinates_and_region_names_df = pd.read_csv(cached_coordinates_csv, index_col=0)
coordinates_and_region_names_df

Unnamed: 0,CurrentRegion,Latitude,Longitude,Region
0,Afghanistan,33.939110,67.709953,afghanistan
1,Africa,-8.783195,34.508523,africa
2,Central African Republic,6.611111,20.939444,central-africa
3,East Africa,1.957709,37.297204,eastern-africa
4,Egypt,26.820553,30.802498,egypt
...,...,...,...,...
303,UK,52.130661,-3.783712,wales
304,Asia,34.047863,100.619655,west-asia
305,India,20.593684,78.962880,western-coast-of-india
306,Yemen,15.552727,48.516388,yemen


In [39]:
# Spot check: how was the 'asia-minor' region resolved?
region_mentions_minor = coordinates_and_region_names_df['Region'].str.contains('minor')
coordinates_and_region_names_df.loc[region_mentions_minor]

Unnamed: 0,CurrentRegion,Latitude,Longitude,Region
27,Türkiye,38.963745,35.243322,asia-minor


In [40]:
# Cleaned data before the coordinates are merged in.
data_clean_1

Unnamed: 0,Region,Name,Source,TimeFrame,Attributes,ReligiousGroup,ReligiousPlace,CleanedAttributes,Start_year,End_year
0,afghanistan,Chishti Sufis,Supervised by: Stephen Christopher,1200 CE - 2020 CE,"Religious Group,Sufi,Islamic Traditions,South ...",1,0,"[Sufi, Islamic Traditions, South Asian Religio...",1200,2020
1,africa,African Initiated Churches,No Source,1900 CE - 2022 CE,"Religious Group,pentecostal,African Religions,...",1,0,"[pentecostal, African Religions, Christian Tra...",1900,2022
2,africa,Ancient Egypt - Early Dynastic Period,No Source,2950 BCE - 2670 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0,"[African Religions, Egyptian Religions]",-2950,-2670
3,africa,Ancient Egypt - First Intermediate Period,No Source,2168 BCE - 2010 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0,"[African Religions, Egyptian Religions]",-2168,-2010
4,africa,Ancient Egypt - Old Kingdom,No Source,2670 BCE - 2168 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0,"[African Religions, Egyptian Religions]",-2670,-2168
...,...,...,...,...,...,...,...,...,...,...
2779,al-andalus,Qiṣṣat Dhī al-Qarnayn,No Source,850 CE - 1250 CE,"Religious Group,Text,Apocalyptic Literature an...",1,0,"[Text, Apocalyptic Literature and 6 more]",850,1250
2780,al-andalus,"كتاب"" المقدّمة ""لابن خلدون livre Mou qaddîma d...",No Source,1332 CE - 1406 CE,"Religious Group,Text,فقه,Islamic Theology,Rule...",1,0,"[Text, فقه, Islamic Theology, Rule Text, Relig...",1332,1406
2781,southern-levant,Religion in the Early Bronze Age / Canaan,No Source,3600 BCE - 2400 BCE,"Religious Group,Syro-Palestinian Religion,Anci...",1,0,"[Syro-Palestinian Religion, Ancient Mediterran...",-3600,-2400
2782,southern-levant,Synagogue at Gamla,No Source,20 BCE - 67 CE,"Religious Group,Religious Place,Levantine Reli...",1,1,"[Levantine Religion, Archaeological Site, Syna...",-20,67


In [41]:
# Attach coordinates to every row by its source region, then list the regions
# that ended up without a latitude.
data_clean_merged = data_clean_1.merge(coordinates_and_region_names_df, how='left', on='Region')
missing_lat = data_clean_merged['Latitude'].isna()
data_clean_merged.loc[missing_lat, 'Region'].unique()

array(['global', 'cybersect', 'international', 'transnational'],
      dtype=object)

In [42]:
# Cleaned data with CurrentRegion, Latitude and Longitude merged in.
data_clean_merged

Unnamed: 0,Region,Name,Source,TimeFrame,Attributes,ReligiousGroup,ReligiousPlace,CleanedAttributes,Start_year,End_year,CurrentRegion,Latitude,Longitude
0,afghanistan,Chishti Sufis,Supervised by: Stephen Christopher,1200 CE - 2020 CE,"Religious Group,Sufi,Islamic Traditions,South ...",1,0,"[Sufi, Islamic Traditions, South Asian Religio...",1200,2020,Afghanistan,33.939110,67.709953
1,africa,African Initiated Churches,No Source,1900 CE - 2022 CE,"Religious Group,pentecostal,African Religions,...",1,0,"[pentecostal, African Religions, Christian Tra...",1900,2022,Africa,-8.783195,34.508523
2,africa,Ancient Egypt - Early Dynastic Period,No Source,2950 BCE - 2670 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0,"[African Religions, Egyptian Religions]",-2950,-2670,Africa,-8.783195,34.508523
3,africa,Ancient Egypt - First Intermediate Period,No Source,2168 BCE - 2010 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0,"[African Religions, Egyptian Religions]",-2168,-2010,Africa,-8.783195,34.508523
4,africa,Ancient Egypt - Old Kingdom,No Source,2670 BCE - 2168 BCE,"Religious Group,African Religions,Egyptian Rel...",1,0,"[African Religions, Egyptian Religions]",-2670,-2168,Africa,-8.783195,34.508523
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2777,al-andalus,Qiṣṣat Dhī al-Qarnayn,No Source,850 CE - 1250 CE,"Religious Group,Text,Apocalyptic Literature an...",1,0,"[Text, Apocalyptic Literature and 6 more]",850,1250,Spain,37.544271,-4.727753
2778,al-andalus,"كتاب"" المقدّمة ""لابن خلدون livre Mou qaddîma d...",No Source,1332 CE - 1406 CE,"Religious Group,Text,فقه,Islamic Theology,Rule...",1,0,"[Text, فقه, Islamic Theology, Rule Text, Relig...",1332,1406,Spain,37.544271,-4.727753
2779,southern-levant,Religion in the Early Bronze Age / Canaan,No Source,3600 BCE - 2400 BCE,"Religious Group,Syro-Palestinian Religion,Anci...",1,0,"[Syro-Palestinian Religion, Ancient Mediterran...",-3600,-2400,Levant,34.075734,37.978459
2780,southern-levant,Synagogue at Gamla,No Source,20 BCE - 67 CE,"Religious Group,Religious Place,Levantine Reli...",1,1,"[Levantine Religion, Archaeological Site, Syna...",-20,67,Levant,34.075734,37.978459


## Get Continent


In [43]:
# Shared country_converter instance; get_continent uses it to turn a region name into an ISO3 code.
converter = coco.CountryConverter()

In [44]:
# Country table used for the continent lookup (its `iso_a3` and `continent`
# columns are read by get_continent).
# NOTE(review): the recorded output shows a warning for this call; confirm that
# `gpd.datasets` is still available in the installed geopandas version.
naturalearth_path = gpd.datasets.get_path("naturalearth_lowres")
world = gpd.read_file(naturalearth_path)

world.head()


  world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))


Unnamed: 0,pop_est,continent,name,iso_a3,gdp_md_est,geometry
0,889953.0,Oceania,Fiji,FJI,5496,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000..."
1,58005463.0,Africa,Tanzania,TZA,63177,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982..."
2,603253.0,Africa,W. Sahara,ESH,907,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948..."
3,37589262.0,North America,Canada,CAN,1736425,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742..."
4,328239523.0,North America,United States of America,USA,21433226,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000..."


In [35]:
#world[world['continent']=="Oceania"]

In [36]:
#f = world['name'][world['iso_a3']=="ESP"].values[0]
#f

In [45]:
def get_continent(region_name):
    """Return the continent for a region or country name, or None if unknown.

    If the lower-cased name contains a continent name (e.g. 'East Asia',
    'Oceania'), that continent is returned directly; the first match in
    ``continent_list`` order wins. Otherwise the name is converted to an ISO3
    code with the module-level ``converter`` and the continent is read from the
    module-level ``world`` table.

    Parameters
    ----------
    region_name : object
        Region or country name; non-string values are converted with ``str``.

    Returns
    -------
    str or None
        Continent name, or None when the name cannot be resolved to exactly one
        country present in ``world``.
    """
    continent_list = ['Asia', 'Africa', 'North America', 'South America', 'Antarctica', 'Europe', 'Oceania']
    # Convert before lower-casing so non-string inputs do not raise.
    region_name = str(region_name).lower()

    for continent in continent_list:
        if continent.lower() in region_name:
            return continent

    try:
        standardized_name = converter.convert(names=region_name, to='ISO3')
        # Several matches come back as a list; treat an ambiguous name as
        # unknown instead of comparing a list against the ISO3 column.
        if not isinstance(standardized_name, str):
            return None
        matches = world.loc[world['iso_a3'] == standardized_name, 'continent']
        if matches.empty:
            return None
        return matches.iloc[0]
    except LookupError:
        return None
    

# Example usage: 'France' contains no continent name, so it goes through the ISO3 lookup.
print(get_continent('France')) 

Europe


In [51]:
# The keys below are geocoded regions that span more than one country (or that
# need a different name for the continent lookup). For now, the groups in each
# of these regions are assigned to one country from the list of countries the
# region may belong to.
manually_map_countries = {
    'Arabia': 'Saudi Arabia',
    'Caucasus': 'Armenia',
    'Himalayas': 'Nepal',
    'Balkans': 'Albania',
    'Borneo': 'Indonesia',
    'Mediterranean Sea': 'Palestine',
    'Deir al Balah': 'Palestine',
    'British Isles': 'UK',
    'Scandinavia': 'Sweden',
    'Aegean Islands': 'Greece',
    'Konkan': 'India',
    'Caribbean': 'Jamaica',
    'Central America': 'Mexico',
    'Maghreb': 'Morocco',
    'Levant': 'Israel',
    'Middle East': 'Syria',
    'Melanesia': 'Oceania',
    'Polynesia': 'Oceania',
    'Micronesia': 'Oceania',
    'Scotland': 'UK',
    'Britian': 'UK',
    'Orinoco': 'Venezuela',
    'Patagonia': 'Argentina',
    'Pyrenees': 'Spain',
    'global': None,
    'Hong Kong': 'China',
    'French Guiana': 'Guyana',
    'Arctic': "United States",
    'USA': "United States",
}

In [52]:
# Rows whose CurrentRegion is one of the multi-country regions mapped above.
mask = data_clean_merged['CurrentRegion'].isin(list(manually_map_countries))
indexes = data_clean_merged.index[mask.to_numpy()]
indexes

Int64Index([ 283,  289,  290,  291,  292,  293,  580,  581,  582,  689,
            ...
            2739, 2740, 2741, 2742, 2743, 2744, 2749, 2779, 2780, 2781],
           dtype='int64', length=307)

In [53]:
# Swap each multi-country region for its chosen representative country.
current_region = data_clean_merged['CurrentRegion']
data_clean_merged['CurrentRegion'] = current_region.replace(to_replace=manually_map_countries)

In [59]:
# fix the regions for ancient groups that have ancient names, which map to current US cities with the google API. 
# I discovered this while plotting for EDA, where some ancient Roman and Greek groups were being mapped to USA. 

#manually_map_current_region = {'antiloch':'Greece','attica':'Greece','aegean':'Greece', 'laconia':'Greece','naples':'Rome'}
#index_list_for_update= data_clean_merged[data_clean_merged['Region'].str.contains('antiloch|attica|laconia|aegean|naples')].index.tolist()
#data_clean_merged.loc[index_list_for_update, 'CurrentRegion'] = 'Greece'



In [54]:

# Look up the continent for every known region; missing regions stay missing.
data_clean_merged["Continent"] = data_clean_merged["CurrentRegion"].apply(
    lambda region: region if pd.isnull(region) else get_continent(region)
)


In [55]:
# Rows that could not be assigned a continent.
continent_missing = data_clean_merged['Continent'].isna()
data_clean_merged.loc[continent_missing]

Unnamed: 0,Region,Name,Source,TimeFrame,Attributes,ReligiousGroup,ReligiousPlace,CleanedAttributes,Start_year,End_year,CurrentRegion,Latitude,Longitude,Continent
1506,global,Church of Jesus Christ of Latter-day Saints (m...,No Source,1951 CE - 2018 CE,"Religious Group,Christian Traditions",1,0,[Christian Traditions],1951,2018,,,,
1507,global,Digital Shinto Communities,No Source,2001 CE - 2021 CE,"Religious Group,Kami worship,Asian American Re...",1,0,"[Kami worship, Asian American Religions, techn...",2001,2021,,,,
1508,global,International Society for Krishna Consciousnes...,No Source,1966 CE - 2016 CE,"Religious Group,Indic Religious Traditions",1,0,[Indic Religious Traditions],1966,2016,,,,
1509,global,Jehovah's Witnesses,No Source,1870 CE - 2022 CE,"Religious Group,Apocalyptic Movements,American...",1,0,"[Apocalyptic Movements, American Christianity,...",1870,2022,,,,
1510,global,Swaminarayan Sampraday,No Source,1800 CE - 2017 CE,"Religious Group,Indic Religious Traditions",1,0,[Indic Religious Traditions],1800,2017,,,,
1511,global,"The Church of Christ, Scientist",Secondary source: Multiple References,1866 CE - 2020 CE,"Religious Group,Christian Traditions,New Relig...",1,0,"[Christian Traditions, New Religious Movement]",1866,2020,,,,
1512,global,The International Network of Engaged Buddhists...,No Source,1989 CE - 2022 CE,"Religious Group,Buddhist Traditions",1,0,[Buddhist Traditions],1989,2022,,,,
1513,global,Universal Fellowship of Metropolitan Community...,No Source,1968 CE - 2020 CE,"Religious Group,Christian Traditions",1,0,[Christian Traditions],1968,2020,,,,
1514,global,Wesleyanism,No Source,1738 CE - 2020 CE,"Religious Group,Wesleyanism,Methodism",1,0,"[Wesleyanism, Methodism]",1738,2020,,,,
1515,global,Supreme Master Ching Hai World Society,No Source,1986 CE - 2018 CE,"Religious Group,Buddhist Traditions,Christian ...",1,0,"[Buddhist Traditions, Christian Traditions, In...",1986,2018,,,,


In [56]:
# Write the final cleaned dataset used for the EDA.
filepath2 = Path(f"{directory}/religion_data_for_EDA.csv")
data_clean_merged.to_csv(filepath2)
