## Data-Cleaning and Analysis: Go to think tank index

In [34]:
import pandas as pd
import pycountry
import xlsxwriter

In [35]:
pd.options.display.max_colwidth = 100

In [36]:
file_path = 'data/macgann_thinktank_index_neu.xlsx'

## 1.1: functions

In [37]:
# function to encode the data: 
def clean_org_names(think_tank):
    dicts = {"Ã³": "ó",
             "â€“": "",
             "Ã§Ãµ": "çõ",
             "Ã§Ã£": "çã",
             "Ãº" : "ú",
             "Ã¡" : "á",
             "Å¡": "š",
             "Å¾" : "ž",
             "â€™" : "'",
             "Ã©" : "e",
             "Ã‰" : "é",
             "ã´": "ô"}
    for key, val in dicts.items():
        think_tank = think_tank.lower()
        think_tank = think_tank.replace(key.lower(),val)
    return think_tank


In [38]:
#country_rec: function to recognize country names in strings
def country_rec(name):
    add_countrynames = ["Palestine", "Jerusalem", 
                        "Russia", "Czech Republic", 
                        "Moldova", "Macedonia", 
                        "Republic of Korea", "Kosovo",
                        "Vietnam", "Taiwan",
                        "Bolivia", "Tanzania",
                        "Iran", "Brazil",
                        "uk", "venezuela",
                        "perú", "us",
                        "côte d'ivoire", "uae",
                        "south korea", "cote d'ivoire",
                        "brunei", "phillippines",
                        "cape verde", "swaziland"]
    for country in pycountry.countries:
        if country.name.lower() in name.lower():
            return country.name
        else:
            for country_new in add_countrynames:
                if country_new.lower() in name.lower():
                    return country_new

## 1.2. load datasets

In [39]:
#load test datasets
df_regions = pd.read_excel(file_path,sheet_name='regions')
df_special = pd.read_excel(file_path,sheet_name='special_categories')
df_research = pd.read_excel(file_path,sheet_name='research_areas')

In [40]:
# pick one dataset
# delete spaces 
df_research['Think Tank'] = df_research['Think Tank'].str.strip()

## 2.1: Research Areas

In [41]:
#convert float to int:
#df_research['Rank'] = df_research['Rank'].astype(int)
#df_research_test = df_research.tail(100)

In [42]:
#apply function 1 to check for country-names:
df_research['country'] = df_research['Think Tank'].apply(country_rec)

In [43]:
#apply function 2 to clean think tank names: 
df_research['Think Tank'] = df_research['Think Tank'].apply(clean_org_names)

In [44]:
#show the first 40
df_research.head(40)

Unnamed: 0,Rank,Think Tank,Category,Year,country
0,1.0,brookings institution usa,Top 10 International Development,2008,us
1,2.0,overseas development institute (odi) uk,Top 10 International Development,2008,uk
2,3.0,council on foreign relations usa,Top 10 International Development,2008,us
3,4.0,rand corporation us,Top 10 International Development,2008,us
4,5.0,woodrow wilson international center for scholars - usa,Top 10 International Development,2008,us
5,6.0,institute of development studies uk,Top 10 International Development,2008,uk
6,7.0,center for global development usa,Top 10 International Development,2008,us
7,8.0,international food policy research institute usa,Top 10 International Development,2008,us
8,9.0,german development institute germany,Top 10 International Development,2008,Germany
9,10.0,international policy network uk,Top 10 International Development,2008,uk


In [45]:
#show the last 40 
df_research.tail(40)

Unnamed: 0,Rank,Think Tank,Category,Year,country
7348,,"african centre for water research,south africa",List of Water Security Think Tanks (Alphabetical Listing not ranked),2018,South Africa
7349,,"centre for ecological research and forestry applications (creaf),spain",List of Water Security Think Tanks (Alphabetical Listing not ranked),2018,Spain
7350,,"international water management institute,sri lanka",List of Water Security Think Tanks (Alphabetical Listing not ranked),2018,Sri Lanka
7351,,"stockholm international water institute (siwi),sweden",List of Water Security Think Tanks (Alphabetical Listing not ranked),2018,Sweden
7352,,"fondation prince albert ii de monaco (fpa2),switzerland",List of Water Security Think Tanks (Alphabetical Listing not ranked),2018,Switzerland
7353,,"geneva water hub,switzerland",List of Water Security Think Tanks (Alphabetical Listing not ranked),2018,Switzerland
7354,,"east africa living lakes network,uganda",List of Water Security Think Tanks (Alphabetical Listing not ranked),2018,Uganda
7355,,"cabot institute, university of bristol, houses bristol's water initiative,united kingdom",List of Water Security Think Tanks (Alphabetical Listing not ranked),2018,us
7356,,"cranfield water science institute,united kingdom",List of Water Security Think Tanks (Alphabetical Listing not ranked),2018,United Kingdom
7357,,"environmental change institute, oxford,united kingdom",List of Water Security Think Tanks (Alphabetical Listing not ranked),2018,United Kingdom


In [46]:
df_research.loc[df_research['Category'].str.contains('United States'), 'country']= 'United States'

In [48]:
df_research[df_research['country'].isnull()]

Unnamed: 0,Rank,Think Tank,Category,Year,country
178,22.0,"korea development institute (kdi), (korea)",Top 25 International Development Think Tank,2010,
237,16.0,german institute for international and security aka stifung,Top 25 Security and International Affairs Think Tanks,2010,
256,10.0,ifo institute for economic research,Top 25 Domestic Economic Policy Think Tanks,2010,
287,16.0,"european centre for international political economy (ecipe),",Top 25 International Economic Policy Think Tanks,2010,
290,19.0,centre d'etudes prospectives et d'informations internationales,Top 25 International Economic Policy Think Tanks,2010,
345,24.0,samuel neaman institute for advanced studies in science and,Top 25 Science and Technology Think Tanks,2010,
689,34.0,stimson center (fna henry l,Top 70 Security and International Affairs Think Tanks,2012,
722,69.0,stimson center (fna henry l,Top 70 Security and International Affairs Think Tanks,2012,
728,6.0,united nations university world institute for development economics research (unu-wider),Top 80 International Development Think Tanks,2012,
863,63.0,"instituto de estudios avanzados en desarrollo (inesad), institute for advanced development",Top 70 Environment Think Tanks,2012,


In [49]:
writer = pd.ExcelWriter('data/zwischenstand2.xlsx', engine='xlsxwriter')
df_research.to_excel(writer,sheet_name='research_new')
writer.save()