## Data-Cleaning and Analysis: Go to think tank index

In [3]:
import pandas as pd
import pycountry
import xlsxwriter

In [4]:
pd.options.display.max_colwidth = 100

In [5]:
file_path = 'data/macgann_thinktank_index_neu.xlsx'

## 1.1: functions

In [7]:
# function to encode the data: 
def clean_org_names(think_tank):
    dicts = {"Ã³": "ó",
             "â€“": "",
             "Ã§Ãµ": "çõ",
             "Ã§Ã£": "çã",
             "Ãº" : "ú",
             "Ã¡" : "á",
             "Å¡": "š",
             "Å¾" : "ž",
             "â€™" : "'",
             "Ã©" : "e",
             "Ã‰" : "é",
             "ã´": "ô"}
    for key, val in dicts.items():
        think_tank = think_tank.lower()
        think_tank = think_tank.replace(key.lower(),val)
    return think_tank



In [20]:
#country_rec: function to recognize country names in strings
def country_rec(name):
    add_countrynames = ["Palestine", "Jerusalem", 
                        "Russia", "Czech Republic", 
                        "Moldova", "Macedonia", 
                        "Republic of Korea", "Kosovo",
                        "Vietnam", "Taiwan",
                        "Bolivia", "Tanzania",
                        "Iran", "Brazil",
                        "uk", "venezuela",
                        "perú", "us",
                        "côte d'ivoire", "uae",
                        "south korea", "cote d'ivoire",
                        "brunei", "phillippines",
                        "cape verde", "swaziland"]
    for country in pycountry.countries:
        if country.name.lower() in name.lower():
            return country.name
        else:
            for country_new in add_countrynames:
                if country_new.lower() in name.lower():
                    return country_new

## 1.2. load datasets

In [21]:
#load test datasets
df_regions = pd.read_excel(file_path,sheet_name='regions')
df_special = pd.read_excel(file_path,sheet_name='special_categories')
df_research = pd.read_excel(file_path,sheet_name='research_areas')

In [22]:
# pick one dataset
# delete spaces 
df_special['Think tank'] = df_special['Think tank'].str.strip()

In [23]:
df_special[df_special['Think tank'].isnull()]

Unnamed: 0,Rank,Think tank,Category,Year
9585,9.0,,Think Tanks with the Most Significant Impact on Public Policy,2018


In [24]:
df_special = df_special.dropna(subset=['Think tank'])

## 2.1: Special Categories

In [25]:
#convert float to int:
#df_special['Rank'] = df_special['Rank'].astype(int)
#df_special_test = df_special.tail(100)

In [26]:
#apply function 1 to check for country-names:
df_special['country'] = df_special['Think tank'].apply(country_rec)

In [27]:
#apply function 2 to clean think tank names: 
df_special['Think tank'] = df_special['Think tank'].apply(clean_org_names)

In [28]:
#show the first 40
df_special.head(40)

Unnamed: 0,Rank,Think tank,Category,Year,country
0,1.0,cato institute usa,Think Tanks with the Most Innovative Policy/Idea Proposal,2008,us
1,2.0,brookings institute usa,Think Tanks with the Most Innovative Policy/Idea Proposal,2008,us
2,3.0,carnegie endowment for international peace usa,Think Tanks with the Most Innovative Policy/Idea Proposal,2008,us
3,1.0,european council on foreign relations belgium,Best New Think Tank (established in the last three-five years),2008,Belgium
4,2.0,bruegel belgium,Best New Think Tank (established in the last three-five years),2008,Belgium
5,3.0,center for american progress usa,Best New Think Tank (established in the last three-five years),2008,us
6,1.0,brookings institution usa,Outstanding Policy Oriented - Public Policy Research Program,2008,us
7,2.0,peterson institute for international economics usa,Outstanding Policy Oriented - Public Policy Research Program,2008,us
8,3.0,rand corporation usa,Outstanding Policy Oriented - Public Policy Research Program,2008,us
9,1.0,council on foreign relations usa,Best Use of the Internet to Engage the Public,2008,us


In [29]:
#show the last 40 
df_special.tail(40)

Unnamed: 0,Rank,Think tank,Category,Year,country
10027,35.0,regional centre for strategic studies (sri lanka),Best Regional Studies Center (Free Standing),2018,Sri Lanka
10028,36.0,organization for social science research in eastern and southern africa (ethiopia),Best Regional Studies Center (Free Standing),2018,Ethiopia
10029,37.0,ghana center for democratic development (ghana),Best Regional Studies Center (Free Standing),2018,Ghana
10030,38.0,russian institute of europe (russia),Best Regional Studies Center (Free Standing),2018,Russia
10031,39.0,india center for the study of developing societies (india),Best Regional Studies Center (Free Standing),2018,India
10032,40.0,sheikh saud bin saqr al qasimi foundation for policy research (united arab emirates),Best Regional Studies Center (Free Standing),2018,United Arab Emirates
10033,41.0,hammurabi for research & strategic studies (iraq),Best Regional Studies Center (Free Standing),2018,Iraq
10034,42.0,kyrgystan institute for regional studies (kyrgyzstan),Best Regional Studies Center (Free Standing),2018,Kyrgyzstan
10035,1.0,brookings institution-tsinghua center for public policy (btc) (china),Best Regional Studies Center (University-Affiliated),2018,China
10036,2.0,"european institute, london school of economics and political science (united kingdom)",Best Regional Studies Center (University-Affiliated),2018,United Kingdom


In [31]:
df_special[df_special['country'].isnull()]

Unnamed: 0,Rank,Think tank,Category,Year,country
90,14.0,"casablanca institute, (morrocco)",Best New Think Tanks (Established in the last 18 months),2010,
153,12.0,peterson institute for international economics (fna) institute for,Best External Relations / Public Engagement Program,2010,
308,17.0,audace institut afrique (aia) cote d'ivoire,Best New Think Tanks (Established in the last 18 months),2011,
363,22.0,new america foundation untied states,Think Tanks with the Best Use of the Internet or Social Media to engage the Public,2011,
626,10.0,"audace institut afrique (aia) (ivory coast),,",Best New Think Tanks,2012,
722,36.0,"centro de divulgación del conocimiento económico para la libertad (cedice libertad),,",Think Tanks with the Best Use of the Internet or Social Media,2012,
785,24.0,centro de implementación de polã­ticas públicas para la equidad y el crecimiento (cippec),Think Tanks with the Best External Relations/Public Engagement Program,2012,
859,48.0,centro de divulgación del conocimiento económico para la libertad (cedice libertad),Think Tanks with the Most Significant Impact on Public Policy,2012,
884,8.0,"center for transatlantic relations (ctr), school of advanced international studies (sais), johns",Best University Affiliated Think Tanks,2012,
913,37.0,"centre for security, economics, technology (c set), university of st",Best University Affiliated Think Tanks,2012,


In [32]:
writer = pd.ExcelWriter('data/zwischenstand3.xlsx', engine='xlsxwriter')
df_special.to_excel(writer,sheet_name='special_new')
writer.save()