In [118]:
## Wikipedia Scraping Notebook to retrieve resolutions mentioning the Secretary General of the United Nations
# by Clara Kuper
# April 2021

import requests
import time
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# all UN resolutions hosted on wikipedia
resolutions = range(1,2401)

# prepare a data table
columns = ['Resolution', 'Date', 'Subject', 'Mentions', 'WikiText', 'WikiLink']
SG_df = pd.DataFrame(columns = columns, index = np.arange(0,len(SG_list)))

row = 0

# loop thorugh all resolutions
# prevent a loop when this was already run
try: 
    print(SG_list)
except:
    SG_list = []
    for r in resolutions:
        # time.sleep(5)
        # load the wiki page as text
        x = requests.get('https://en.wikipedia.org/wiki/United_Nations_Security_Council_Resolution_{}'.format(r))        
        # parse to beautiful soup
        soup = BeautifulSoup(x.text, 'html')
        # find the main text
        main_text = soup.find('div',{'class': 'mw-parser-output'})
        try:
            infobox = main_text.find('table', {'class': 'infobox'})
            p = main_text.get_text(' ', strip=True)
            p = p[:p.find('[ edit ]')].strip()
            p = p[:p.find('References')].strip()
            # check if text contains "secretary general"
            SG_contained = 'Secretary-General' in p
            if SG_contained:
                
                # check the sentences where the SG is mentioned
                s = p.split('.')
                f = np.where(['Secretary-General' in s[i] for i in range(0,len(s))])
                
                # if it does, save the URL
                SG_list.append(x.url)
                
                infobox_info = infobox.find_all('td', {'class': 'infobox-data'})
                date = infobox_info[0].text
                subject = infobox_info[3].text
                if ('voted' in subject) or ('Adopted' in subject):
                    subject = infobox_info[2].text
                # and information from the website
                SG_df.loc[row, 'Resolution'] = x.url.split('_')[-1]
                SG_df.loc[row, 'Date'] = date
                SG_df.loc[row, 'Subject'] = subject
                SG_df.loc[row, 'Mentions']= s[f[0][0]:f[0][-1]+1]
                SG_df.loc[row, 'WikiLink'] = x.url
                SG_df.loc[row, 'WikiText'] = p
                
                
                # find the sentence that mentions the SG
                row += 1
                
            if r%100 == 0:
                print(r)
        except:
            pass

        
    # print the list of all URLs
    df = SG_df.dropna(thresh = 2)
    
    # assign topic ids
    topics = np.unique(df.Subject)
    df['topicID'] = np.nan
    df.loc[:,'topicID'] = [np.argwhere(topics == df.Subject[n])[0][0] for n in df.index]
    # How long is this list of URLS?
    print('We found {} entries mentioning the secretary-general'.format(len(SG_list)))

970


In [121]:
# print the dataframe
df

Unnamed: 0,Resolution,Date,Subject,Mentions,WikiText,WikiLink
0,35,October 3 1947,The Indonesian question,[ 207 Code S/574 ( Document ) Subject The Indo...,UN Security Council Resolution 35 Date October...,https://en.wikipedia.org/wiki/United_Nations_S...
1,44,April 1 1948,The Palestine question,"[ 277 Code S/714, II ( Document ) Subject The ...",UN Security Council Resolution 44 Date April 1...,https://en.wikipedia.org/wiki/United_Nations_S...
2,52,June 22 1948,Atomic Energy: International Control,[ 325 Code S/852 ( Document ) Subject Atomic E...,UN Security Council Resolution 52 Date June 22...,https://en.wikipedia.org/wiki/United_Nations_S...
3,54,July 15 1948,The Palestine Question,[ The Resolution instructed the United Nation...,UN Security Council Resolution 54 Date July 15...,https://en.wikipedia.org/wiki/United_Nations_S...
4,57,18 September 1948,The Palestine Question,[ Shocked by the death of Count Folke Bernadot...,UN Security Council Resolution 57 Folke Bernad...,https://en.wikipedia.org/wiki/United_Nations_S...
...,...,...,...,...,...,...
956,2063,31 July 2012,Reports of the Secretary-General on the Sudan,"[ 6,819 Code S/RES/2063 ( Document ) Subject R...",UN Security Council Resolution 2063 Map of Dar...,https://en.wikipedia.org/wiki/United_Nations_S...
957,2065,12 September 2012,The situation in Sierra Leone,[ It also asked UNIPSIL to assist conflict-pr...,UN Security Council Resolution 2065 Street in ...,https://en.wikipedia.org/wiki/United_Nations_S...
958,2113,30 July 2013,Reports of the Secretary-General on the Sudan,[ 7013 Code S/RES/2113 ( Document ) Subject Re...,UN Security Council Resolution 2113 Date 30 Ju...,https://en.wikipedia.org/wiki/United_Nations_S...
959,2167,28 July 2014,On enhancing the relationship between the Unit...,[ The resolution originated through a debate i...,UN Security Council Resolution 2167 UN Securit...,https://en.wikipedia.org/wiki/United_Nations_S...


In [123]:
# save the dataframe
df.to_csv('SG_list.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [75]:
# s = p.split('.')
# f = np.where(['Secretary-General' in s[i] for i in range(0,len(s))])
# s[f[0][0]:f[0][-1]+1]

'\xa0China\xa0France\xa0Russia\xa0United Kingdom\xa0United States'

'18 September 1948'