- Study project on web scraping: gathering data about colonialism from Wikipedia articles
- The gathered data will be used in a map-plotting study project

In [1]:
from table_scraping import *

# "Countries" dataframe

In [2]:
# The UN member-state list is used as the list of countries in the world.
url = 'https://en.wikipedia.org/wiki/Member_states_of_the_United_Nations'

# On this page the header cells are <th scope="col"> and the first-column
# cells are <th scope="row">; presumably that is why the cell tags and the
# header attributes are passed explicitly to get_dataframe.
un_members_table = get_tables(url)[0]
data = get_dataframe(un_members_table, ['td', 'th'], {'scope': 'col'})

# Empty placeholder columns, filled in region by region further down.
for placeholder in ('Independence', 'Independence From'):
    data[placeholder] = None

# Keep only the country name (exposed as 'Country') plus the placeholders.
data = (
    data
    .drop(columns=['Date of admission', 'Original member', 'See also'])
    .rename(columns={"Member state": 'Country'})
)
data


Unnamed: 0,Country,Independence,Independence From
0,Afghanistan,,
1,Albania,,
2,Algeria,,
3,Andorra,,
4,Angola,,
...,...,...,...
188,Bolivarian Republic of Venezuela,,
189,Viet Nam,,
190,Yemen,,
191,Zambia,,


In [3]:
# Replace some official UN names with their common short names.
# Each entry is (pattern looked up in the current name, short name to assign).
# The entries are applied in order, and the order matters: North Korea must be
# renamed before the "Republic of Korea" pattern runs, because
# "Democratic People's Republic of Korea" also contains "Republic of Korea".
short_names = [
    ("Bolivia", 'Bolivia'),                   # Plurinational State of Bolivia
    ("Brunei", 'Brunei'),                     # Brunei Darussalam
    ("Democratic People's", 'North Korea'),   # Democratic People's Republic of Korea
    ("Iran", 'Iran'),                         # Islamic Republic of Iran
    ("Lao ", 'Laos'),                         # Lao People's Democratic Republic
    ("Republic of Korea", 'South Korea'),     # Republic of Korea
    ("Moldova", 'Moldova'),                   # Republic of Moldova
    ("Russian", 'Russia'),                    # Russian Federation
    ("Syrian", 'Syria'),                      # Syrian Arab Republic
    ("United Kingdom", 'United Kingdom'),     # United Kingdom of Great Britain and Northern Ireland
    ("Tanzania", 'Tanzania'),                 # United Republic of Tanzania
    ("Venezuela", 'Venezuela'),               # Bolivarian Republic of Venezuela
]

for pattern, short_name in short_names:
    matches = data['Country'].str.contains(pattern)
    data.loc[matches, 'Country'] = short_name

data


Unnamed: 0,Country,Independence,Independence From
0,Afghanistan,,
1,Albania,,
2,Algeria,,
3,Andorra,,
4,Angola,,
...,...,...,...
188,Venezuela,,
189,Viet Nam,,
190,Yemen,,
191,Zambia,,


# Africa

In [4]:
# Wikipedia article on the decolonisation of Africa; the table scraped in the next cell comes from this page
url = 'https://en.wikipedia.org/wiki/Decolonisation_of_Africa'

In [5]:
# Of the tables found on the page, the one at index 6 holds the sovereignty
# dates and colonial powers of the African countries.
africa_tables = get_tables(url)
df = get_dataframe(africa_tables[6])
df

Unnamed: 0,Country,Date of acquisition of sovereignty,Acquisition of sovereignty
0,Algeria,3 July 1962,French recognition of Algerian referendum on i...
1,Angola,11 November 1975,Independence from Portugal
2,Benin,1 August 1960,Independence from France
3,Botswana,30 September 1966,Independence from the United Kingdom
4,Burkina Faso,5 August 1960,Independence from France
...,...,...,...
58,Uganda,1 March 1962,Self-government granted
59,Uganda,9 October 1962,Independence from the United Kingdom
60,Zambia,24 October 1964,Independence from the United Kingdom
61,Zimbabwe,11 November 1965,Unilateral declaration of independence by Sout...


In [6]:
# Clean the African decolonisation table so it lines up with the 'data'
# dataframe: same column names, one row per country, a year instead of a full
# date, and a colonial power instead of a descriptive sentence.

# Rename columns to fit with the 'data' dataframe
df = df.rename(columns={"Date of acquisition of sovereignty": 'Independence',
                       'Acquisition of sovereignty': 'Independence From'})

# Reduce the date to its year: the third whitespace-separated token
# of a "D Month YYYY" date
df['Independence'] = df['Independence'].str.split(expand=True)[2]

# A country can appear several times; keep only its last entry and renumber the
# rows, so that the positional indices used below refer to the deduplicated table
df = df.drop_duplicates(
    ['Country'], keep='last').reset_index(drop=True)

# Strip the boilerplate wording so that, for the simple cases, only the colonial
# power remains
df['Independence From'] = df['Independence From'].str.replace(
    "Independence From|the| declared| recognized", "", case=False, regex=True)

# Column positions used by the manual corrections below.
# Every correction is written as a single df.iloc[row, column] assignment:
# the chained form df.iloc[row]['column'] = value assigns to an intermediate
# Series and is not guaranteed to modify df (with pandas Copy-on-Write it never does).
year_col = df.columns.get_loc('Independence')
power_col = df.columns.get_loc('Independence From')

# Corrections (row numbers are positions in the deduplicated table)
# Algeria, Madagascar, Morocco
df.iloc[[0, 29, 34], power_col] = 'France'

# Egypt, South Africa, Sudan, Tanzania, Zimbabwe
df.iloc[[14, 45, 47, 48, 53], power_col] = 'United Kingdom'

# Ethiopia (the literal string 'None', not a missing value)
df.iloc[18, power_col] = 'None'

# Liberia (Colonized by "American Colonization Society")
df.iloc[27, power_col] = 'United States'

# Libya, Somalia
df.iloc[[28, 44], power_col] = 'Italy'

# Namibia
df.iloc[36, power_col] = 'South Africa'

# South Sudan
df.iloc[46, power_col] = 'Sudan'

# Dates
# NOTE(review): the years assigned here are ints, while the scraped years
# produced by str.split above are strings - confirm this mix is intended.

# Libya (Declared: 1947 | Kingdom established: 1951) - override disabled, scraped year kept
# df.iloc[28, year_col] = 1947

# Malawi (Dominion: 1964 | Republic: 1966)
df.iloc[30, year_col] = 1966

# Mauritius (Elizabeth II as head of state: 1968 - 1992 | Republic: 1992) - override disabled
# df.iloc[33, year_col] = 1992

# Morocco (End of the French Protectorate announced: 1955 | Declaration: 1956)
df.iloc[34, year_col] = 1956

# Sierra Leone (Dominion: 1961 | Republic: 1971)
df.iloc[43, year_col] = 1971

# South Africa (Statute of Westminster: 1931 | Republic: 1961)
df.iloc[45, year_col] = 1961

# Tanzania (Tanganyika: 1961 | Zanzibar: 1963 | Merger: 1964) - override disabled
# df.iloc[48, year_col] = 1964

# Uganda (Dominion: 1962 | Republic: 1963)
df.iloc[51, year_col] = 1963

df


Unnamed: 0,Country,Independence,Independence From
0,Algeria,1962.0,France
1,Angola,1975.0,Portugal
2,Benin,1960.0,France
3,Botswana,1966.0,United Kingdom
4,Burkina Faso,1960.0,France
5,Burundi,1962.0,Belgium
6,Cabo Verde,1975.0,Portugal
7,Cameroon,1960.0,France
8,Central African Republic,1960.0,France
9,Chad,1960.0,France


In [7]:
# Show the raw 'Country' value (column 0) of the last row to check the scraped name for stray characters
df.iloc[-1, 0]

'\xa0Zimbabwe'

In [8]:
# Remove the non-breaking space ('\xa0') from every country name,
# then re-check the last row.
country_names = df['Country']
df['Country'] = country_names.str.replace('\xa0', '')
df.iloc[-1, 0]


'Zimbabwe'

In [9]:
# Hand the cleaned African table to map_fill, keyed on 'Country'.
# Presumably map_fill copies the listed columns from df into the matching
# rows of data - verify against table_scraping.
filled_columns = ['Independence', 'Independence From']
map_fill(data, df, 'Country', filled_columns)

data


Unnamed: 0,Country,Independence,Independence From
0,Afghanistan,,
1,Albania,,
2,Algeria,1962,France
3,Andorra,,
4,Angola,1975,Portugal
...,...,...,...
188,Venezuela,,
189,Viet Nam,,
190,Yemen,,
191,Zambia,1964,United Kingdom


# America

In [10]:
# Wikipedia article on the decolonization of the Americas; the table scraped in the next cell comes from this page
url = 'https://en.wikipedia.org/wiki/Decolonization_of_the_Americas'


In [11]:
# Take the first table on the page and keep only the country, its colonial
# power and its independence date, ordered alphabetically by country.
americas_table = get_dataframe(get_tables(url)[0])
kept_columns = ['Country', 'Colonial power', 'Independence date']
df = americas_table[kept_columns]
df = df.sort_values('Country').reset_index(drop=True)
df


Unnamed: 0,Country,Colonial power,Independence date
0,Antigua and Barbuda,United Kingdom,"November 1, 1981"
1,Argentina,Spanish Empire,"May 25, 1810 and July 9, 1816"
2,Bahamas,United Kingdom,"July 10, 1973"
3,Barbados,United Kingdom,"November 30, 1966"
4,Belize,United Kingdom,"September 21, 1981"
5,Bolivia,Spanish Empire,"August 6, 1825"
6,Brazil,Portuguese Empire,"August 29, 1825"
7,Canada,United Kingdom,"July 1, 1867"
8,Chile,Spanish Empire,"February 12, 1818"
9,Colombiaas part of Gran Colombia,Spanish Empire,"August 7, 1819"


In [12]:
# Show the raw 'Country' value (column 0) of the first row to check the scraped name for stray characters
df.iloc[0, 0]


'\xa0Antigua and Barbuda'

In [13]:
# Remove the non-breaking space ('\xa0') from every country name,
# then re-check the first row.
country_names = df['Country']
df['Country'] = country_names.str.replace('\xa0', '')
df.iloc[0, 0]


'Antigua and Barbuda'

In [14]:
# Align the column names with the 'data' dataframe.
column_names = {"Independence date": 'Independence',
                'Colonial power': 'Independence From'}
df = df.rename(columns=column_names)

# Keep only the year: the third whitespace-separated token of a
# "Month D, YYYY" date (for entries listing two dates this is the first year).
date_tokens = df['Independence'].str.split(expand=True)
df['Independence'] = date_tokens[2]

# Row 13 is the second Dominican Republic independence entry; drop it and renumber.
df = df.drop(index=13)
df = df.reset_index(drop=True)
df

Unnamed: 0,Country,Independence From,Independence
0,Antigua and Barbuda,United Kingdom,1981
1,Argentina,Spanish Empire,1810
2,Bahamas,United Kingdom,1973
3,Barbados,United Kingdom,1966
4,Belize,United Kingdom,1981
5,Bolivia,Spanish Empire,1825
6,Brazil,Portuguese Empire,1825
7,Canada,United Kingdom,1867
8,Chile,Spanish Empire,1818
9,Colombiaas part of Gran Colombia,Spanish Empire,1819
