- Study project of web scraping by gathering data about colonialism using Wikipedia articles
- Gathered data will be used for a data visualization study project

In [1]:
from table_scraping import *

# "Countries" dataframe

In [2]:
# Using the list of UN members as list of countries in the world
url = 'https://en.wikipedia.org/wiki/Member_states_of_the_United_Nations'

# Other possible source (Tables by continent)
# url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_by_date_of_formation'

# Header for this page: <th scope="col"> | Cells in first column: <th scope="row">
data = get_dataframe(get_tables(url)[0], ['td', 'th'], {'scope' : 'col'})
data['Independence'] = np.nan
data['Independence From'] = np.nan
data['Main Colonial Power'] = np.nan
data['Continent'] = np.nan  

# Finishing base dataframe
data = data.drop(['Date of admission', 'Original member', 'See also'], axis=1)
data = data.rename(columns={"Member state": 'Country'})
data


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Afghanistan,,,,
1,Albania,,,,
2,Algeria,,,,
3,Andorra,,,,
4,Angola,,,,
...,...,...,...,...,...
188,Bolivarian Republic of Venezuela,,,,
189,Viet Nam,,,,
190,Yemen,,,,
191,Zambia,,,,


In [3]:
# Adding observer states
data = data.append({'Country': 'Vatican City'}, ignore_index=True)
data = data.append({'Country': 'Palestine'}, ignore_index=True)

data = data.sort_values('Country').reset_index(drop=True)


In [4]:
# Renaming some countries
# Plurinational State of Bolivia
data.loc[data['Country'].str.contains(
    "Bolivia"), 'Country'] = 'Bolivia'  
# Brunei Darussalam
data.loc[data['Country'].str.contains(  
    "Brunei"), 'Country'] = 'Brunei'
# Congo
data.iloc[38, 0] = 'Republic of the Congo'
# Democratic People's Republic of Korea
data.loc[data['Country'].str.contains(  
    "Democratic People's"), 'Country'] = 'North Korea'
# Islamic Republic of Iran
data.loc[data['Country'].str.contains(  
    "Iran"), 'Country'] = 'Iran'
# Lao People's Democratic Republic
data.loc[data['Country'].str.contains(  
    "Lao "), 'Country'] = 'Laos'
# Republic of Korea
data.loc[data['Country'].str.contains(  
    "Republic of Korea"), 'Country'] = 'South Korea'
# Republic of Moldova
data.loc[data['Country'].str.contains(  
    "Moldova"), 'Country'] = 'Moldova'
# Russian Federation
data.loc[data['Country'].str.contains(  
    "Russian"), 'Country'] = 'Russia'
# Syrian Arab Republic
data.loc[data['Country'].str.contains(  
    "Syrian"), 'Country'] = 'Syria'
# United Kingdom of Great Britain and Northern Ireland
data.loc[data['Country'].str.contains(  
    "United Kingdom"), 'Country'] = 'United Kingdom'
# United Republic of Tanzania
data.loc[data['Country'].str.contains(  
    "Tanzania"), 'Country'] = 'Tanzania'
# United States of America
data.loc[data['Country'].str.contains(
    "United States"), 'Country'] = 'United States'
# Bolivarian Republic of Venezuela
data.loc[data['Country'].str.contains(  
    "Venezuela"), 'Country'] = 'Venezuela'
# Viet Nam
data.loc[data['Country'].str.contains(
    "Viet"), 'Country'] = 'Vietnam'

data


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Afghanistan,,,,
1,Albania,,,,
2,Algeria,,,,
3,Andorra,,,,
4,Angola,,,,
...,...,...,...,...,...
190,Vatican City,,,,
191,Vietnam,,,,
192,Yemen,,,,
193,Zambia,,,,


# Africa

In [5]:
url = 'https://en.wikipedia.org/wiki/Decolonisation_of_Africa'

In [6]:
# Gets table containing the dates and colonial powers of african countries
df = get_dataframe(get_tables(url)[6])

#
df['Continent'] = 'Africa'

# Cleaning column 'Country'
df['Country'] = df['Country'].str.replace('\xa0', '')

df


Unnamed: 0,Country,Date of acquisition of sovereignty,Acquisition of sovereignty,Continent
0,Algeria,3 July 1962,French recognition of Algerian referendum on i...,Africa
1,Angola,11 November 1975,Independence from Portugal,Africa
2,Benin,1 August 1960,Independence from France,Africa
3,Botswana,30 September 1966,Independence from the United Kingdom,Africa
4,Burkina Faso,5 August 1960,Independence from France,Africa
...,...,...,...,...
58,Uganda,1 March 1962,Self-government granted,Africa
59,Uganda,9 October 1962,Independence from the United Kingdom,Africa
60,Zambia,24 October 1964,Independence from the United Kingdom,Africa
61,Zimbabwe,11 November 1965,Unilateral declaration of independence by Sout...,Africa


In [7]:
# Rename columns to fit with the 'data' dataframe
df = df.rename(columns={"Date of acquisition of sovereignty": 'Independence',
                        'Acquisition of sovereignty': 'Independence From'})

# Independence from date to just year
df['Independence'] = df['Independence'].str.split(expand=True)[2]

# Removes duplicates
df = df.drop_duplicates(
    ['Country'], keep='last').reset_index(drop=True)

# Get some colonial powers
df['Independence From'] = df['Independence From'].str.replace(
    "Independence From|the|declared|recognized", "", case=False, regex=True)

df['Independence From'] = df['Independence From'].str.strip()


In [8]:
# Corrections
# Names
# Democratic Republic of Congo -> Democratic Republic of the Congo
df.iloc[11, 0] = "Democratic Republic of the Congo"

# Republic of Congo -> Republic of the Congo
df.iloc[12, 0] = "Republic of the Congo"

# Ivory Coast -> Côte d'Ivoire
df.iloc[24, 0] = "Côte d'Ivoire"

# Independence From
# Algeria, Madagascar, Morocco
df.iloc[[0, 29, 34], 2] = 'France'

# Egypt, South Africa, Sudan, Tanzania, Zimbabwe
df.iloc[[14, 45, 47, 48, 53], 2] = 'United Kingdom'

# Ethiopia
df.iloc[18, [1, 2]] = 'Not Colonized'

# Liberia (Colonized by "American Colonization Society")
df.iloc[27]['Independence From'] = 'United States'

# Libya, Somalia
df.iloc[[28, 44], 2] = 'Italy'

# Namibia
df.iloc[36]['Independence From'] = 'South Africa'

# South Sudan
df.iloc[46]['Independence From'] = 'Sudan'

# Dates
# Libya (Declared: 1947 | Kingdom established: 1951)
# df.iloc[28]['Independence'] = 1947

# Malawi (Dominion: 1964 | Republic: 1966)
df.iloc[30]['Independence'] = 1966

# Mauritius (Elizabeth II as head of state: 1968 - 1992 | Republic: 1992)
# df.iloc[33]['Independence'] = 1992

# Morocco (End of the French Protectorate announced: 1955 | Declaration: 1956)
df.iloc[34]['Independence'] = 1956

# Sierra Leone (Dominion: 1961 | Republic: 1971)
df.iloc[43]['Independence'] = 1971

# South Africa (Statute of Westminster: 1931 | Republic: 1961)
df.iloc[45]['Independence'] = 1961

# Tanzania (Tanganyika: 1961 | Zanzibar: 1963 | Merger: 1964)
# df.iloc[48]['Independence'] = 1964

# Uganda (Dominion: 1962 | Republic: 1963)
df.iloc[51]['Independence'] = 1963

df


Unnamed: 0,Country,Independence,Independence From,Continent
0,Algeria,1962,France,Africa
1,Angola,1975,Portugal,Africa
2,Benin,1960,France,Africa
3,Botswana,1966,United Kingdom,Africa
4,Burkina Faso,1960,France,Africa
5,Burundi,1962,Belgium,Africa
6,Cabo Verde,1975,Portugal,Africa
7,Cameroon,1960,France,Africa
8,Central African Republic,1960,France,Africa
9,Chad,1960,France,Africa


In [9]:
# Filling the main dataframe with collected data
map_fill(data, df, 'Country', ['Independence', 'Independence From', 'Continent'])

data


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Afghanistan,,,,
1,Albania,,,,
2,Algeria,1962,France,,Africa
3,Andorra,,,,
4,Angola,1975,Portugal,,Africa
...,...,...,...,...,...
190,Vatican City,,,,
191,Vietnam,,,,
192,Yemen,,,,
193,Zambia,1964,United Kingdom,,Africa


## Main Colonial Power

In [10]:
# Gets table containing the dates and colonial powers of african countries
df = get_dataframe(get_tables(url)[0])[['Country', 'Colonial power']]
df = df.rename(columns={'Colonial power': 'Main Colonial Power'})

# Cleaning column 'Country'
df['Country'] = df['Country'].str.replace('\xa0', '')

df


Unnamed: 0,Country,Main Colonial Power
0,Liberia,United States
1,South Africa,United Kingdom
2,Egypt,United Kingdom
3,Ethiopian Empire,Kingdom of Italy United Kingdom
4,Eritrea,Italy
5,Emirate of Cyrenaica,United Kingdom
6,United Kingdom of Libya,United Kingdom French Fourth Republic Emirate...
7,Libya,Italy United Kingdom
8,Sudan,United Kingdom[q] Republic of Egypt
9,South Sudan,United Kingdom[q] Republic of Egypt


In [11]:
# Dropping unrecognized countries and countries that no longer exist
df = df.drop([5, 6, 21, 39, 57]).reset_index(drop=True)


In [12]:
df

Unnamed: 0,Country,Main Colonial Power
0,Liberia,United States
1,South Africa,United Kingdom
2,Egypt,United Kingdom
3,Ethiopian Empire,Kingdom of Italy United Kingdom
4,Eritrea,Italy
5,Libya,Italy United Kingdom
6,Sudan,United Kingdom[q] Republic of Egypt
7,South Sudan,United Kingdom[q] Republic of Egypt
8,Tunisia,France United Kingdom
9,Morocco,France Spain


In [13]:
# Corrections
# Names
df.iloc[3]['Country'] = 'Ethiopia'
df.iloc[22]['Country'] = "Côte d'Ivoire"
df.iloc[30]['Country'] = 'Tanzania'
df.iloc[38]['Country'] = 'Gambia'
df.iloc[39]['Country'] = 'Zimbabwe'
df.iloc[47]['Country'] = 'Cabo Verde'

# Powers
# Ethiopia
df.iloc[3]['Main Colonial Power'] = 'Not Colonized'
# Libya and Somalia
df.iloc[[5, 18], 1] = 'Italy'

# Sudan, South Sudan, and Namibia
df.iloc[[6, 7, 53], 1] = 'United Kingdom'

# Tunisia and Morocco
df.iloc[[8, 9], 1] = 'France'

# Cameroon, Burundi, and Ruanda
df.iloc[[12, 31, 32], 1] = 'Germany'

df['Main Colonial Power'] = df['Main Colonial Power'].str.strip()

df


Unnamed: 0,Country,Main Colonial Power
0,Liberia,United States
1,South Africa,United Kingdom
2,Egypt,United Kingdom
3,Ethiopia,Not Colonized
4,Eritrea,Italy
5,Libya,Italy
6,Sudan,United Kingdom
7,South Sudan,United Kingdom
8,Tunisia,France
9,Morocco,France


In [14]:
# Filling main dataframe
map_fill(data, df, 'Country', ['Main Colonial Power'])
data

Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Afghanistan,,,,
1,Albania,,,,
2,Algeria,1962,France,France,Africa
3,Andorra,,,,
4,Angola,1975,Portugal,Portugal,Africa
...,...,...,...,...,...
190,Vatican City,,,,
191,Vietnam,,,,
192,Yemen,,,,
193,Zambia,1964,United Kingdom,United Kingdom,Africa


# America

In [15]:
url = 'https://en.wikipedia.org/wiki/Decolonization_of_the_Americas'


In [16]:
# Gets table containing the dates and colonial powers of american countries
df = get_dataframe(get_tables(url)[0])[
    ['Country', 'Colonial power', 'Independence date']]
df['Independence From'] = np.nan
df =  df.sort_values('Country').reset_index(drop=True)

# Six continents model
df['Continent'] = 'America'

# Cleaning column 'Country'
df['Country'] = df['Country'].str.replace('\xa0', '')

df


Unnamed: 0,Country,Colonial power,Independence date,Independence From,Continent
0,Antigua and Barbuda,United Kingdom,"November 1, 1981",,America
1,Argentina,Spanish Empire,"May 25, 1810; July 9, 1816",,America
2,Bahamas,United Kingdom,"July 10, 1973",,America
3,Barbados,United Kingdom,"November 30, 1966",,America
4,Belize,United Kingdom,"September 21, 1981",,America
5,Bolivia,Spanish Empire,"August 6, 1825",,America
6,Brazil,Portuguese Empire,"August 29, 1825",,America
7,Canada,United Kingdom,"July 1, 1867",,America
8,Chile,Spanish Empire,"February 12, 1818",,America
9,Colombiaas part of Gran Colombia,Spanish Empire,"August 7, 1819",,America


In [17]:
# Rename columns to fit with the 'data' dataframe
df = df.rename(columns={"Independence date": 'Independence',
                                  'Colonial power': 'Main Colonial Power'})

# Independence from date to just year
df['Independence'] = df['Independence'].str.split(expand=True)[2]
df['Independence'] = df['Independence'].str.replace(';', '')

# Dropping second Dominican Republic independence
df = df.drop(13).reset_index(drop=True)

df


Unnamed: 0,Country,Main Colonial Power,Independence,Independence From,Continent
0,Antigua and Barbuda,United Kingdom,1981,,America
1,Argentina,Spanish Empire,1810,,America
2,Bahamas,United Kingdom,1973,,America
3,Barbados,United Kingdom,1966,,America
4,Belize,United Kingdom,1981,,America
5,Bolivia,Spanish Empire,1825,,America
6,Brazil,Portuguese Empire,1825,,America
7,Canada,United Kingdom,1867,,America
8,Chile,Spanish Empire,1818,,America
9,Colombiaas part of Gran Colombia,Spanish Empire,1819,,America


In [18]:
# Obtaining the correct name for some countries 
df['Country'] = df['Country'].str.replace(
    "as part of|Federal Republic of Central America|Gran Colombia", "", case=False, regex=True)

df['Country'] = df['Country'].str.strip()

# Get some colonial powers
df['Main Colonial Power'] = df['Main Colonial Power'].str.replace(
    " Empire", "", case=False, regex=True)

df['Main Colonial Power'] = df['Main Colonial Power'].str.strip()

df['Main Colonial Power'] = df['Main Colonial Power'].str.replace(
    "British", "United Kingdom", case=False)
df['Main Colonial Power'] = df['Main Colonial Power'].str.replace(
    "Spanish", "Spain", case=False)
df['Main Colonial Power'] = df['Main Colonial Power'].str.replace(
    "Portuguese", "Portugal", case=False)

df['Main Colonial Power'] = df['Main Colonial Power'].str.strip()


In [19]:
# Corrections
df['Independence From'] = df['Main Colonial Power']

# Argentina (Declared)
df.iloc[1, 2] = 1816

# Bolivia (Declared: 1825 | Recognized: 1847)
# df.iloc[5, 2] = 1825

# Brazil (Declared: 1822 | Recognized: 1825)
df.iloc[6, 2] = 1822

# Canada (Confederation: 1867 | Westminster: 1931 | Patriation: 1982)
df.iloc[7, 2] = 1982

# Chile (Declared: 1818 | Recognized: 1844)
# df.iloc[8, 2] = 1818

# Colombia (Declared: 1810 | Recognized: 1819)
df.iloc[9, 2] = 1810

# Cuba
df.iloc[11, 2] = 1902
df.iloc[11, 3] = 'United States'

# Dominican Republic (4 times)
# From Spain in 1821 | From Haiti in 1844
# From Spain in 1865 | From USA in 1924
# df.iloc[13, 3] = 'Haiti'
# df.iloc[13, 3] = 'United States'

# Haiti
# Occupied by USA from 1915 to 1934
df.iloc[19, 1] = 'France'
df.iloc[19, 3] = 'France'

# Panama (From Spain: 1821 | From Colombia: 1903)
df.iloc[24, 2] = 1903
df.iloc[24, 3] = 'Colombia'

# Uruguai
df.iloc[33, 1] = 'Spain'
df.iloc[33, 3] = 'Brazil'

df


Unnamed: 0,Country,Main Colonial Power,Independence,Independence From,Continent
0,Antigua and Barbuda,United Kingdom,1981,United Kingdom,America
1,Argentina,Spain,1816,Spain,America
2,Bahamas,United Kingdom,1973,United Kingdom,America
3,Barbados,United Kingdom,1966,United Kingdom,America
4,Belize,United Kingdom,1981,United Kingdom,America
5,Bolivia,Spain,1825,Spain,America
6,Brazil,Portugal,1822,Portugal,America
7,Canada,United Kingdom,1982,United Kingdom,America
8,Chile,Spain,1818,Spain,America
9,Colombia,Spain,1810,Spain,America


In [20]:
# Filling main dataframe
map_fill(data, df, 'Country', [
         'Independence', 'Independence From', 'Main Colonial Power', 'Continent'])

data


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Afghanistan,,,,
1,Albania,,,,
2,Algeria,1962,France,France,Africa
3,Andorra,,,,
4,Angola,1975,Portugal,Portugal,Africa
...,...,...,...,...,...
190,Vatican City,,,,
191,Vietnam,,,,
192,Yemen,,,,
193,Zambia,1964,United Kingdom,United Kingdom,Africa


# Asia

In [21]:
url = 'https://en.wikipedia.org/wiki/Decolonisation_of_Asia'


In [22]:
# Gets table containing the dates and colonial powers of asian countries
df = get_dataframe(get_tables(url)[0])

# Rename columns to fit with the 'data' dataframe
df['Main Colonial Power'] = None
df = df.rename(columns={"Date of acquisition of sovereignty": 'Independence',
                        'Acquisition of sovereignty': 'Independence From'})

# 
df['Independence'] = df['Independence'].str.split(expand=True)[2]
df.iloc[7, 0] = 'China'

# Remove duplicates
df = df.drop_duplicates(['Country'], keep='last').reset_index(drop = True)

#
df['Continent'] = 'Asia'

# Cleaning column 'Country'
df['Country'] = df['Country'].str.replace('\xa0', '')

df


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Afghanistan,,Treaty of Rawalpindi ends British control of f...,,Asia
1,Bahrain,1971,End of treaties with the United Kingdom,,Asia
2,Bangladesh,1971,Independence from Pakistan declared,,Asia
3,Bhutan,,Ugyen Wangchuck ends a period of civil war and...,,Asia
4,Brunei,1984,Brunei regains its independence after an agree...,,Asia
5,Cambodia,1989,Becomes free from Vietnamese occupation; it ge...,,Asia
6,China,,"In 221 BC, Qin Shi Huang conquered the various...",,Asia
7,India,1947,Independence from the British Empire,,Asia
8,Indonesia,1949,Independence from the Kingdom of the Netherlan...,,Asia
9,Iran,,After the fall of Assyria between 616 BC and 6...,,Asia


In [23]:
# Get some colonial powers
df['Independence From'] = df['Independence From'].str.replace(
    "Independence From|independence from | independence from|the| declared| recognized|recognized |End of treaties with ",
    "", case=False, regex=True)

df['Independence From'] = df['Independence From'].str.strip()

df.loc[df['Independence From'].str.contains(
    "British|United Kingdom"), ['Independence From', 'Main Colonial Power']] = 'United Kingdom'

df.loc[df['Independence From'].str.contains(
    "France|French"), ['Independence From', 'Main Colonial Power']] = 'France'

df.loc[df['Independence From'].str.contains(
    "Soviet"), 'Main Colonial Power'] = 'Russia'

df.loc[df['Independence From'].str.contains(
    "Netherlands|Nerlands"), ['Independence From', 'Main Colonial Power']] = 'Netherlands'


In [24]:
# Corrections
# Afghanistan
df.iloc[0, 1] = 1919

# Bangladesh
df.iloc[2, 3] = 'United Kingdom'

# Bhutan, Japan, China, Iran, Nepal, Saudi Arabia, Thailand
df.iloc[[3, 6, 9, 12, 22, 28, 33], [1, 2, 3]] = 'Not Colonized'

# Cambodia
df.iloc[5, 1] = 1953

# Kuwait
df.iloc[14, 1] = 1961

# Malaysia
df.iloc[18, 1] = 1963

# Mongolia
df.iloc[20, 2] = 'China'
df.iloc[20, 3] = 'Not Colonized'

# Oman
df.iloc[23, 1] = 1970

# Philippines
df.iloc[26, 2] = 'United States'
df.iloc[26, 3] = 'Spain'

# Qatar
df.iloc[27, 1] = 1971

# Singapore
df.iloc[29, 1] = 1965
df.iloc[29, 2] = 'Malaysia'
df.iloc[29, 3] = 'United Kingdom'

# Syria
df.iloc[31, 1] = 1946

# Timor-Leste (From Portugal: 1975 | From Indonesia: 2002)
df.iloc[34, 2] = 'Indonesia'
df.iloc[34, 3] = 'Portugal'

# Vietnam
df.iloc[38, 1] = 1945

# Yemen (North: 1918 | South: 1967 | Unification: 1990)
# df.iloc[39, 1] = 1990

#
# Iraq, Kuwait, Malaysia, Oman, Palestine, Qatar
df.iloc[[10, 14, 18, 23, 25, 27], [2, 3]] = 'United Kingdom'

# Cambodia, Syria, Vietnam
df.iloc[[5, 31, 38], [2, 3]] = 'France'

df['Main Colonial Power'] = df['Main Colonial Power'].fillna(df['Independence From'])


In [25]:
df


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Afghanistan,1919,United Kingdom,United Kingdom,Asia
1,Bahrain,1971,United Kingdom,United Kingdom,Asia
2,Bangladesh,1971,Pakistan,United Kingdom,Asia
3,Bhutan,Not Colonized,Not Colonized,Not Colonized,Asia
4,Brunei,1984,United Kingdom,United Kingdom,Asia
5,Cambodia,1953,France,France,Asia
6,China,Not Colonized,Not Colonized,Not Colonized,Asia
7,India,1947,United Kingdom,United Kingdom,Asia
8,Indonesia,1949,Netherlands,Netherlands,Asia
9,Iran,Not Colonized,Not Colonized,Not Colonized,Asia


In [26]:
# Filling main dataframe
map_fill(data, df, 'Country', ['Independence',
         'Independence From', 'Main Colonial Power', 'Continent'])

data


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Afghanistan,1919,United Kingdom,United Kingdom,Asia
1,Albania,,,,
2,Algeria,1962,France,France,Africa
3,Andorra,,,,
4,Angola,1975,Portugal,Portugal,Africa
...,...,...,...,...,...
190,Vatican City,,,,
191,Vietnam,1945,France,France,Asia
192,Yemen,1967,United Kingdom,United Kingdom,Asia
193,Zambia,1964,United Kingdom,United Kingdom,Africa


# Oceania

In [27]:
url = 'https://en.wikipedia.org/wiki/Decolonisation_of_Oceania'

In [28]:
# # Another possible source/table
# df = get_dataframe(get_tables(url)[0], ['td', 'th'], {'scope': 'col'})[['Country', 'Colonial power', 'Independence date']]

# df['Independence From'] = None
# df = df.rename(columns={'Independence date': 'Independence',
#                         'Colonial power': 'Main Colonial Power'})

# df['Independence'] = df['Independence'].str.split(expand=True)[2]

# df


In [29]:
# Gets table
df = get_dataframe(get_tables(url)[1])[['Country', 'Date of acquisition of sovereignty', 'Acquisition of sovereignty']]

# Rename columns to fit with the 'data' dataframe
df['Main Colonial Power'] = None
df = df.rename(columns={"Date of acquisition of sovereignty": 'Independence',
                        'Acquisition of sovereignty': 'Independence From'})

df['Independence'] = df['Independence'].str.split(expand=True)[2]

# Remove duplicates
df = df.drop_duplicates(['Country'], keep='last').reset_index(drop=True)

# Or 'Australia'
df['Continent'] = 'Oceania'

# Cleaning column 'Country'
df['Country'] = df['Country'].str.replace('\xa0', '')


df


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Australia,1986,Australia Act 1986 – Remaining legal ties betw...,,Oceania
1,Fiji,1970,Independence from the United Kingdom,,Oceania
2,Kiribati,1979,Independence from the United Kingdom,,Oceania
3,Marshall Islands,1986,Compact of Free Association with the United St...,,Oceania
4,Federated States of Micronesia,1986,Compact of Free Association with the United St...,,Oceania
5,Nauru,1968,"Independence from UN Trusteeship (Australian, ...",,Oceania
6,New Zealand,1967,Governor-General becomes a New Zealand appoint...,,Oceania
7,Palau,1994,Emerged from United Nations trusteeship (admin...,,Oceania
8,Papua New Guinea,1975,Independence from Australia,,Oceania
9,Samoa,1962,Independence from New Zealand,,Oceania


In [30]:
# Get some colonial powers
df['Independence From'] = df['Independence From'].str.replace(
    "Independence From|the | declared| recognized| recognised|End of treaties with ",
    "", case=False, regex=True)

df['Independence From'] = df['Independence From'].str.strip()

df.loc[df['Independence From'].str.contains(
    "British|UK|United Kingdom"), ['Independence From', 'Main Colonial Power']] = 'United Kingdom'


In [31]:
# Corrections
# Marshall Islands, Federated States of Micronesia, Palau
df.iloc[[3, 4, 7], 2] = 'United States'
df.iloc[[3, 4, 7], 3] = 'Spain'

# New Zealand (Westminster: 1947)
df.iloc[6, 1] = 1947
df.iloc[6, [2, 3]] = 'United Kingdom'

# Papua New Guinea, Samoa
df.iloc[[8, 9], 3] = 'Germany'

In [32]:
df

Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Australia,1986,United Kingdom,United Kingdom,Oceania
1,Fiji,1970,United Kingdom,United Kingdom,Oceania
2,Kiribati,1979,United Kingdom,United Kingdom,Oceania
3,Marshall Islands,1986,United States,Spain,Oceania
4,Federated States of Micronesia,1986,United States,Spain,Oceania
5,Nauru,1968,United Kingdom,United Kingdom,Oceania
6,New Zealand,1947,United Kingdom,United Kingdom,Oceania
7,Palau,1994,United States,Spain,Oceania
8,Papua New Guinea,1975,Australia,Germany,Oceania
9,Samoa,1962,New Zealand,Germany,Oceania


In [33]:
# Filling main dataframe
map_fill(data, df, 'Country', ['Independence',
         'Independence From', 'Main Colonial Power', 'Continent'])

data


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Afghanistan,1919,United Kingdom,United Kingdom,Asia
1,Albania,,,,
2,Algeria,1962,France,France,Africa
3,Andorra,,,,
4,Angola,1975,Portugal,Portugal,Africa
...,...,...,...,...,...
190,Vatican City,,,,
191,Vietnam,1945,France,France,Asia
192,Yemen,1967,United Kingdom,United Kingdom,Asia
193,Zambia,1964,United Kingdom,United Kingdom,Africa


# Last Corrections

In [34]:
data

Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Afghanistan,1919,United Kingdom,United Kingdom,Asia
1,Albania,,,,
2,Algeria,1962,France,France,Africa
3,Andorra,,,,
4,Angola,1975,Portugal,Portugal,Africa
...,...,...,...,...,...
190,Vatican City,,,,
191,Vietnam,1945,France,France,Asia
192,Yemen,1967,United Kingdom,United Kingdom,Asia
193,Zambia,1964,United Kingdom,United Kingdom,Africa


## Europe

In [35]:
url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_by_date_of_formation'


In [36]:
# Gets table
df = get_dataframe(get_tables(url)[3])[
    ['Country', 'Date of acquisition of sovereignty', 'Acquisition of sovereignty']]

# Rename columns to fit with the 'data' dataframe
df['Main Colonial Power'] = None
df = df.rename(columns={"Date of acquisition of sovereignty": 'Independence',
                        'Acquisition of sovereignty': 'Independence From'})

#
df['Independence'] = df['Independence'].str.split(expand=True)[2]

# Remove duplicates
df = df.drop_duplicates(['Country'], keep='last').reset_index(drop=True)

#
df['Continent'] = 'Europe'

# Cleaning column 'Country'
df['Country'] = df['Country'].str.replace('\xa0', '')

df


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Albania,1944.0,Albanian state re-established after Italian/Ge...,,Europe
1,Andorra,,Independence from Aragon,,Europe
2,Austria,1945.0,Restoration of the Republic of Austria,,Europe
3,Belarus,1991.0,Independence recognized by Soviet Union,,Europe
4,Belgium,1830.0,Independence was proclaimed by the provisonial...,,Europe
5,Bosnia and Herzegovina,1992.0,Independence declared from the SFR Yugoslavia,,Europe
6,Bulgaria,1908.0,Independence from Ottoman Empire,,Europe
7,Croatia,,Formation of Kingdom of Croatia by King Tomislav,,Europe
8,Czech Republic,,Creation of Czechoslovakia,,Europe
9,Denmark,,Harald Bluetooth unifies Denmark,,Europe


In [37]:
# 
df.iloc[8, 0] = 'Czechia'
df.iloc[28, 0] = 'North Macedonia'

In [38]:
#
df['Independence From'] = df['Independence From'].str.replace(
    "Independence From|the|declared|recognised|End of treaties with",
    "", case=False, regex=True)

df['Independence From'] = df['Independence From'].str.strip()

# 
df.loc[df['Independence From'].str.contains(
    "British|UK|United Kingdom"), ['Independence From', 'Main Colonial Power']] = 'United Kingdom'

# Former USSR
df.loc[df['Independence From'].str.contains(
    "Soviet"), 'Independence From'] = 'Soviet Union'

# Former Yugoslavia
df.loc[df['Independence From'].str.contains(
    "Yugoslavia"), 'Independence From'] = 'Yugoslavia'


In [39]:
# Corrections
# Albania
df.iloc[0, 1] = 1912

# Andorra
df.iloc[1, 1] = 1814
df.iloc[1, 2] = 'France'

# Austria
# Or 1920 from Austria-Hungary
df.iloc[2, 1] = 1945
df.iloc[2, 2] = 'Germany'

# Belgium (Declared: 1830 | Recognized: 1839)
# df.iloc[4, 1] = 1839

# Croatia
df.iloc[7, 1] = 1991
df.iloc[7, 2] = 'Yugoslavia'

# Czechoslovakia -> Czechia & Slovakia
# Or 1918 from Austria-Hungary
df.iloc[[8, 35], 1] = 1993
df.iloc[[8, 35], 2] = 'Czechoslovakia'

# Greece (Declared: 1822 | Recognized: 1830)
df.iloc[14, 1] = 1822

# Hungary
df.iloc[15, 1] = 1920
df.iloc[15, 2] = 'Austria-Hungary' # ?

# Iceland ('Independent' in a personal union: 1918 | Republic: 1944)
# df.iloc[16, 1] = 1918
df.iloc[16, 3] = 'Norway'

# Lithuania
df.iloc[21, 1] = 1990

# Luxembourg
df.iloc[22, 1] = 1890

# Monaco
df.iloc[25, 1] = 1814
df.iloc[25, 2] = 'France'

# Netherlands
df.iloc[27, 1] = 1581

# Norway
df.iloc[29, 1] = 1905
df.iloc[29, 2] = 'Sweden'

# Poland 
# Kingdom: 1916/1917 (From Germany)
# Second Republic: 1919 (From Germany | Treaty of Versailles)
# Post-War: 1947 (From Germany)
# Third Republic: 1989 (From ?)
df.iloc[30, 1] = 1918
df.iloc[30, 2] = 'Germany'

# Portugal
# First: 1143 from Leon
# Portuguese Restoration War: 1640
# End of the Iberian Union: 1668
df.iloc[31, 1] = 1668

# Romania
df.iloc[32, 1] = 1878

# Serbia and Montenegro -> Serbia (34) and Montenegro (26)
# For Serbia
# Treaty of Berlin: 1878 from the Ottoman Empire
# Yugoslavia: 1918 from Austria-Hungary
# Dissolution of Serbia and Montenegro: 2006
df.iloc[34, 1] = 2006
df.iloc[[26, 34], 2] = 'Serbia and Montenegro' # Or 'Serbia' or 'Yugoslavia'?

# Sweden
df.iloc[38, 1] = 1523

# Ukraine
df.iloc[40, 1] = 1991

# Vatican
df.iloc[42, 1] = 1929
df.iloc[42, 2] = 'Italy'

# From the Ottoman Empire
# Albania, Greece, Romania
df.iloc[[0, 14, 32], 2] = 'Ottoman Empire'

# From the Netherlands
# Belgium, Luxembourg
df.iloc[[4, 22], 2] = 'Netherlands'

# From Denmark
# Iceland, Sweden
df.iloc[[16, 38], 2] = 'Denmark'

# From the Soviet Union
# Lithuania, Ukraine
df.iloc[[21, 40], 2] = 'Soviet Union'

# From Spain
# Netherlands, Portugal
df.iloc[[27, 31], 2] = 'Spain'

# Not Colonized
df.iloc[[9, 12, 13, 18, 20, 37, 39, 41], [1, 2]] = 'Not Colonized'
df['Main Colonial Power'] = df['Main Colonial Power'].fillna('Not Colonized')


In [40]:
df

Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Albania,1912,Ottoman Empire,Not Colonized,Europe
1,Andorra,1814,France,Not Colonized,Europe
2,Austria,1945,Germany,Not Colonized,Europe
3,Belarus,1991,Soviet Union,Not Colonized,Europe
4,Belgium,1830,Netherlands,Not Colonized,Europe
5,Bosnia and Herzegovina,1992,Yugoslavia,Not Colonized,Europe
6,Bulgaria,1908,Ottoman Empire,Not Colonized,Europe
7,Croatia,1991,Yugoslavia,Not Colonized,Europe
8,Czechia,1993,Czechoslovakia,Not Colonized,Europe
9,Denmark,Not Colonized,Not Colonized,Not Colonized,Europe


In [41]:
map_fill(data, df, 'Country', ['Independence',
         'Independence From', 'Main Colonial Power', 'Continent'])

data


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
0,Afghanistan,1919,United Kingdom,United Kingdom,Asia
1,Albania,1912,Ottoman Empire,Not Colonized,Europe
2,Algeria,1962,France,France,Africa
3,Andorra,1814,France,Not Colonized,Europe
4,Angola,1975,Portugal,Portugal,Africa
...,...,...,...,...,...
190,Vatican City,1929,Italy,Not Colonized,Europe
191,Vietnam,1945,France,France,Asia
192,Yemen,1967,United Kingdom,United Kingdom,Asia
193,Zambia,1964,United Kingdom,United Kingdom,Africa


# Finishing

In [42]:
# Data where any column is not filled
data[(data['Independence'].isnull()) |
     (data['Independence From'].isnull()) |
     (data['Main Colonial Power'].isnull()) |
     (data['Continent'].isnull())
     ]


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
7,Armenia,,,,
10,Azerbaijan,,,,
42,Cyprus,,,,
45,North Korea,,,,
65,Georgia,,,,
88,Kazakhstan,,,,
141,South Korea,,,,
144,Russia,,,,
180,Türkiye,,,,


In [43]:
# Koreas
data.loc[data['Country'].str.contains("Korea"), 'Independence'] = 1948
data.loc[data['Country'].str.contains("Korea"), 'Independence From'] = 'Japan'
data.loc[data['Country'].str.contains("Korea"), 'Main Colonial Power'] = 'Not Colonized'


In [44]:
# Armenia, Azerbaijan, Georgia, Kazakhstan
data.iloc[[7, 10, 63, 86], 1] = 1991
data.iloc[[7, 10, 63, 86], 2] = 'Soviet Union'
data.iloc[[10, 63, 86], 3] = 'Russia'

# Cyprus
data.iloc[41, 1] = 1960
data.iloc[41, [2, 3]] = 'United Kingdom'

# Armenia, Russia, Turkiye
data.iloc[[142, 180], [1, 2]] = 'Not Colonized'
data.iloc[[7, 142, 180], 3] = 'Not Colonized'


In [45]:
# Continents (Main continent)
# Asia (Armenia, Azerbaijan, Cyprus, Georgia, Kazakhstan, Koreas, Turkiye)
data.iloc[[7, 10, 41, 63, 86, 125, 160, 180], 4] = 'Asia'

# Europe
data['Continent'] = data['Continent'].fillna('Europe')


In [46]:
data[(data['Independence'].isnull()) |
     (data['Independence From'].isnull()) |
     (data['Main Colonial Power'].isnull()) |
     (data['Continent'].isnull())
     ]


Unnamed: 0,Country,Independence,Independence From,Main Colonial Power,Continent
42,Cyprus,,,,Europe
65,Georgia,,,,Europe
88,Kazakhstan,,,,Europe
144,Russia,,,,Europe


In [47]:
data.to_csv('../CSV/colonialism.csv')