In [1]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

# Download spaCy English model
!python -m spacy download en_core_web_sm

# Load spacy English module
NER = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------ --------------------------------- 2.1/12.8 MB 14.7 MB/s eta 0:00:01
     --------- ------------------------------ 2.9/12.8 MB 8.0 MB/s eta 0:00:02
     ------------ --------------------------- 3.9/12.8 MB 6.9 MB/s eta 0:00:02
     -------------- ------------------------- 4.7/12.8 MB 6.2 MB/s eta 0:00:02
     ----------------- ---------------------- 5.5/12.8 MB 5.7 MB/s eta 0:00:02
     ----------------------- ---------------- 7.6/12.8 MB 6.3 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 9.4 MB/s  0:00:01
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Load your scraped text
with open('Key_Events_Of_The_20th_Century_CLEAN.txt', 'r', errors='ignore') as file: 
    data = file.read().replace('\n', ' ')

book = NER(data)

In [3]:
# Show a sample of text
with open('Key_Events_Of_The_20th_Century_CLEAN.txt', 'r', errors='ignore') as f:
    data = f.read()
    # Count unusual characters
special_chars = re.findall(r'[^A-Za-z0-9.,;:\'\"!?()\s]', data)
set(special_chars)  # unique special characters in the text

{'&',
 '-',
 '/',
 '=',
 '^',
 '_',
 '|',
 '¡',
 '¢',
 '£',
 '¦',
 '§',
 '¨',
 '©',
 '¬',
 '\xad',
 '®',
 '¯',
 '°',
 '±',
 '²',
 '¶',
 '¸',
 '¹',
 'º',
 '¼',
 '½',
 '¾',
 'Â',
 'Ã',
 'Å',
 'Æ',
 'â',
 'š',
 'ž',
 'ƒ',
 '’',
 '‚',
 '„',
 '†',
 '…',
 '€',
 '™'}

## A lot of extra characters and white space present along with headings that aren't needed.

In [4]:
# removing extra characters

data = data.replace("–", "-").replace("“", '"').replace("”", '"').replace("…", "...")

In [5]:
# Removing edit markers and "Main article:" references
data = re.sub(r'\[edit\]', '', data)
data = re.sub(r'Main article:.*\n', '', data)

In [6]:
# Removing navigation/boilerplate keywords
boilerplate = ["Jump to content", "Main menu", "move to sidebar", "hide", 
               "Donate", "Create account", "Log in", "Tools", "Print/export", 
               "References", "External links"]
for term in boilerplate:
    data = data.replace(term, "")

In [7]:
# Normalizing common Unicode mis-encodings
data = data.replace("â€", "-").replace("\u200b", "")

In [8]:
# Standardizing white space

data = re.sub(r'\s+', ' ', data).strip()

In [9]:
# saving cleaned version

with open('Key_Events_Of_The_20th_Century_CLEAN.txt', 'w', encoding='utf-8') as f:
    f.write(data)

In [10]:
# loading the countries CSV

countries_df = pd.read_csv("countries_list_20th_century_1.5.csv")

In [11]:
# checking for column name

countries_df.head()

Unnamed: 0.1,Unnamed: 0,country_name
0,1,Afghanistan
1,2,Albania
2,3,Algeria
3,4,Andorra
4,5,Angola


In [12]:
# all of the following is cleaning to make sure the missing countries syntax runs correctly

country_list = list(countries_df['country_name'])

In [13]:
# Remove leading/trailing spaces
country_list = [c.strip() for c in country_list]

In [14]:
country_list = [c.replace(" ,", ",").replace(" , ", ", ").replace("  ", " ") for c in country_list]

In [15]:
country_list = list(set(country_list))

In [16]:
# list of all countries in CSV that do not appear in the text

missing_countries = [c for c in country_list if c not in data]
missing_countries

['Suriname',
 'Kiribati',
 'Myanmar',
 'Tanzania',
 'Slovenia',
 'Vatican City (Holy See)',
 'Qatar',
 'Malta',
 'Mali',
 'South Sudan',
 'Tuvalu',
 'Taiwan',
 'Trinidad and Tobago',
 'Sierra Leone',
 'Iceland',
 'Andorra',
 'El Salvador',
 'Ivory Coast',
 'Mauritania',
 'Georgia',
 'Brunei',
 'Korea, South',
 'Tonga',
 'Togo',
 'Costa Rica',
 'Burundi',
 'Jordan',
 'Jamaica',
 'Croatia',
 'Guatemala',
 'Benin',
 'New Zealand',
 'Samoa',
 'Brazil',
 'Saudi Arabia',
 'Peru',
 'Somaliland',
 'Tajikistan',
 'East Timor',
 'United Arab Emirates',
 'Belize',
 'Serbia',
 'Venezuela',
 'Kyrgyzstan',
 'Saint Lucia',
 'Uruguay',
 'Oman',
 "Luhansk People's Republic",
 'South Ossetia',
 'Honduras',
 'Congo, Republic of the',
 'Marshall Islands',
 'Lesotho',
 'Mauritius',
 'Yemen',
 'Equatorial Guinea',
 'Ecuador',
 'Gambia, The',
 'Congo, Democratic Republic of the',
 'Monaco',
 'Burkina Faso',
 'Armenia',
 'Central African Republic',
 'São Tomé and Príncipe',
 'Colombia',
 'Uganda',
 'Nauru',
 

In [22]:
# Splitting the sentence entities from the NER object

df_sentences = []

for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents if ent.text in country_list]
    df_sentences.append({"sentence": sent.text, "country_entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)

In [24]:
# Filtering the entities to only include ones with countries from the countries list

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [25]:
relationships = []

for i in range(len(df_sentences_filtered)):
    end_i = min(i+5, len(df_sentences_filtered)-1)
    char_list = sum(df_sentences_filtered.loc[i:end_i, 'country_entities'].tolist(), [])
    
    char_unique = [char_list[j] for j in range(len(char_list)) if j==0 or char_list[j] != char_list[j-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [26]:
# summarizing relationships

relationship_df = pd.DataFrame(relationships)

# Sort pairs so source-target ordering is consistent
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis=1), columns=['source','target'])

# Count number of co-occurrences
relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], as_index=False).sum()

In [27]:
relationship_df.to_csv("20th_century_country_relationships.csv", index=False)