In [1]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

# Download spaCy English model
!python -m spacy download en_core_web_sm

# Load spacy English module
NER = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -------------- ------------------------- 4.7/12.8 MB 31.6 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 44.6 MB/s  0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Load your scraped text
with open('Key_Events_Of_The_20th_Century_CLEAN.txt', 'r', errors='ignore') as file: 
    data = file.read().replace('\n', ' ')

book = NER(data)

In [3]:
# Show a sample of text
with open('Key_Events_Of_The_20th_Century_CLEAN.txt', 'r', errors='ignore') as f:
    data = f.read()
    # Count unusual characters
special_chars = re.findall(r'[^A-Za-z0-9.,;:\'\"!?()\s]', data)
set(special_chars)  # unique special characters in the text

{'&',
 '-',
 '/',
 '=',
 '^',
 '_',
 '|',
 '¢',
 '£',
 '¦',
 '§',
 '¨',
 '©',
 '¬',
 '\xad',
 '®',
 '¯',
 '°',
 '±',
 '²',
 '¶',
 '¸',
 '¹',
 'º',
 '¼',
 '½',
 'Â',
 'Ã',
 'Å',
 'â',
 'ž',
 'ƒ',
 '˜',
 '‘',
 '’',
 '‚',
 '“',
 '”',
 '„',
 '€',
 '™'}

## A lot of extra characters and white space present along with headings that aren't needed.

In [4]:
# removing extra characters

data = data.replace("–", "-").replace("“", '"').replace("”", '"').replace("…", "...")

In [5]:
# Removing edit markers and "Main article:" references
data = re.sub(r'\[edit\]', '', data)
data = re.sub(r'Main article:.*\n', '', data)

In [6]:
# Removing navigation/boilerplate keywords
boilerplate = ["Jump to content", "Main menu", "move to sidebar", "hide", 
               "Donate", "Create account", "Log in", "Tools", "Print/export", 
               "References", "External links"]
for term in boilerplate:
    data = data.replace(term, "")

In [7]:
# Normalizing common Unicode mis-encodings
data = data.replace("â€", "-").replace("\u200b", "")

In [8]:
# Standardizing white space

data = re.sub(r'\s+', ' ', data).strip()

In [9]:
# saving cleaned version

with open('Key_Events_Of_The_20th_Century_CLEAN.txt', 'w', encoding='utf-8') as f:
    f.write(data)

In [10]:
# loading the countries CSV

countries_df = pd.read_csv("countries_list_20th_century_1.5.csv")

In [11]:
# checking for column name

countries_df.head()

Unnamed: 0.1,Unnamed: 0,country_name
0,1,Afghanistan
1,2,Albania
2,3,Algeria
3,4,Andorra
4,5,Angola


In [12]:
# all of the following is cleaning to make sure the missing countries syntax runs correctly

country_list = list(countries_df['country_name'])

In [13]:
# Remove leading/trailing spaces
country_list = [c.strip() for c in country_list]

In [14]:
country_list = [c.replace(" ,", ",").replace(" , ", ", ").replace("  ", " ") for c in country_list]

In [15]:
country_list = list(set(country_list))

In [16]:
# list of all countries in CSV that do not appear in the text

missing_countries = [c for c in country_list if c not in data]
missing_countries

['Iceland',
 'Nicaragua',
 'Myanmar',
 'Brazil',
 'Ivory Coast',
 'Nepal',
 'Tonga',
 'Samoa',
 "China, People's Republic of",
 'North Macedonia',
 'Northern Cyprus',
 'United Arab Emirates',
 'Argentina',
 'Turkey',
 'Jordan',
 'Congo, Republic of the',
 'Taiwan',
 'Peru',
 'Belize',
 'Korea, North',
 'Colombia',
 'Serbia',
 'Bosnia and Herzegovina',
 'Slovakia',
 'Chile',
 'Montenegro',
 'Gambia, The',
 'Tajikistan',
 'Micronesia, Federated States of',
 'Tanzania',
 'Guatemala',
 'Lesotho',
 'Ethiopia',
 'Kazakhstan',
 'Slovenia',
 'South Ossetia',
 'Uruguay',
 'Brunei',
 'Kyrgyzstan',
 'Central African Republic',
 'Togo',
 'Malaysia',
 'Paraguay',
 'Kosovo',
 'Vatican City (Holy See)',
 'Georgia',
 'Nauru',
 'Burundi',
 'Grenada',
 'Kuwait',
 'Oman',
 'Vanuatu',
 'Bolivia',
 'Trinidad and Tobago',
 'Tuvalu',
 'Bahamas, The',
 'Cyprus',
 'Senegal',
 'Eritrea',
 'Bhutan',
 'Czech Republic',
 'Namibia',
 'Barbados',
 'Congo, Democratic Republic of the',
 'Transnistria',
 'Costa Rica',


In [18]:
import re

# ---- small alias map (extend as needed) ----
alias_map = {
    "united states of america": "United States",
    "usa": "United States",
    "u.s.": "United States",
    "us": "United States",
    "u.k.": "United Kingdom",
    "uk": "United Kingdom",
    "britain": "United Kingdom",
    "soviet union": "Russia",
    "ussr": "Russia",
    "republic of korea": "South Korea",
    "korea, republic of": "South Korea"
}

# create lower-case lookup for exact country_list matches
country_lookup = {c.lower(): c for c in country_list}

# ---- Build sentence-level DataFrame with normalized country entities ----
df_sentences = []

for sent in book.sents:
    entity_list = []
    for ent in sent.ents:
        # normalize entity text
        ent_text = ent.text.strip()
        ent_text = re.sub(r'^[Tt]he\s+', '', ent_text)            # remove leading "The "
        ent_text = ent_text.replace("’s", "").replace("'s", "")   # remove possessives
        ent_text = ent_text.strip('.,;:()[]"\'' )                 # strip surrounding punctuation
        key = ent_text.lower()

        # check alias map first, then exact country_list lookup
        if key in alias_map:
            canonical = alias_map[key]
            if canonical in country_list:
                entity_list.append(canonical)
        elif key in country_lookup:
            entity_list.append(country_lookup[key])

    df_sentences.append({"sentence": sent.text, "country_entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)

# Filter to sentences that actually contain country entities
df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0].reset_index(drop=True)

# quick sanity prints
print("Total sentences:", len(df_sentences))
print("Sentences with country mentions:", len(df_sentences_filtered))
df_sentences_filtered.head()

Total sentences: 1919
Sentences with country mentions: 188


Unnamed: 0,sentence,country_entities
0,After a period of diplomatic and military esca...,"[France, Austria]"
1,"In 1917, Russia ended hostile actions against ...",[Russia]
2,The Bolsheviks negotiated the Treaty of Brest-...,"[Germany, Russia]"
3,"In the treaty, Bolshevik Russia ceded the Balt...",[Germany]
4,It also recognized the independence of Ukraine.,[Ukraine]


In [19]:
# Filtering the entities to only include ones with countries from the countries list

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [20]:
relationships = []

for i in range(len(df_sentences_filtered)):
    end_i = min(i+5, len(df_sentences_filtered)-1)
    char_list = sum(df_sentences_filtered.loc[i:end_i, 'country_entities'].tolist(), [])
    
    char_unique = [char_list[j] for j in range(len(char_list)) if j==0 or char_list[j] != char_list[j-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [21]:
# summarizing relationships

relationship_df = pd.DataFrame(relationships)

# Sort pairs so source-target ordering is consistent
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis=1), columns=['source','target'])

# Count number of co-occurrences
relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], as_index=False).sum()

In [22]:
from itertools import combinations
from collections import Counter

# Expand window size for co-occurrence
window_size = 20
relationships = []

for i in range(len(df_sentences_filtered)):
    end_i = min(i+window_size, len(df_sentences_filtered)-1)
    # collect countries within the window
    countries_in_window = sum(df_sentences_filtered.loc[i:end_i, 'country_entities'].tolist(), [])
    # count unique pairs
    for combo in combinations(set(countries_in_window), 2):
        relationships.append(tuple(sorted(combo)))

# Count co-occurrences
relationship_counts = Counter(relationships)

# Convert to DataFrame
relationship_df = pd.DataFrame(
    [(c1, c2, count) for (c1, c2), count in relationship_counts.items()],
    columns=['Country1', 'Country2', 'Weight']
).sort_values(by="Weight", ascending=False)

relationship_df.head(20)

Unnamed: 0,Country1,Country2,Weight
5,Germany,Russia,120
4,France,Germany,91
10,Germany,Italy,78
29,Germany,United Kingdom,66
2,France,Russia,64
33,Russia,United Kingdom,64
11,Italy,Russia,53
13,Germany,United States,52
32,France,United Kingdom,45
12,Italy,United States,45


In [30]:
relationship_df

Unnamed: 0,Country1,Country2,Weight
5,Germany,Russia,120
4,France,Germany,91
10,Germany,Italy,78
29,Germany,United Kingdom,66
2,France,Russia,64
...,...,...,...
150,Greece,Latvia,2
155,Greece,Lithuania,2
226,Belarus,Japan,2
225,Japan,Ukraine,2


In [24]:
relationship_df.to_csv("20th_century_country_relationships.csv", index=False)