In [2]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

!python -m spacy download en_core_web_sm

# Load spacy English module

NER = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 3.9 MB/s eta 0:00:04
     ------------- -------------------------- 4.5/12.8 MB 9.9 MB/s eta 0:00:01
     ------------------------ --------------- 7.9/12.8 MB 11.9 MB/s eta 0:00:01
     ----------------------------- ---------- 9.4/12.8 MB 12.0 MB/s eta 0:00:01
     ------------------------------ --------- 9.7/12.8 MB 10.8 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 10.2 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
#Open file
with open("Key_Events_20th_Century.txt", "r", errors="ignore") as file:
    data = file.read().replace('\n', ' ')
#Display a preview to inspect
print(data[:5000])

    Key events of the 20th century - Wikipedia                            Jump to content        Main menu      Main menu move to sidebar hide    		Navigation 	   Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us      		Contribute 	   HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages                    Search            Search                       Appearance                 Donate  Create account  Log in         Personal tools      Donate Create account Log in      		Pages for logged out editors learn more    ContributionsTalk                             Contents move to sidebar hide     (Top)      1 Historic events in the 20th century     Toggle Historic events in the 20th century subsection      1.1 World at the beginning of the century       1.1.1 "The war to end all wars": World War I (1914â€“1918)           1.2 Spanish flu       1.2.1 Russian Revolution and communism           1.3 Between the wars       1.3.1 Economic depression         

In [4]:
#Observations:
#The text includes Wikipedia navigation and special characters
#Country Names inconsistencies exist
#Some lines have extra newlines or tabs from wiki formatting.
#Clean by removing wiki artifacts, extra whitespace, and non-content sections

In [5]:
# Clean the text: Remove wiki artifacts, references, and extra whitespace
# Remove [edit]
cleaned_text = re.sub(r'\[edit\]', '', data) 
# Remove references like [123]
cleaned_text = re.sub(r'\[\d+\]', '', cleaned_text)  
# Remove ^
cleaned_text = re.sub(r'\^', '', cleaned_text)  
# Remove headers/footers
cleaned_text = re.sub(r'Jump to content|Main menu|move to sidebar|hide|Navigation|Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us|Contribute|HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages|Search|Appearance|Donate|Create account|Log in|Personal tools|Pages for logged out editors learn more|ContributionsTalk|Toggle the table of contents|languages|Edit links|ArticleTalk|English|ReadEditView history|Tools|Actions|General|What links hereRelated changesUpload filePermanent linkPage informationCite this pageGet shortened URLDownload QR code|Print/export|Download as PDFPrintable version|In other projects|Wikimedia CommonsWikidata item|From Wikipedia, the free encyclopedia|Retrieved from.*', '', cleaned_text, flags=re.DOTALL)
# Normalize whitespace
cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  

cleaned_text = cleaned_text.strip()

In [6]:
# Save cleaned text as .txt
with open("Cleaned_Key_Events_20th_Century.txt", "w", encoding="utf-8") as file:
    file.write(cleaned_text)

# Preview cleaned text
print(cleaned_text[:1000])

Key events of the 20th century - Wikipedia Contents (Top) 1 Historic events in the 20th century Toggle Historic events in the 20th century subsection 1.1 World at the beginning of the century 1.1.1 "The war to end all wars": World War I (1914â€“1918) 1.2 Spanish flu 1.2.1 Russian Revolution and communism 1.3 Between the wars 1.3.1 Economic depression 1.3.2 The rise of dictatorship 1.4 Global war: World War II (1939â€“1945) 1.4.1 The war in Europe 1.4.2 Blitzkrieg 1.4.3 Operation Barbarossa 1.4.4 Turning tides 1.4.5 Operation Overlord 1.4.6 Final days 1.4.7 The war in the Pacific 1.4.7.1 Background 1.4.8 Japanese Expansion 1.4.9 Allied offensive 1.4.10 Final days 1.4.11 The Holocaust 1.4.12 The Nuclear Age begins 1.5 The post-war world 1.5.1 The end of empires: decolonization 1.5.2 The Cold War (1947â€“1991) 1.5.3 War by proxy 1.5.4 The space race 1.5.5 The end of the Cold War 1.5.6 Information and communications technology 1.6 The world at the end of the century 2 See also 3 References

In [7]:
# Process the cleaned text with spaCy NER
doc = NER(cleaned_text)

In [8]:
# Create a list of sentences and entities
df_sentences = []
for sent in doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent.text, "entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)


print(df_sentences.head())

                                            sentence  \
0  Key events of the 20th century - Wikipedia Con...   
1  The rise of dictatorship 1.4 Global war: World...   
2  The war in Europe 1.4.2 Blitzkrieg 1.4.3 Opera...   
3  Turning tides 1.4.5 Operation Overlord 1.4.6 F...   
4    1.4.9 Allied offensive 1.4.10 Final days 1.4.11   

                                            entities  
0  [the 20th century, 1, the 20th century, Toggle...  
1                  [1.4 Global, World War II, 1.4.1]  
2                                    [Europe, 1.4.4]  
3         [Operation Overlord 1.4.6 Final, Japanese]  
4               [1.4.9, Allied, 1.4.10, days 1.4.11]  


In [9]:
# Load countries list
countries_df = pd.read_csv("countries_list_20th_century_1.5.csv")
countries_df['country_name'] = countries_df['country_name'].str.strip().str.replace(r'^"|"$', '', regex=True)

In [10]:
# Define variation mapping for consistency
country_variations = {
    'US': 'United States',
    'USA': 'United States',
    'America': 'United States',
    'Britain': 'United Kingdom',
    'UK': 'United Kingdom',
    'Soviet Union': 'Russia',
    'USSR': 'Russia',
    'Great Britain': 'United Kingdom',
    'Congo Free State': 'Congo, Democratic Republic of the',
    'Persia': 'Iran',
    'North Korea': 'Korea, North',
    'South Korea': 'Korea, South'}

In [11]:
# Define function to filter entities
def filter_entity(ent_list, countries_df):
    return [country_variations.get(ent, ent) for ent in ent_list if country_variations.get(ent, ent) in countries_df['country_name'].tolist()]

In [12]:
# Apply filter
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, countries_df))

In [13]:
# Filter out rows with no countries
df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

print(df_sentences_filtered.head())

                                             sentence  \
22  After a period of diplomatic and military esca...   
23  In 1917, Russia ended hostile actions against ...   
24  The Bolsheviks negotiated the Treaty of Brest-...   
25  In the treaty, Bolshevik Russia ceded the Balt...   
26    It also recognized the independence of Ukraine.   

                                             entities  \
22  [the July Crisis, the end of July 1914, Britis...   
23           [1917, Russia, the Central Powers, Tsar]   
24     [the Treaty of Brest-Litovsk, Germany, Russia]   
25  [Bolshevik Russia, Baltic, Germany, Kars Oblas...   
26                                          [Ukraine]   

              country_entities  
22  [France, Austria, Hungary]  
23                    [Russia]  
24           [Germany, Russia]  
25                   [Germany]  
26                   [Ukraine]  


In [14]:
# Create relationships
relationships = []

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i + 5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i:end_i].country_entities), [])
    char_unique = [char_list[i] for i in range(len(char_list)) if (i == 0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

# Create DataFrame
relationships_df = pd.DataFrame(relationships)

# Preview
print(relationships_df.head(10))

    source   target
0   France  Austria
1  Austria  Hungary
2   France  Austria
3  Austria  Hungary
4  Hungary   Russia
5   France  Austria
6  Austria  Hungary
7  Hungary   Russia
8   Russia  Germany
9  Germany   Russia


In [17]:
#Sort each row alphabetically
relationships_df = pd.DataFrame(
    np.sort(relationships_df.values, axis=1),
    columns=relationships_df.columns)

In [18]:
relationship_counts = relationships_df.value_counts().reset_index(name="count")

print(relationship_counts.head())

    source          target  count
0    Japan   United States     39
1  Germany          Russia     33
2  Germany           Italy     26
3   France  United Kingdom     24
4  Germany          Poland     20


In [21]:
# Save to CSV
relationship_counts.to_csv("country_relationships.csv", index=False)