In [1]:
import pandas as pd
import spacy
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network
import scipy
import re
import os

In [2]:
os.listdir()

['1.4.ipynb',
 '1.6.ipynb',
 'country_relationships_counts.csv',
 'country_relationships.csv',
 'key_events_20th_century.txt',
 '20th_century_scrape.ipynb',
 'anaconda_projects',
 'Untitled.ipynb',
 '1.7.ipynb',
 'Final 1.7.ipynb',
 '1.5.ipynb',
 '1.3.ipynb',
 'countries_network_interactive.html',
 '.gitignore',
 '.virtual_documents',
 'countries_network_communities.html',
 'lib',
 '.ipynb_checkpoints',
 'cleaned_twentieth_century.txt',
 'key_events_20th_century_clean.txt',
 'venv',
 '1.6 Part 2 .ipynb',
 '.git',
 'countries_list_20th_century_1.5.csv']

In [3]:
# Loading the countries list
countries_df = pd.read_csv('countries_list_20th_century_1.5.csv')
countries_df.head()

Unnamed: 0.1,Unnamed: 0,country_name
0,1,Afghanistan
1,2,Albania
2,3,Algeria
3,4,Andorra
4,5,Angola


In [5]:
#Loading the 20th century file
with open("/Users/divyaneopaney/Downloads/key_events_20th_century_clean.txt", "r", encoding="utf-8") as f:
    text = f.read()
# Quick preview
print(text[:50])

  Key events of the 20th century - Wikipedia      


In [8]:
# Cleaning the country names
countries = countries_df['country_name'].dropna().str.strip().unique().tolist()

print(f"Total countries loaded: {len(countries)}")

Total countries loaded: 208


In [9]:
# Basic cleaning
text = text.replace('\n', ' ').replace('\r', ' ')
text = text.replace('—', '-').replace('’', "'")

In [10]:
with open('cleaned_twentieth_century.txt', 'w', encoding='utf-8') as f:
    f.write(text)

In [11]:
#Creating NER Object
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

In [15]:
# Create a mapping from lowercase country name -> canonical country name
country_map = {c.lower(): c for c in countries}

# Build df_sentences from spaCy doc
df_sentences = []

for sent in doc.sents:
    entity_list = [ent.text for ent in sent.ents]  # all entities spaCy found
    df_sentences.append({
        "sentence": sent.text,
        "entities": entity_list
    })

df_sentences = pd.DataFrame(df_sentences)
df_sentences.head()

Unnamed: 0,sentence,entities
0,Key events of the 20th century - Wikipedia ...,[the 20th century -]
1,articleAbout WikipediaContact us \t\tCont...,[Search Search ...
2,The rise of dictatorship 1.4 Global ...,"[1.4, World War II, 1939-1945, 1.4.1]"
3,The war in Europe 1.4.2 Blitzkrieg ...,"[Europe, 1.4.2, Blitzkrieg 1.4.3, Oper..."
4,Turning tides,[]


In [16]:
def filter_country_entities(ent_list, country_map):
    filtered = []
    for ent in ent_list:
        if not isinstance(ent, str):
            continue
        ent_norm = ent.strip().lower()
        if ent_norm in country_map:
            # use canonical name from your countries list
            filtered.append(country_map[ent_norm])
    return filtered

# Apply to each row
df_sentences["country_entities"] = df_sentences["entities"].apply(
    lambda ents: filter_country_entities(ents, country_map)
)

# Keep only sentences that actually mention at least one country
df_sentences_filtered = df_sentences[
    df_sentences["country_entities"].map(len) > 0
].reset_index(drop=True)

print("Total sentences:", len(df_sentences))
print("Sentences with at least one country:", len(df_sentences_filtered))
df_sentences_filtered.head()

Total sentences: 1639
Sentences with at least one country: 126


Unnamed: 0,sentence,entities,country_entities
0,After a period of diplomatic and military esca...,"[the July Crisis, the end of July 1914, the Br...","[France, Austria]"
1,"In 1917, Russia ended hostile actions against ...","[1917, Russia, the Central Powers, Tsar]",[Russia]
2,The Bolsheviks negotiated the Treaty of Brest-...,"[Bolsheviks, Germany, Russia]","[Germany, Russia]"
3,"In the treaty, Bolshevik Russia ceded the Balt...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
4,It also recognized the independence of Ukraine...,"[Germany, Allied, American, 1918.[4]",[Germany]


In [23]:
# Defining relationships
window_size = 5   # how many sentences to look at simultaneously
relationships = []  # empty list to store all pairs

for i in range(df_sentences_filtered.index[-1] + 1):
    end_i = min(i + window_size, df_sentences_filtered.index[-1])
    
    # Concatenate all country lists from i to end_i
    country_list = sum(df_sentences_filtered.loc[i:end_i, "country_entities"], [])
    
    # Remove duplicated countries that are next to each other
    country_unique = [
        country_list[j] for j in range(len(country_list))
        if j == 0 or country_list[j] != country_list[j - 1]
    ]
    
    # If there are at least two distinct countries, create relationships
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"country1": a, "country2": b})

# Turn into DataFrame
relationship_df = pd.DataFrame(relationships)
print("Raw relationships (with direction):", len(relationship_df))
relationship_df.head()

Raw relationships (with direction): 802


Unnamed: 0,country1,country2
0,France,Austria
1,Austria,Russia
2,Russia,Germany
3,Germany,Russia
4,Russia,Germany


In [25]:
# Sort the cases so a->b and b->a are treated as the same
relationships_df = pd.DataFrame(
    np.sort(relationship_df.values, axis=1),
    columns=relationship_df.columns
)

# Add helper column for counting
relationships_df["count"] = 1

# Group by pair and sum
relationships_df = (
    relationships_df
    .groupby(["country1", "country2"], as_index=False)["count"]
    .sum()
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

print("Top 10 country relationships:")
print(relationships_df.head(10))

# Highest relationship count
top_value = relationships_df["count"].iloc[0]
top_pair = relationships_df.iloc[0][["country1", "country2"]].tolist()

print(f"\nHighest relationship count: {top_value}")
print("Top pair:", top_pair)

# Save to CSV for later use / visualization
relationships_df.to_csv("country_relationships_final.csv", index=False)
print("\nSaved to: country_relationships_final.csv")

Top 10 country relationships:
  country1     country2  count
0  Germany        Japan     33
1  Germany        Italy     32
2  Germany       Poland     28
3    Japan       Russia     22
4    Japan  Philippines     21
5    India     Pakistan     17
6   France      Germany     17
7   France       Poland     16
8    Egypt        Libya     16
9    Italy        Japan     16

Highest relationship count: 33
Top pair: ['Germany', 'Japan']

Saved to: country_relationships_final.csv
