# Install the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 660.6 kB/s eta 0:00:20
      --------------------------------------- 0.3/12.8 MB 2.9 MB/s eta 0:00:05
     --- ------------------------------------ 1.1/12.8 MB 8.5 MB/s eta 0:00:02
     ------ --------------------------------- 2.1/12.8 MB 12.4 MB/s eta 0:00:01
     ---------- ----------------------------- 3.5/12.8 MB 15.9 MB/s eta 0:00:01
     --------------- ------------------------ 4.9/12.8 MB 18.3 MB/s eta 0:00:01
     ------------------- -------------------- 6.3/12.8 MB 20.0 MB/s eta 0:00:01
     ------------------------ --------------- 7.8/12.8 MB 21.5 MB/s eta 0:00:01
     ----------------------------- ---------- 9.6/12.8 MB 23.5 MB/s eta 0:00:01
     -----------------------------

In [3]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

# Load the twentieth-century text file

In [4]:
with open('Key_Events_of_the_20th_Century_Wiki.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

# Use the text file to create a NER object

In [5]:
book = NER(data)

# Evaluate whether the text needs wrangling

In [6]:
# Extract Country Names:
country_entities = [ent.text for ent in book.ents if ent.label_ == "GPE"]

# Print the identified country names
print(set(country_entities))

{'USSR', 'Australia', 'Berlin', 'North', 'Austria-Hungary', 'Nanjing', 'Guadalcanal', 'Tobruk', 'Philippines', "the People's Republic of Bangladesh", 'Burma', 'Iran', 'Operation Overlord', 'France', 'the Ottoman Empire', 'Romania', 'Iraq', 'Nazi Germany', 'Stalingrad', 'Saigon', 'Neville', 'Vietnam', 'Saint Petersburg', 'Egypt', 'London', 'North Africa', 'Nuremberg', 'Papua New Guinea', 'Ghana', 'Stephen', 'Moscow', 'Lithuania', 'Finland', 'Eastern Poland', 'Koreas', 'Britain', 'the Solomon Islands', 'Italy', 'Warsaw', 'Munich', 'Poland', 'war.[151]Nuclear', 'Yugoslavia', 'Algeria', 'Austria', 'Persia', 'Sweden', 'Stalin', 'Hungary', "North Korea's", 'Thailand', 'Hiroshima', 'Solomon Islands', 'Roma', 'British Empire', 'United States', 'The Soviet State', 'Hawaii', 'East Berlin', 'New Orleans', 'Rome', 'Gravel', 'Wikipedia', 'India', 'Tokyo', 'Norway', 'Cuba', 'USA', 'Italian Social Republic', 'Cambodia', 'Bolsheviks', 'Hong Kong', 'Soviet Union', 'South Korea', 'Antonia', 'New states'

Based on my observations, the following changes will be made:

Replace 'USA', 'US', 'U.S.' in the text with 'The United States'.

Replace 'USSR' with 'Russia'.

Add 'Soviet Union' to the list of countries.

In [7]:
# Create a manual corrections dictionary
manual_corrections = {
    "USA": "United States",
    "US": "United States",
    "U.S.": "United States",
    "U.S.S.R.": "Russia",
  
}

In [8]:
corrected_data = data
for ent in book.ents:
    if ent.label_ == "GPE":
        if ent.text in manual_corrections:
            corrected_data = corrected_data.replace(ent.text, manual_corrections[ent.text])


In [9]:
corrected_book = NER(corrected_data)

In [10]:
# Extract Country Names:
country_entities = [ent.text for ent in corrected_book.ents if ent.label_ == "GPE"]

# Print the identified country names
print(set(country_entities))

{'North', 'Australia', 'Berlin', 'Austria-Hungary', 'Nanjing', 'Guadalcanal', 'Tobruk', 'Philippines', "the People's Republic of Bangladesh", 'Burma', 'Iran', 'Operation Overlord', 'France', 'the Ottoman Empire', 'Romania', 'Iraq', 'Nazi Germany', 'Stalingrad', 'Saigon', 'Neville', 'Vietnam', 'Saint Petersburg', 'Egypt', 'London', 'North Africa', 'Nuremberg', 'Papua New Guinea', 'Ghana', 'Stephen', 'Moscow', 'Lithuania', 'Finland', 'Eastern Poland', 'Koreas', 'Britain', 'the Solomon Islands', 'Italy', 'Warsaw', 'Munich', 'Poland', 'war.[151]Nuclear', 'Yugoslavia', 'Algeria', 'Austria', 'States', 'Persia', 'Sweden', 'Stalin', 'Hungary', "North Korea's", 'Thailand', 'Hiroshima', 'Solomon Islands', 'Roma', 'United States', 'British Empire', 'The Soviet State', 'Hawaii', 'East Berlin', 'New Orleans', 'Rome', 'Gravel', 'Wikipedia', 'India', 'Tokyo', 'Norway', 'Cuba', 'Italian Social Republic', 'Cambodia', 'Bolsheviks', 'Hong Kong', 'Soviet Union', 'South Korea', 'Antonia', 'New states', 'Ya

# Split the sentence entities from the NER object.

In [11]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in corrected_book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [12]:
df_sentences.head()

Unnamed: 0,sentence,entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, Navigation\..."
1,"(the, wars1.2.1Economic)",[]
2,"(depression1.2.2The, rise, of, dictatorship1.3...",[]
3,"(World, War, II, (, 1939â€“1945)1.3.1The, war,...",[World War II]
4,"(days1.3.7The, war, in, the, Pacific1.3.7.1Bac...","[Pacific1.3.7.1Background1.3.8Japanese, Holoca..."


# Filter the entities so that you end up only with the ones from your countries list.

In [13]:
# Import country name

country_df = pd.read_csv("Country_name.csv", index_col = 0)

In [14]:
country_df.head()

Unnamed: 0,Country
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola


In [15]:
# Function to filter out entities not of interest

def filter_entity(ent_list, country_df):
    return [ent for ent in ent_list 
            if ent in list(country_df['Country'])]

In [16]:
# Check

filter_entity(["Afghanistan", "CF", "2"], country_df)

['Afghanistan']

In [17]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, country_df))

In [18]:
df_sentences['country_entities'].head(50)

0                              []
1                              []
2                              []
3                              []
4                              []
5                              []
6                              []
7                              []
8                              []
9                              []
10                             []
11                             []
12                             []
13                             []
14                             []
15                             []
16                             []
17               [France, Russia]
18    [Germany, Bulgaria, Russia]
19              [Germany, Russia]
20                      [Germany]
21                             []
22                      [Germany]
23                             []
24                             []
25                             []
26                             []
27                             []
28                      [Germany]
29            

In [19]:
# Filter out sentences that don't have any character entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [20]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
843,"("", The, forgotten, violence, that, helped, In...",[India],[India]
846,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, Partition, India, Pa...","[India, Pakistan]"
852,"(Retrieved, 2018, -, 12, -, 18.^, "", The, Phil...","[2018-12-18.^, Philippines, 1898â€“1946, Unite...","[Philippines, United States]"
900,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
909,"(Now, ,, North, Korea, may, be, the, one, true...","[North Korea, one]",[North Korea]
944,"("", Selling, "", Operation, Passage, to, Freedo...","[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
967,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Vietnam, the Battle of the Paris Peace Table,...",[Vietnam]
1142,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American, the Middle East, Lebanon]",[Lebanon]
1145,"(The, Rise, of, China, and, India, :, A, New, ...","[China, India, New Asian Drama]","[China, India]"
1146,"(Singapore, :, World, Scientific, .)","[Singapore, World Scientific]",[Singapore]


# Create the relationships dataframe.

In [22]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [23]:
relationship_df = pd.DataFrame(relationships)

In [24]:
relationship_df

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Russia,Germany
3,Germany,Bulgaria
4,Bulgaria,Russia
...,...,...
743,India,Singapore
744,China,India
745,India,Singapore
746,China,India


In [26]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [27]:
relationship_df

Unnamed: 0,source,target,value
0,France,Russia,12
1,Germany,Russia,26
2,Bulgaria,Germany,6
3,Bulgaria,Russia,6
4,Germany,Italy,26
...,...,...,...
89,Germany,United States,1
90,Philippines,United States,6
91,Romania,Russia,6
92,China,Lebanon,3


# Save and export your dataframe. 

In [28]:
relationship_df.to_csv('country_relationship.csv')