# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 4.2 MB/s eta 0:00:03
     --- ------------------------------------ 1.0/12.8 MB 3.0 MB/s eta 0:00:04
     ------- -------------------------------- 2.4/12.8 MB 4.3 MB/s eta 0:00:03
     -------------- ------------------------- 4.7/12.8 MB 6.5 MB/s eta 0:00:02
     ------------------- -------------------- 6.3/12.8 MB 6.7 MB/s eta 0:00:01
     ---------------------- ----------------- 7.1/12.8 MB 6.4 MB/s eta 0:00:01
     -------------------------- ------------- 8.4/12.8 MB 6.3 MB/s eta 0:00:01
     ------------------------------ --------- 9.7/12.8 MB 6.2 MB/s eta 0:00:01
     ------------------------------------ --- 11.5/12.8 MB 6.4 MB/s eta 0:00:01
     -----------------------------------

In [3]:
# Load spacy English module
NER = spacy.load("en_core_web_sm")

# Load the twentieth-century text file

In [4]:
# Import txt file
with open('Key_Events_20th_Century.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

# Data Wrangling

In [5]:
# Clean text using re.sub
cleaned_text = re.sub(r'[^\w\s]', '', data.lower())

In [6]:
# Sentence tokenization
from nltk.tokenize import sent_tokenize
tokenized_sent = sent_tokenize(cleaned_text)
print(tokenized_sent)  # Inspect tokenized sentences

# Word tokenization
from nltk.tokenize import word_tokenize
tokenized_word = word_tokenize(cleaned_text)
print(tokenized_word)  # Inspect tokenized words



In [7]:
# Load country names from CSV
countries_data = pd.read_csv('countries_list_20th_century_1.5.csv')

In [8]:
countries_data.head()

Unnamed: 0.1,Unnamed: 0,country_name
0,1,Afghanistan
1,2,Albania
2,3,Algeria
3,4,Andorra
4,5,Angola


In [9]:
# Dropping 'Unnamed: 0' Column
countries_data = countries_data.drop(columns=['Unnamed: 0'])
countries_data.head()

Unnamed: 0,country_name
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola


In [10]:
# Clean up the country names column and convert it to a list
countries_list = countries_data['country_name'].str.strip().str.lower().tolist()

In [11]:
# Tokenize the text
from nltk.tokenize import word_tokenize
dist_words = word_tokenize(cleaned_text)

# Convert the tokenized list into a single string
listToStr = ' '.join([str(elem).lower() for elem in dist_words])

# Clean the tokenized words from unwanted characters and count occurrences
from collections import Counter
all_counts = Counter(re.sub(r'\W+', ' ', listToStr).split())

In [12]:
# Count mentions of each country
country_mentions = {country: all_counts.get(country, 0) for country in countries_list}

# Convert the dictionary to a DataFrame and sort by mentions
import pandas as pd
country_mentions_df = pd.DataFrame(list(country_mentions.items()), columns=['Country', 'Mentions']).sort_values(by='Mentions', ascending=False)

# Display the DataFrame
print(country_mentions_df)

          Country  Mentions
64        germany        37
85          japan        30
140        poland        13
60         france        13
82          italy        12
..            ...       ...
84        jamaica         0
86         jordan         0
87     kazakhstan         0
89       kiribati         0
207  transnistria         0

[208 rows x 2 columns]


In [13]:
output_file_path = 'Cleaned_Key_Events_20th_Century.txt'

# Save the cleaned text to a .txt file
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

print(f"Cleaned text has been saved to {output_file_path}")

Cleaned text has been saved to Cleaned_Key_Events_20th_Century.txt


In [14]:
# Import cleaned txt file
with open('Cleaned_Key_Events_20th_Century.txt', 'r', errors='ignore') as file:
    cleaned_data = file.read()

# Use the text file to create a NER object

In [15]:
events = NER(cleaned_data)

In [16]:
# Visualize identified entities
displacy.render(events[273:20000], style = "ent", jupyter = True)

# Split the sentence entities from the NER object

In [17]:
df_sentences = [] # empty shell to store results
# Loop through sentences, get entity list for each sentence
for sent in events.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [18]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(key, events, of, the, 20th, century, , wikip...","[the 20th century, 20th, the 20th century, com..."
1,"(great, depressionafter, world, war, i, the, g...","[the 1920s, 29, the end of the decade, million..."
2,"(the, help, they, delivered, to, poland, was, ...","[poland, negligible36, sixteen days later, pol..."
3,"(denmark, surrendered, after, only, six, hours...","[only six, hours4546, scandinavian, germany, f..."
4,"(lion58the, importance, of, the, battle, of, b...","[britain, first, secondly, britain, the soviet..."
5,"(but, the, second, was, not, used, and, a, tab...","[second, luck81, paris, german, joseph goebbel..."
6,"(the, surrender, was, accepted, by, general, d...","[douglas macarthur, millions, jews, world war ..."
7,"(warsaw, berlin, prague, vienna, budapest, bel...","[warsaw, berlin, prague, vienna, budapest, bel..."
8,"(the, creation, of, operating, systems, also, ...","[2, methods237238, thousands, zx80, pet239240i..."
9,"(reverse, engineered, the, bios, and, released...","[architecture242243, company244the 1980s, us, ..."


# Filter the entities so that you end up only with the ones from your countries list

In [19]:
# Function to filter out entities not of interest
def filter_entity(ent_list, character_df):
    return [ent for ent in ent_list 
            if ent in list(countries_data['country_name'])]

In [20]:
# Apply the filter function to the 'entities' column
df_sentences['filtered_entities'] = df_sentences['entities'].apply(lambda ents: filter_entity(ents, countries_data))

# Display the resulting DataFrame with filtered entities
print(df_sentences.head())

                                            sentence  \
0  (key, events, of, the, 20th, century,  , wikip...   
1  (great, depressionafter, world, war, i, the, g...   
2  (the, help, they, delivered, to, poland, was, ...   
3  (denmark, surrendered, after, only, six, hours...   
4  (lion58the, importance, of, the, battle, of, b...   

                                            entities filtered_entities  
0  [the 20th century, 20th, the 20th century, com...                []  
1  [the 1920s, 29, the end of the decade, million...                []  
2  [poland, negligible36, sixteen days later, pol...                []  
3  [only six, hours4546, scandinavian, germany, f...                []  
4  [britain, first, secondly, britain, the soviet...                []  


In [21]:
# Function to filter out entities not matching country names
def filter_country_entities(sent):
    return [ent.text for ent in sent.ents if ent.label_ == "GPE" and ent.text.lower() in countries_list]

# Apply the filter to extract only country entities
df_sentences['filtered_entities'] = df_sentences['sentence'].apply(lambda sent: filter_country_entities(sent))

# Display the resulting DataFrame with filtered country entities
print(df_sentences.head())

                                            sentence  \
0  (key, events, of, the, 20th, century,  , wikip...   
1  (great, depressionafter, world, war, i, the, g...   
2  (the, help, they, delivered, to, poland, was, ...   
3  (denmark, surrendered, after, only, six, hours...   
4  (lion58the, importance, of, the, battle, of, b...   

                                            entities  \
0  [the 20th century, 20th, the 20th century, com...   
1  [the 1920s, 29, the end of the decade, million...   
2  [poland, negligible36, sixteen days later, pol...   
3  [only six, hours4546, scandinavian, germany, f...   
4  [britain, first, secondly, britain, the soviet...   

                                   filtered_entities  
0  [panama, france, russia, germany, bulgaria, ru...  
1  [germany, italy, germany, germany, germany, ge...  
2  [poland, poland, poland, germany, estonia, lat...  
3  [germany, france, france, france, italy, germany]  
4  [albania, libya, egypt, libya, egypt, iraq, ja..

# Create the relationships dataframe

In [22]:
# Set the window size for the sliding window
window_size = 5

# Initialize an empty list to store relationships
relationships = []

# Loop through the sentences with a sliding window approach
for i in range(len(df_sentences) - window_size + 1):
    # Determine the end index of the current window
    end_i = i + window_size
    
    # Collect all country entities from the current window of sentences
    country_entities = sum(df_sentences.loc[i:end_i, 'filtered_entities'], [])
    
    # Remove consecutive duplicates in the country entities
    unique_countries = [country_entities[j] for j in range(len(country_entities)) if (j == 0) or (country_entities[j] != country_entities[j-1])]
    
    # Record relationships between consecutive unique countries
    if len(unique_countries) > 1:
        for idx, source in enumerate(unique_countries[:-1]):
            target = unique_countries[idx + 1]
            relationships.append({"source": source, "target": target})

# Convert the list of relationships into a DataFrame
relationships_df = pd.DataFrame(relationships)

# Display the relationships DataFrame
print(relationships_df.head())

     source    target
0    panama    france
1    france    russia
2    russia   germany
3   germany  bulgaria
4  bulgaria    russia


In [23]:
# Sort each row to ensure that 'source' and 'target' are in lexicographical order
relationships_df = pd.DataFrame(np.sort(relationships_df.values, axis=1), columns=relationships_df.columns)

# Display the first five rows of the sorted DataFrame
print(relationships_df.head())

     source   target
0    france   panama
1    france   russia
2   germany   russia
3  bulgaria  germany
4  bulgaria   russia


In [24]:
# Add a 'value' column to indicate each relationship occurrence
relationships_df["value"] = 1

# Group by 'source' and 'target' and sum the 'value' to count occurrences
relationships_df = relationships_df.groupby(["source", "target"], sort=False, as_index=False).sum()

# Display the summarized relationships DataFrame
print(relationships_df.head())

     source   target  value
0    france   panama      1
1    france   russia      7
2   germany   russia      9
3  bulgaria  germany      1
4  bulgaria   russia      1


# Save and export your dataframe

In [26]:
relationships_df.to_csv('Key_Events_20th_Century_relationship.csv')