## Importing Libraries

In [1]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
import spacy

# Load the language model (corrected)
NER = spacy.load("en_core_web_sm")

# Specify the file path (replace with your actual file path)
file_path = "key_events_20th_century.txt"

# Read the text from the file
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Process the text with spaCy
doc = NER(text)

# Print named entities
for ent in doc.ents:
    print(ent.text, ent.label_)


The 20th century DATE
The World Wars ORG
the Cold War EVENT
the Space Race ORG
the World Wide Web EVENT
the 21st century DATE
today DATE
the 20th century DATE
the beginning of the century DATE
the 20th century DATE
The 1900s DATE
the decade DATE
1914 CARDINAL
the Panama Canal FAC
1914 to 1918 DATE
the First World War EVENT
World War I EVENT
1914–1918 CARDINAL
World War I Arrest EVENT
Sarajevo GPE
Archduke Franz Ferdinand PERSON
WWI ORG
The Great War WORK_OF_ART
July 1914 DATE
November 1918 DATE
Sarajevo GPE
the Austro-Hungarian Empire 's FAC
Erzherzog Franz Ferdinand PERSON
Gavrilo Princip PERSON
Young Bosnia FAC
the July Crisis EVENT
the end of July 1914 DATE
the British Empire GPE
France GPE
the Russian Empire GPE
the Central Powers ORG
the German Empire GPE
Austria GPE
Hungary GPE
1 CARDINAL
2 CARDINAL
1917 DATE
Russia GPE
the Central Powers ORG
Tsar GPE
Bolsheviks GPE
Germany GPE
Russia GPE
Bolshevik Russia ORG
Baltic LOC
Germany GPE
Kars Oblast LOC
the South Caucasus LOC
the Ottom

In [3]:
# Load the text

with open('key_events_20th_century.txt', 'r', errors='ignore') as file: 
   text = file.read().replace( '\n', ' ')

doc = NER(text)

In [4]:
# Visualize identified entities

displacy.render(doc[273:20000], style = "ent", jupyter = True)

## Comments on question 4

The text requires a bit of wrangling for the Countries appearing by their first names, making it easy to compare with the names on the scraped Countries list. By all standards, the names of countries are the same in both documents.
I have also observed that the countries have been categorized as "GPE", years as "DATE", and names of persons as "PERSON" making it easy to split the sentence entities.

## creating a dataframe from a list of dictionaries

In [5]:
df_sentences = []

# Loop through sentences, get entity list for each sentence
for sent in doc.sents:
       entity_list = [ent.text for ent in sent.ents]
       df_sentences.append({"sentence": sent, "entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)

In [6]:
df_sentences.head(20)

Unnamed: 0,sentence,entities
0,"(The, 20th, century, changed, the, world, in, ...",[The 20th century]
1,"(The, World, Wars, sparked, tension, between, ...","[The World Wars, the Cold War, the Space Race,..."
2,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
3,"(Historic, events, in, the, 20th, century, [, ...","[the 20th century, the beginning of the centur..."
4,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
5,"(1914, saw, the, completion, of, the, Panama, ...","[1914, the Panama Canal]"
6,"(From, 1914, to, 1918, ,, the, First, World, W...","[1914 to 1918, the First World War]"
7,"("", The, war, to, end, all, wars, "", :, World,...","[World War I, World War I Arrest, Sarajevo, Ar..."
8,"(The, war, was, precipitated, by, the, Assassi...","[Sarajevo, the Austro-Hungarian Empire 's, Erz..."
9,"(After, a, period, of, diplomatic, and, milita...","[the July Crisis, the end of July 1914, the Br..."


## Filtering Data Using the Countries

In [7]:
# Import txt file and read the countries
with open('Countries_List.txt', 'r', errors='ignore') as file: 
    # Read all lines and strip any extra whitespace
    data = file.read().strip()

# Split the data into a list by newlines
countries_list = data.split('\n')

# Optionally, you can strip leading/trailing spaces from each country
countries_list = [country.strip() for country in countries_list]

# Print the resulting list
print(countries_list)


['country_name', '1\t Afghanistan', '2\t  Albania', '3\t  Algeria', '4\t  Andorra', '5\t  Angola', '6\t  Antigua and Barbuda', '7\t  Argentina', '8\t  Armenia', '9\t  Australia', '10\t  Austria', '11\t  Azerbaijan', '12\t" Bahamas, The "', '13\t  Bahrain', '14\t  Bangladesh', '15\t  Barbados', '16\t  Belarus', '17\t  Belgium', '18\t  Belize', '19\t  Benin', '20\t  Bhutan', '21\t  Bolivia', '22\t  Bosnia and Herzegovina', '23\t  Botswana', '24\t  Brazil', '25\t  Brunei', '26\t  Bulgaria', '27\t  Burkina Faso', '28\t  Burundi', '29\t Cambodia', '30\t  Cameroon', '31\t  Canada', '32\t  Cape Verde', '33\t  Central African Republic', '34\t  Chad', '35\t  Chile', '36\t"  China, People\'s Republic of "', '37\t  Colombia', '38\t  Comoros', '39\t"  Congo, Democratic Republic of the "', '40\t"  Congo, Republic of the "', '41\t  Costa Rica', '42\t  Croatia', '43\t  Cuba', '44\t  Cyprus', '45\t  Czech Republic', '46\t  Denmark', '47\t   Djibouti', '48\t   Dominica', '49\t   Dominican Republic', '5

In [8]:
# Your original data (list of country names with indices)
country_list_raw = [
    '1\t Afghanistan', '2\t  Albania', '3\t  Algeria', '4\t  Andorra', '5\t  Angola', 
    '6\t  Antigua and Barbuda', '7\t  Argentina', '8\t  Armenia', '9\t  Australia', 
    '10\t  Austria', '11\t  Azerbaijan', '12\t" Bahamas, The "', '13\t  Bahrain', 
    '14\t  Bangladesh', '15\t  Barbados', '16\t  Belarus', '17\t  Belgium', '18\t  Belize', 
    '19\t  Benin', '20\t  Bhutan', '21\t  Bolivia', '22\t  Bosnia and Herzegovina', 
    '23\t  Botswana', '24\t  Brazil', '25\t  Brunei', '26\t  Bulgaria', '27\t  Burkina Faso', 
    '28\t  Burundi', '29\t Cambodia', '30\t  Cameroon', '31\t  Canada', '32\t  Cape Verde', 
    '33\t  Central African Republic', '34\t  Chad', '35\t  Chile', '36\t"  China, People\'s Republic of "', 
    '37\t  Colombia', '38\t  Comoros', '39\t"  Congo, Democratic Republic of the "', 
    '40\t"  Congo, Republic of the "', '41\t  Costa Rica', '42\t  Croatia', '43\t  Cuba', 
    '44\t  Cyprus', '45\t  Czech Republic', '46\t  Denmark', '47\t   Djibouti', '48\t   Dominica', 
    '49\t   Dominican Republic', '50\t  East Timor', '51\t   Ecuador', '52\t   Egypt', 
    '53\t   El Salvador', '54\t   Equatorial Guinea', '55\t   Eritrea', '56\t   Estonia', 
    '57\t   Eswatini', '58\t   Ethiopia', '59\t  Fiji', '60\t   Finland', '61\t   France', 
    '62\t  Gabon', '63\t"   Gambia, The "', '64\t   Georgia', '65\t   Germany', '66\t   Ghana', 
    '67\t   Greece', '68\t   Grenada', '69\t   Guatemala', '70\t   Guinea', '71\t   Guinea', 
    '72\tBissau', '73\t   Guyana', '74\t  Haiti', '75\t   Honduras', '76\t   Hungary', 
    '77\t  Iceland', '78\t   India', '79\t   Indonesia', '80\t   Iran', '81\t   Iraq', '82\t   Ireland', 
    '83\t   Israel', '84\t   Italy', '85\t   Ivory Coast', '86\t  Jamaica', '87\t   Japan', 
    '88\t   Jordan', '89\t  Kazakhstan', '90\t   Kenya', '91\t   Kiribati', '92\t"   Korea, North "', 
    '93\t"   Korea, South "', '94\t   Kuwait', '95\t   Kyrgyzstan', '96\t  Laos', '97\t   Latvia', 
    '98\t   Lebanon', '99\t   Lesotho', '100\t   Liberia', '101\t   Libya', '102\t   Liechtenstein', 
    '103\t   Lithuania', '104\t   Luxembourg', '105\t  Madagascar', '106\t   Malawi', '107\t   Malaysia', 
    '108\t   Maldives', '109\t   Mali', '110\t   Malta', '111\t   Marshall Islands', '112\t   Mauritania', 
    '113\t   Mauritius', '114\t   Mexico', '115\t"   Micronesia, Federated States of "', 
    '116\t   Moldova', '117\t   Monaco', '118\t   Mongolia', '119\t   Montenegro', '120\t   Morocco', 
    '121\t   Mozambique', '122\t   Myanmar', '123\t  Namibia', '124\t   Nauru', '125\t     Nepal', 
    '126\t   Netherlands', '127\t   New Zealand', '128\t   Nicaragua', '129\t   Niger', '130\t   Nigeria', 
    '131\t   North Macedonia', '132\t   Norway', '133\t  Oman', '134\t  Pakistan', '135\t   Palau', 
    '136\t   Palestine', '137\t   Panama', '138\t   Papua New Guinea', '139\t   Paraguay', 
    '140\t   Peru', '141\t   Philippines', '142\t   Poland', '143\t   Portugal', '144\t  Qatar', 
    '145\t  Romania', '146\t   Russia', '147\t   Rwanda', '148\t  Saint Kitts and Nevis', 
    '149\t   Saint Lucia', '150\t   Saint Vincent and the Grenadines', '151\t   Samoa', '152\t   San Marino', 
    '153\t   São Tomé and Príncipe', '154\t   Saudi Arabia', '155\t   Senegal', '156\t   Serbia', 
    '157\t   Seychelles', '158\t   Sierra Leone', '159\t   Singapore', '160\t   Slovakia', 
    '161\t   Slovenia', '162\t   Solomon Islands', '163\t   Somalia', '164\t   South Africa', 
    '165\t   South Sudan', '166\t   Spain', '167\t   Sri Lanka', '168\t   Sudan', '169\t   Suriname', 
    '170\t   Sweden', '171\t    Switzerland', '172\t   Syria', '173\t  Tajikistan', '174\t   Tanzania', 
    '175\t   Thailand', '176\t   Togo', '177\t   Tonga', '178\t   Trinidad and Tobago', '179\t   Tunisia', 
    '180\t   Turkey', '181\t   Turkmenistan', '182\t   Tuvalu', '183\t  Uganda', '184\t   Ukraine', 
    '185\t   United Arab Emirates', '186\t   United Kingdom', '187\t   United States', '188\t   Uruguay', 
    '189\t   Uzbekistan', '190\t  Vanuatu', '191\t    Vatican City (Holy See)', '192\t   Venezuela', 
    '193\t   Vietnam', '194\t  Yemen', '195\t  Zambia', '196\t   Zimbabwe', '197\t  Abkhazia', 
    '198\t   Artsakh', '199\t   Cook Islands', "200\t   Donetsk People's Republic", '201\t   Kosovo', 
    "202\t   Luhansk People's Republic", '203\t   Niue', '204\t   Northern Cyprus', 
    '205\t   Sahrawi Arab Democratic Republic', '206\t   Somaliland', '207\t   South Ossetia', 
    '208\t   Taiwan', '209\t   Transnistria'
]

# Clean the data: Separate the number and country name
country_dict = {}
for entry in country_list_raw:
    parts = entry.split('\t')
    number = parts[0].strip()  # Country number (e.g., '1')
    country_name = parts[1].strip().replace('"', '').replace('“', '').replace('”', '')  # Remove extra quotes and spaces
    country_dict[number] = country_name

# Convert the dictionary to a pandas DataFrame
df = pd.DataFrame(list(country_dict.items()), columns=["Index", "Country"])

# Display the entire DataFrame (list of 208 countries)
pd.set_option('display.max_rows', 208)  # Ensure that all rows are displayed
print(df)

    Index                           Country
0       1                       Afghanistan
1       2                           Albania
2       3                           Algeria
3       4                           Andorra
4       5                            Angola
..    ...                               ...
204   205  Sahrawi Arab Democratic Republic
205   206                        Somaliland
206   207                     South Ossetia
207   208                            Taiwan
208   209                      Transnistria

[209 rows x 2 columns]


In [9]:
# Create a DataFrame with 'COUNTRIES' column
chars = pd.DataFrame(countries_list, columns=["Country"])

# Remove numbers and tabs from country names using regular expression
chars['Country'] = chars['Country'].apply(lambda x: re.sub(r'^\d+\t|\t', '', x).strip())


# Apply the operation to create a new column 'COUNTRIES_alias'
# If the country has a space, keep it as is (2 words). Otherwise, just return the name.
chars['Country_ALIAS'] = chars['Country'].apply(lambda x: x if ' ' not in x else x)

# Display the DataFrame
print(chars)

                              Country                     Country_ALIAS
0                        country_name                      country_name
1                         Afghanistan                       Afghanistan
2                             Albania                           Albania
3                             Algeria                           Algeria
4                             Andorra                           Andorra
..                                ...                               ...
205  Sahrawi Arab Democratic Republic  Sahrawi Arab Democratic Republic
206                        Somaliland                        Somaliland
207                     South Ossetia                     South Ossetia
208                            Taiwan                            Taiwan
209                      Transnistria                      Transnistria

[210 rows x 2 columns]


In [10]:
# Function to filter out entities not of interest

def filter_entity(ent_list, chars):
       return [ent for ent in ent_list
                  if ent in list(chars['Country_ALIAS'])]

In [11]:
df_sentences['Country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, chars))

In [12]:
# Filter out sentences that don’t have any character entities

df_sentences_filtered = df_sentences[df_sentences['Country_entities'].map(len) > 0]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,Country_entities
1220,"("", The, forgotten, violence, that, helped, In...",[India],[India]
1225,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, Partition between In...",[Pakistan]
1234,"(^, "", The, Philippines, ,, 1898â€“1946, |, US...","[Philippines, 1898â€“1946, US House of Represe...",[Philippines]
1265,"("", Colonial, Cartographies, ,, Postcolonial, ...","[Colonial Cartographies, Postcolonial Borders,...",[Afghanistan]
1302,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
1364,"("", Selling, "", Operation, Passage, to, Freedo...","[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
1395,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Stuck in Endless Preliminaries, Vietnam, the ...",[Vietnam]
1661,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American Behavior, the Middle East, a Fi...",[Lebanon]
1667,"(The, Rise, of, China, and, India, :, A, New, ...","[India, New Asian]",[India]
1668,"(Singapore, :, World, Scientific, .)","[Singapore, World Scientific]",[Singapore]


In [13]:
# Make a copy of the dataframe to avoid the SettingWithCopyWarning
df_sentences_filtered = df_sentences_filtered.copy()

# Now, apply the transformation safely using .loc[]
df_sentences_filtered.loc[:, 'Country_entities'] = df_sentences_filtered['Country_entities'].apply(
    lambda x: [item.split()[0] for item in x]
)

# Print the updated DataFrame to verify
df_sentences_filtered.tail(10)  # Print the first few rows of the DataFrame

Unnamed: 0,sentence,entities,Country_entities
1220,"("", The, forgotten, violence, that, helped, In...",[India],[India]
1225,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, Partition between In...",[Pakistan]
1234,"(^, "", The, Philippines, ,, 1898â€“1946, |, US...","[Philippines, 1898â€“1946, US House of Represe...",[Philippines]
1265,"("", Colonial, Cartographies, ,, Postcolonial, ...","[Colonial Cartographies, Postcolonial Borders,...",[Afghanistan]
1302,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
1364,"("", Selling, "", Operation, Passage, to, Freedo...","[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
1395,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Stuck in Endless Preliminaries, Vietnam, the ...",[Vietnam]
1661,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American Behavior, the Middle East, a Fi...",[Lebanon]
1667,"(The, Rise, of, China, and, India, :, A, New, ...","[India, New Asian]",[India]
1668,"(Singapore, :, World, Scientific, .)","[Singapore, World Scientific]",[Singapore]


## Creating Relationships

In [14]:
# Initialize the relationships list
relationships = []

# Loop through the DataFrame, considering each sentence and the next ones within the window size (5 sentences)
for i in range(len(df_sentences_filtered)):  # Loop through the DataFrame rows
    # Define the window's end index (i + 5, but it should not exceed the last index of the DataFrame)
    end_i = min(i + 5, len(df_sentences_filtered))

    # Extract the list of countries in the current window (from sentence i to end_i)
    Country_list = sum(df_sentences_filtered.loc[i:end_i, 'Country_entities'], [])

    # Remove consecutive duplicates from the list
    Country_unique = [Country_list[i] for i in range(len(Country_list)) 
                      if i == 0 or Country_list[i] != Country_list[i - 1]]

    # If there are at least two unique countries, create relationships between them
    if len(Country_unique) > 1:
        # Loop through the unique countries and create relationships between consecutive countries
        for idx in range(len(Country_unique) - 1):
            a = Country_unique[idx]
            b = Country_unique[idx + 1]
            relationships.append({"Source": a, "Target": b})

# Convert relationships to a DataFrame
relationships_df = pd.DataFrame(relationships)

# Display the first few rows
relationships_df.head(10)


Unnamed: 0,Source,Target
0,France,Austria
1,Austria,Hungary
2,France,Austria
3,Austria,Hungary
4,Hungary,Russia
5,France,Austria
6,Austria,Hungary
7,Hungary,Russia
8,Russia,Germany
9,Germany,Russia


In [15]:
relationship_df = pd.DataFrame(relationships)

relationship_df

Unnamed: 0,Source,Target
0,France,Austria
1,Austria,Hungary
2,France,Austria
3,Austria,Hungary
4,Hungary,Russia
...,...,...
226,Egypt,Iraq
227,Iraq,Iran
228,Egypt,Iraq
229,Iraq,Iran


In [16]:
# Sort the cases with a- >b and b- >a
relationships_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationships_df.head(5)

Unnamed: 0,Source,Target
0,Austria,France
1,Austria,Hungary
2,Austria,France
3,Austria,Hungary
4,Hungary,Russia


In [17]:
# Assuming your DataFrame has columns 'Source' and 'Target'
relationships_df["Value"] = 1  # Add a "value" column to count occurrences

# Group by 'Source' and 'Target', summing the 'value' column to count relationships
relationships_df = relationships_df.groupby(["Source", "Target"], sort=False, as_index=False).sum()

# Display the top 10 rows of the resulting DataFrame
relationships_df.head(10)


Unnamed: 0,Source,Target,Value
0,Austria,France,6
1,Austria,Hungary,6
2,Hungary,Russia,5
3,Germany,Russia,16
4,Germany,Ukraine,10
5,Austria,Germany,12
6,Germany,Italy,9
7,France,Poland,11
8,France,Germany,11
9,Germany,Poland,17


In [18]:
## Exporting to CSV

In [19]:
# Export the relationships DataFrame to a CSV file
relationships_df.to_csv('relationships.csv', index=False)

# Optional: Display a message confirming that the file was saved
print("relationships_df has been saved to 'relationships.csv'")


relationships_df has been saved to 'relationships.csv'
