# Exercise 1.6 NER Object

## 01. Imports/Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 6.7 MB/s eta 0:00:02
     ------ --------------------------------- 2.1/12.8 MB 7.3 MB/s eta 0:00:02
     ------ --------------------------------- 2.1/12.8 MB 7.3 MB/s eta 0:00:02
     --------- ------------------------------ 2.9/12.8 MB 3.9 MB/s eta 0:00:03
     ------------- -------------------------- 4.5/12.8 MB 4.5 MB/s eta 0:00:02
     ------------------ --------------------- 6.0/12.8 MB 5.0 MB/s eta 0:00:02
     --------------------- ------------------ 6.8/12.8 MB 4.8 MB/s eta 0:00:02
     ------------------------- -------------- 8.1/12.8 MB 5.1 MB/s eta 0:00:01
     ------------------------------ --------- 9.7/12.8 MB 5.4 MB/s eta 0:00:01
     --------------------------------- --

In [3]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

## 02. Load 20th Century Scraped Text

In [4]:
# File pathway
file_pathway = r"C:\Users\Chase\20th-century\Data Visualizations with Python\Achievement 1\02_Data\Original_Data"

In [5]:
# Import 20th century Web Scrape Text Document
df_20th_cent = file_pathway + r"\20th_century_events_web_scraped.txt"

In [6]:
# Load the text

with open(df_20th_cent, "r", encoding="utf-8", errors="ignore") as file:
    text = file.read()

#### Inspection of the Web Scrape Text
The first thing I did was continue running the code from the reading example to see how far it would get with the raw text, even though it was obvious right away that the scrape was messy. Once I opened the twentieth‑century text file, it was clear the text wasn’t clean enough to feed straight into spaCy. There were a lot of issues that would break sentence segmentation and entity extraction. From navigation text from Wikipedia, to random formatting artifacts, weird line breaks, stray punctuation, and section numbers. All of this needed to be cleaned up before spacy could work. I used Copilot to help guide me since I was unfamiliar with wrangling full sentences and wasn't sure where to start. Bellow are the steps taken to wrangle the web scrape.

## 03. Wrangling Functions

In [7]:
def clean_text(raw):
    text = raw
    text = remove_navigation(text)          
    text = normalize_whitespace(text)
    text = fix_line_breaks(text)
    text = remove_section_numbers(text)
    text = remove_formatting_artifacts(text)
    return text

#### 1. Normalize whitespace

In [8]:
def normalize_whitespace(text):
    text = text.replace("\t", " ")
    text = re.sub(r" {2,}", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    return text.strip()

#### 2. Fix line breaks

In [9]:
def fix_line_breaks(text):
    lines = text.split("\n")
    fixed_lines = []

    for i, line in enumerate(lines):
        if i == len(lines) - 1:
            fixed_lines.append(line)
            break

        next_line = lines[i + 1]

        if (not re.search(r"[.!?]$", line.strip())) and next_line[:1].islower():
            fixed_lines.append(line + " " + next_line)
            lines[i + 1] = ""
        else:
            fixed_lines.append(line)

    fixed_text = "\n".join([l for l in fixed_lines if l.strip() != ""])
    return fixed_text

#### 3. Remove section numbers

In [10]:
def remove_section_numbers(text):
    cleaned = re.sub(r"\b\d+(\.\d+)+\b", "", text)
    cleaned = re.sub(r" {2,}", " ", cleaned)
    return cleaned.strip()

#### 4. Remove formatting artifacts

In [11]:
def remove_formatting_artifacts(text):
    text = re.sub(r"\(\s*\)", "", text)
    text = re.sub(r"\(\s*,\s*\)", "", text)
    text = re.sub(r"\(\s*", "", text)
    text = re.sub(r"\s*\)", "", text)
    text = re.sub(r"==+[^=]+==+", "", text)
    text = re.sub(r"-{2,}", " ", text)
    text = re.sub(r"•", " ", text)
    text = re.sub(r",\s*,", ", ", text)
    text = re.sub(r"\s+,", ",", text)
    text = text.replace("–", "-").replace("—", "-")
    text = re.sub(r" {2,}", " ", text)
    return text.strip()

#### 5. Remove navigation junk

In [12]:
def remove_navigation(text):
    patterns = [
        r"Search Search",
        r"WikipediaContact us",
        r"Read Edit View history",
        r"Tools",
        r"Languages",
        r"Contents",
        r"Main page",
        r"Help",
        r"Log in",
        r"Create account",
        r"Page information",
        r"Related changes",
        r"Permanent link",
        r"Printable version",
        r"In other projects",
        r"Wikidata item",
        r"Cite this page"
    ]

    for p in patterns:
        text = re.sub(p, "", text, flags=re.IGNORECASE)

    return text

## 04. Apply Wrangling + Run spaCy

In [13]:
cleaned_text = clean_text(text)
doc = NER(cleaned_text)

In [28]:
# Save cleaned text
output_path = r"C:\Users\Chase\20th-century\Data Visualizations with Python\Achievement 1\02_Data\Prepared_Data\20th_century_cleaned.txt"

with open(output_path, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

## 05. Visualize Entities

In [14]:
displacy.render(doc[273:20000], style="ent", jupyter=True)

## 06. Get named entity list per sentence

In [15]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [16]:
# Work check
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - Wikipedia\nJump, Navigatio..."
1,"(The, war, in, Europe, \n\n, Blitzkrieg, \n\n,...","[Europe, days, the Pacific\n\nBackground, days..."
2,"(The, World, Wars, sparked, tension, between, ...","[The World Wars, the Cold War, the Space Race,..."
3,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
4,"(Historic, events, in, the, 20th, century[edit...","[Historic, 20th, 1914]"
5,"(The, new, beginning, of, the, 20th, century, ...",[the 20th century]
6,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
7,"(1914, saw, the, completion, of, the, Panama, ...","[1914, the Panama Canal]"
8,"(The, Scramble, for, Africa, continued, in, th...","[Scramble, Africa, the 1900s]"
9,"(The, atrocities, in, the, Congo, Free, State,...",[the Congo Free State]


In [17]:
print(cleaned_text[:500])

Key events of the 20th century - Wikipedia
Jump to content
Main menu
Main menu move to sidebar
 hide
Navigation
Current eventsRandom articleAbout
Contribute
Learn to editCommunity portalRecent changesUpload fileSpecial pages
Search
Search
Appearance
Donate
Personal
Donate move to sidebar
 hide
Top
1
Historic events in the 20th century
Toggle Historic events in the 20th century subsection

World at the beginning of the century

"The war to end all wars": World War I 1914-1918

Spanish flu

Russia


## 07. List of Countries

In [18]:
# Import country list
df_countries = pd.read_csv(
    r"C:\Users\Chase\20th-century\Data Visualizations with Python\Achievement 1\02_Data\Original_Data\countries_list_20th_century_1.5.csv",
    header=None,
    dtype=str,
    skiprows=1,                  # skip the header row that's in row 0
    names=["id", "country_name"] # set proper column names
)

print(df_countries.head())
print(df_countries.shape)


  id   country_name
0  1   Afghanistan 
1  2       Albania 
2  3       Algeria 
3  4       Andorra 
4  5        Angola 
(209, 2)


In [19]:
# Clean country names (strip spaces, lowercase)
df_countries["country_name"] = df_countries["country_name"].str.strip().str.lower()

In [20]:
# Create the actual list you'll use for filtering
country_list = df_countries["country_name"].tolist()

In [21]:
df_countries.head()

Unnamed: 0,id,country_name
0,1,afghanistan
1,2,albania
2,3,algeria
3,4,andorra
4,5,angola


## 08. Filtering Entities

In [22]:
# Function to filter out entities not of interest

def filter_country(ent_list, country_list):
    return [ent for ent in ent_list if ent.lower() in country_list]

In [23]:
# Work check, used Copolit's example to check.
filter_country(["Germany", "banana", "United States"], country_list)

['Germany', 'United States']

In [24]:
df_sentences['country_entities'] = df_sentences['entities'].apply(
    lambda x: filter_country(x, country_list)
)

In [25]:
df_sentences['country_entities'].head(20)

0                            []
1                            []
2                            []
3                            []
4                            []
5                            []
6                            []
7                            []
8                            []
9                            []
10                           []
11                           []
12                           []
13    [France, Austria, Russia]
14            [Germany, Russia]
15                    [Germany]
16                    [Germany]
17                           []
18                           []
19                    [Austria]
Name: country_entities, dtype: object

In [26]:
all_entities = set([ent.text for ent in doc.ents])
all_entities

{'Wylie',
 'a few hundred yards',
 'Air',
 'Linge',
 '1929',
 'Moldovans',
 '1968',
 'Online-Ausg',
 'several crucial weeks',
 'Hungary',
 'the Pacific Islands',
 'Siegelbaum',
 '19 February 1942',
 '4-Year',
 'The Nazi Party',
 'The 1980s',
 'BIOS',
 'October 2011',
 'Gorbachev',
 'Kamikaze',
 "New Year's Day 2001",
 'U.S. Department of Energy Office of Scientific and Technical Information',
 'Wikipedia®',
 '18008258',
 'Harper Collins',
 '111',
 '24 July 2016',
 '652',
 '28',
 'Armistice',
 'War Crimes and Genocide',
 'Tai-Wei',
 '9 September',
 'the Warsaw Pact',
 'Jomo Kenyatta',
 '1940:',
 '1937 to 1942',
 '18 March 2023',
 'Suk-Jung',
 'Historic UK',
 'New York',
 '12 May 2017',
 'Wikidata',
 'Skylab',
 'Heinz',
 'Atomic Heritage Foundation',
 'Stuck in Endless Preliminaries',
 'over 6,800',
 '25 August 2011',
 'the 20th Century',
 'August 2018',
 'Marines',
 '605',
 '55',
 'Tsarist',
 'USSR Launches Sputnik',
 'anti-American',
 'December 2007',
 'Smithsonian',
 'Koreas',
 '21 Se

In [27]:
[e for e in all_entities if e.lower() in country_list]

['Hungary',
 'Poland',
 'Germany',
 'France',
 'Ghana',
 'Luxembourg',
 'Belgium',
 'Bulgaria',
 'Philippines',
 'Afghanistan',
 'Austria',
 'Israel',
 'Albania',
 'South Africa',
 'Kenya',
 'Pakistan',
 'Lithuania',
 'Seychelles',
 'Finland',
 'Vietnam',
 'Latvia',
 'Australia',
 'Belarus',
 'United States',
 'Singapore',
 'Sweden',
 'India',
 'Libya',
 'Russia',
 'Solomon Islands',
 'Estonia',
 'Lebanon',
 'Iran',
 'Mongolia',
 'Morocco',
 'Netherlands',
 'Denmark',
 'Greece',
 'Japan',
 'Romania',
 'Iraq',
 'Algeria',
 'Ukraine',
 'Laos',
 'Canada',
 'Papua New Guinea',
 'Cuba',
 'Norway',
 'Spain',
 'Thailand',
 'Cambodia',
 'Italy',
 'Rwanda',
 'Egypt']

## 09. Create relationships

In [29]:
# Create the relationships dataframe

relationships = []

for countries in df_sentences['country_entities']:
    # Only consider sentences with 2 or more countries
    if len(countries) > 1:
        # Create all pairwise combinations
        for i in range(len(countries) - 1):
            for j in range(i + 1, len(countries)):
                relationships.append({
                    "source": countries[i],
                    "target": countries[j]
                })

In [30]:
# Convert to dataframe
relationship_df = pd.DataFrame(relationships)

In [31]:
relationship_df.head()

Unnamed: 0,source,target
0,France,Austria
1,France,Russia
2,Austria,Russia
3,Germany,Russia
4,Germany,Italy


In [32]:
#Sort pairs so A–B and B–A match
relationship_df = pd.DataFrame(
    np.sort(relationship_df.values, axis=1),
    columns=relationship_df.columns
)

In [33]:
# Summarize interactions (count co‑occurrences)
relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source", "target"], as_index=False).sum()

In [35]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,Albania,Bulgaria,1
1,Albania,Greece,2
2,Albania,Hungary,1
3,Albania,Poland,1
4,Albania,Romania,1
5,Algeria,Morocco,1
6,Australia,Canada,1
7,Austria,Austria,1
8,Austria,France,1
9,Austria,Germany,2


In [36]:
# Save the relationships
output_path = r"C:\Users\Chase\20th-century\Data Visualizations with Python\Achievement 1\02_Data\Prepared_Data\country_relationships.csv"
relationship_df.to_csv(output_path, index=False)