In [1]:
# Install the Stanza library for NLP
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [3]:
# Import required libraries
import os
import re
import stanza


In [4]:
# Download the English language model for Stanza
stanza.download("en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


In [5]:
# Create the pipeline, specifying the language:
nlp = stanza.Pipeline(lang="en", processors="tokenize,mwt,ner")

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [6]:
# Clone the GitHub repository containing your article corpus
!git clone https://github.com/Aqsa-2004/FASDH25-portfolio2.git

Cloning into 'FASDH25-portfolio2'...
remote: Enumerating objects: 4413, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 4413 (delta 17), reused 16 (delta 12), pack-reused 4381 (from 2)[K
Receiving objects: 100% (4413/4413), 20.59 MiB | 26.45 MiB/s, done.
Resolving deltas: 100% (27/27), done.


In [11]:
import os

# Create a dictionary to store place name counts
places = {}

# Folder path to your cloned repository's articles
folder = "/content/FASDH25-portfolio2/articles"


In [13]:
# Loop through files that begin with "2024-01-"
for filename in os.listdir(folder):
    if "2024-01-" in filename:
  # create a path to the file:
      path = f"{folder}/{filename}"
  # open and read the file:
      with open(path, encoding="utf-8") as file:
          text = file.read()


In [19]:
# use the nlp pipeline to analyse the text:
doc = nlp(text)
# select only the entities that are place names:
for e in doc.entities:
        if e.type in ["GPE", "LOC"]:

        # add 1 to the count of the place in our dictionary
        # (and/or add the place to the dictionary if it was not there yet):
         places[e.text] = places.get(e.text, 0) +1
print(places)

{'Morocco': 13, 'Israel': 116, 'Gaza': 100, 'Rabat': 3, 'United States': 6, 'the United Arab Emirates': 2, 'UAE': 3, 'Bahrain': 1, 'Sudan': 1, 'US': 53, 'Western Sahara': 3, 'Washington': 6, 'Tel Aviv': 6, 'Algeria': 2, 'Marrakesh': 1, 'the Western Sahara': 1, 'Morocco’s': 1, 'Maghreb': 1, 'Ukraine': 1, 'Saudi Arabia': 2, 'California': 1, 'West Bank': 8, 'Dena': 1, 'Israel’s': 1, 'Oakland': 1, 'the United States': 7, 'South Africa': 29, 'Jordan': 2, 'Jerusalem': 2, 'East Jerusalem': 2, 'Egypt': 3, 'Qatar': 5, 'Kuala Lumpur': 2, 'Malaysia': 2, 'Palestine': 7, 'Indonesia’s': 1, 'Jakarta': 1, 'Johannesburg': 1, 'London': 5, 'Paris': 2, 'Vienna': 1, 'Berlin': 2, 'Amman': 1, 'Washington DC': 2, 'UK': 2, 'Manchester': 1, 'Yemen': 11, 'Washington, DC': 1, 'India': 1, 'Hyderabad': 1, 'Colombo’s Kollupitiya': 1, 'Namibia': 8, 'Germany': 9, 'Palestinian Territories': 1, 'Sweden': 1, 'Iran': 9, 'Kerman': 2, 'Lebanon': 12, 'Bethlehem': 3, 'Nairoukh': 1, 'China': 2, 'Italy': 1, 'Spain': 1, 'Turkey'

In [20]:
import re

normalized_places = {}

for place, count in places.items():
    # Step 1: Remove possessives like 's
    place = re.sub(r"[’'`]s\b", "", place)

    # Step 2: Remove punctuation
    place = re.sub(r"[^\w\s]", "", place)

    # Step 3: Remove leading 'the' if it appears (to handle "The United States" and "United States")
    place = re.sub(r"^the\s+", "", place, flags=re.IGNORECASE)  # case-insensitive removal of "The"

    # Step 4: Merge counts for normalized places
    if place in normalized_places:
        normalized_places[place] += count
    else:
        normalized_places[place] = count

# Print the cleaned and aggregated place names with counts
print(normalized_places)

{'Morocco': 14, 'Israel': 117, 'Gaza': 101, 'Rabat': 3, 'United States': 13, 'United Arab Emirates': 2, 'UAE': 3, 'Bahrain': 1, 'Sudan': 1, 'US': 57, 'Western Sahara': 4, 'Washington': 7, 'Tel Aviv': 6, 'Algeria': 2, 'Marrakesh': 1, 'Maghreb': 1, 'Ukraine': 1, 'Saudi Arabia': 2, 'California': 1, 'West Bank': 8, 'Dena': 1, 'Oakland': 1, 'South Africa': 30, 'Jordan': 2, 'Jerusalem': 2, 'East Jerusalem': 2, 'Egypt': 3, 'Qatar': 5, 'Kuala Lumpur': 2, 'Malaysia': 2, 'Palestine': 7, 'Indonesia': 1, 'Jakarta': 1, 'Johannesburg': 1, 'London': 5, 'Paris': 2, 'Vienna': 1, 'Berlin': 2, 'Amman': 1, 'Washington DC': 3, 'UK': 2, 'Manchester': 1, 'Yemen': 11, 'India': 1, 'Hyderabad': 1, 'Colombo Kollupitiya': 1, 'Namibia': 8, 'Germany': 9, 'Palestinian Territories': 1, 'Sweden': 2, 'Iran': 9, 'Kerman': 2, 'Lebanon': 12, 'Bethlehem': 3, 'Nairoukh': 1, 'China': 2, 'Italy': 1, 'Spain': 1, 'Turkey': 12, 'Shawawra': 1, 'Hague': 4, 'Gaza Strip': 6, 'Khan Younis': 1, 'Syria': 2, 'Mazzeh': 1, 'Damascus': 2, 

In [23]:
# Define the name and path of the output file
filename = "/content/FASDH25-portfolio2/ner_counts.tsv"

# Open the file in writing mode using UTF-8 encoding
with open(filename, mode="w", encoding="utf-8") as file:
    # Create the header line: column names separated by a tab
    header = "Place\tCount\n"
    file.write(header)

    # Loop through the places dictionary (cleaned place names and their counts)
    for place, count in places.items():
        # Create a row with the place and count separated by a tab
        row = f"{place}\t{count}\n"
        file.write(row)

In [39]:
from google.colab import files
# Define the filename for output
filename = "/content/ner_counts.tsv"
# Write to TSV
with open(filename, mode="w", encoding="utf-8") as file:
    file.write("Place\tCount\n")
    for place, count in normalized_places.items():
        file.write(f"{place}\t{count}\n")

with open(filename, encoding="utf-8") as file:
  print(file.read())
files.download(filename)

Place	Count
Morocco	14
Israel	117
Gaza	101
Rabat	3
United States	13
United Arab Emirates	2
UAE	3
Bahrain	1
Sudan	1
US	57
Western Sahara	4
Washington	7
Tel Aviv	6
Algeria	2
Marrakesh	1
Maghreb	1
Ukraine	1
Saudi Arabia	2
California	1
West Bank	8
Dena	1
Oakland	1
South Africa	30
Jordan	2
Jerusalem	2
East Jerusalem	2
Egypt	3
Qatar	5
Kuala Lumpur	2
Malaysia	2
Palestine	7
Indonesia	1
Jakarta	1
Johannesburg	1
London	5
Paris	2
Vienna	1
Berlin	2
Amman	1
Washington DC	3
UK	2
Manchester	1
Yemen	11
India	1
Hyderabad	1
Colombo Kollupitiya	1
Namibia	8
Germany	9
Palestinian Territories	1
Sweden	2
Iran	9
Kerman	2
Lebanon	12
Bethlehem	3
Nairoukh	1
China	2
Italy	1
Spain	1
Turkey	12
Shawawra	1
Hague	4
Gaza Strip	6
Khan Younis	1
Syria	2
Mazzeh	1
Damascus	2
Houthis	1
Red Sea	14
BabelMandeb Strait	1
Gulf of Aden	2
Sanaa	1
United Kingdom	2
Hodeidah	1
Taiz	1
Dhamar	1
alBayda	1
Saada	1
Arabian Sea	1
Bab alMandeb Strait	1
Asia	1
Europe	2
Kuwait	1
Middle East	5
Ankara	7
West	1
Tehran	1
South Carolina	2
New York 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>