In [2]:
# Install the Stanza library for NLP
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [3]:
# Import required libraries
import os
import re
import stanza


In [4]:
# Download the English language model for Stanza
stanza.download("en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


In [8]:
# Create the pipeline, specifying the language:
nlp = stanza.Pipeline(lang="en", processors="tokenize,mwt,ner")

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [9]:
# Clone the GitHub repository containing your article corpus
!git clone https://github.com/Aqsa-2004/FASDH25-portfolio2.git

fatal: destination path 'FASDH25-portfolio2' already exists and is not an empty directory.


In [15]:
# Set the path to the folder where the articles are stored
corpus_folder = "/content/FASDH25-portfolio2/articles"

# Initialize an empty list to store article filenames
january_2024_files = []


In [25]:
# Loop through all files in the corpus folder
for file in os.listdir(corpus_folder):
  # Check if the filename contains "2024-01"
  if "2024-01" in file:
    # If yes, add full path to the list
    january_2024_files.append(f"{corpus_folder}/{file}")
# Dictionary to store counts of each place name
place_name_counts = {}


In [26]:
path = os.path.join(corpus_folder, filename)
# Open and read the text from the file
with open (path, encoding="utf-8") as file:
  text = file.read()


In [75]:
# Apply the Stanza NLP pipeline to the text
doc = nlp(text)

# Loop through each sentence in the processed document
for sentence in doc.sentences:
  # Loop through each recognized named entity
  for entity in sentence.ents:
    # Check if the entity is a place: GPE (Geo-political entity), LOC (location), or FAC (facility)
    if entity.type == "GPE" or entity.type == "LOC" or entity.type == "FAC":
      place = entity.text
      # If place already in the dictionary, increment its count
      if place in place_name_counts:
        place_name_counts[place] += 1
      else:
        # Otherwise, initialize the count
        place_name_counts[place] = 1
# Print the raw place count dictionary
print(place_name_counts)



{'Gaza': 27, 'Bethlehem': 21, 'West Bank': 3, 'Bethlehem’s SOS Children’s Village': 3, 'Rafah': 21, 'the Gaza Strip': 3, 'Israel': 3, 'Rafah’s SOS Children’s Village': 3, 'The Rafah SOS Village': 3, 'Palestine': 3, 'the Bethlehem Village': 3, 'SOS Children’s Villages': 3}


In [81]:
# Initialize new dictionary to store normalized place names
normal_places = {}
# Loop through each place in the original dictionary
for place in place_name_counts:
  clean_place = place.strip().lower().replace("’s", "").replace("'s", "").rstrip("’s").rstrip("s")
  clean_place = clean_place.title()
  if clean_place in normal_places:
    normal_places[clean_place] += place_name_counts[place]
  else:
    normal_places[clean_place] = place_name_counts[place]

In [79]:
# Combine counts for normalized names
if clean_place in normal_places:
  normal_places[clean_place] += place_name_counts[place]
else:
  normal_places[clean_place] = place_name_counts[place]
print(clean_place)

Sos Children Village


In [55]:
# Convert the dictionary to a list of lists for sorting
items = []
for place in normal_places:
    items.append([place, normal_places[place]])

# Print the normalized place counts
print(normal_places)


{'SO': 2}


In [57]:
# Sort items in descending order by count
for i in range(len(items)):
  for j in range(i + 1, len(items)):
    if items[j][1] > items[i][1]:
      items[i], items[j] = items[j], items[i]

In [73]:
# Define the filename for output
filename = "ner_counts.tsv"

# Write place counts to a TSV file
with open(filename, mode="w", encoding="utf-8") as file:
    header = "Place\tCount\n"
    file.write(header)
    for place, count in place_name_counts.items():
        file.write(f"{place}\t{count}\n")

In [82]:
# Open and print the contents of the newly written TSV file
with open("/content/ner_counts.tsv", encoding="utf-8") as file:
  print(file.read())

Place	Count
Gaza	18
Bethlehem	14
West Bank	2
Bethlehem’s SOS Children’s Village	2
Rafah	14
the Gaza Strip	2
Israel	2
Rafah’s SOS Children’s Village	2
The Rafah SOS Village	2
Palestine	2
the Bethlehem Village	2
SOS Children’s Villages	2

