## Imports

In [2]:
import pandas as pd
import re
import csv
import string
import math

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

import spacy
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1500000 



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ftzavellos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ftzavellos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ftzavellos/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Read the full dataset

In [None]:
df_dataset = pd.read_csv('dataset.csv', index_col=0, encoding = 'utf-8')
df_dataset

## Clean up the facts and full_text columns 

In [4]:
def clean_text(text):
    # Remove markdown elements and special characters
    text = re.sub(r'#', '', text)  # Remove '###'
    text = re.sub(r'[-]', '', text)  # Remove '-'
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\\[a-zA-Z0-9]+', '', text)  # Remove any escaped sequences like \xa0
    return text

In [None]:
df_dataset['facts'] = df_dataset['facts'].apply(clean_text)
df_dataset['full_text'] = df_dataset['full_text'].apply(clean_text)
df_dataset

## Extract named entities from facts

DO NOT run unless you want to redo the process, otherwise reading the named_entities.csv would suffice

In [None]:
facts = df_dataset['facts']
facts

In [None]:
facts_list = facts.tolist()
facts_list = [str(fact) for fact in facts_list]
facts_list

In [15]:
# Extract named entities from facts_list
entities = [ent.text for doc in map(nlp, facts_list) for ent in doc.ents]

# Get the unique values from the entities list
entities_list = list(dict.fromkeys(entities))

# Save the list of entities
with open('named_entities_facts.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    for ent in entities_list:
        writer.writerow([ent])

## Read named_entities_facts.csv

In [1]:
entities_facts  = pd.read_csv('named_entities_facts.csv', header = None, names = ['facts'], encoding = 'utf-8')
entities_list_facts = entities_facts['facts'].tolist()

NameError: name 'pd' is not defined

In [7]:
nan_values_f = entities_facts['facts'].isna()
entities_facts[nan_values_f]

Unnamed: 0,facts
57064,


In [8]:
entities_list_facts = [str(element) for element in entities_list_facts]
entities_list_facts = [x for x in entities_list_facts if x.lower() != "nan"]
print(sum(1 for x in entities_list_facts if x.lower() == "nan"))

0


## Remove named entities from facts (creating new columns in the df_dataset)

In [10]:
def remove_named_entities(text, entities_list):
    for entity in entities_list:
        # Use word boundaries to ensure whole phrases are matched
        entity_regex = re.compile(r'\b' + re.escape(entity) + r'\b', re.IGNORECASE)
        text = entity_regex.sub('', text)
    return text.strip()


In [11]:
df_dataset['facts_ne_removed'] = df_dataset['facts'].apply(lambda x: remove_named_entities(x, entities_list_facts))

In [None]:
df_dataset

In [14]:
df_dataset['bertsum_ne_removed'] = df_dataset['bert_summary'].apply(lambda x: remove_named_entities(x, entities_list_facts))
df_dataset['textrank_ne_removed'] = df_dataset['textrank_summary'].apply(lambda x: remove_named_entities(x, entities_list_facts))

## Stopwords

In [16]:
all_stopwords = stopwords.words('english')
all_stopwords.extend([
'also',
'may', 
'could', 
'would', 
'must', 
'applicant', 
'applicants'
'court',
'article',
'case',
'convetion',
'see',
'right',
'government',
'paragraph',
'law',
'state',
'detention',
'authority',
'application',
'one'])

In [17]:
def remove_stopwords(text, stopwords_list):
    for entity in stopwords_list:
        # Use word boundaries to ensure whole phrases are matched
        stopword_regex = re.compile(r'\b' + re.escape(entity) + r'\b', re.IGNORECASE)
        text = stopword_regex.sub('', text)
    return text.strip()

In [18]:
# remove stopwords from named_entities_removed columns
df_dataset['cleaned_facts'] = df_dataset['facts_ne_removed'].apply(lambda x: remove_stopwords(x, all_stopwords))
df_dataset['cleaned_bertsum'] = df_dataset['bertsum_ne_removed'].apply(lambda x: remove_stopwords(x, all_stopwords))
df_dataset['cleaned_textrank'] = df_dataset['textrank_ne_removed'].apply(lambda x: remove_stopwords(x, all_stopwords))

## Remove Verbs and Adjectives

In [21]:
def remove_tags(text, pos_tags):
    
    words = nltk.word_tokenize(text)
    tagged_words = nltk.pos_tag(words)

    filtered_words = [word for word, tag in tagged_words if not any(tag.startswith(pos_tag) for pos_tag in pos_tags)]
    return ' '.join(filtered_words)

In [22]:
def remove_pos(text, remove_verbs=False, remove_adjectives=False):
    
    pos_tags_to_remove = []
    if remove_verbs:
        pos_tags_to_remove.append('VB')
    if remove_adjectives:
        pos_tags_to_remove.append('JJ')

    if pos_tags_to_remove:
        text = remove_tags(text, pos_tags_to_remove)
    
    return text.strip()

In [25]:
df_dataset['cleaned_facts_pos_removed'] = df_dataset['cleaned_facts'].apply(lambda x: remove_pos(x, True, True))

In [29]:
df_dataset['cleaned_bertsum_pos_removed'] = df_dataset['cleaned_bertsum'].apply(lambda x: remove_pos(x, True, True))
df_dataset['cleaned_textrank_pos_removed'] = df_dataset['cleaned_textrank'].apply(lambda x: remove_pos(x, True, True))

In [30]:
df_dataset.to_csv('df_dataset.csv')

In [32]:
df_dataset['cleaned_facts'].to_csv('df_dataset_cleaned_facts.csv')
df_dataset['cleaned_bertsum'].to_csv('df_dataset_cleaned_bertsum.csv')
df_dataset['cleaned_facts_pos_removed'].to_csv('df_dataset_cleaned_facts_pos_removed.csv')
df_dataset['cleaned_bertsum_pos_removed'].to_csv('df_dataset_cleaned_bertsum_pos_removed.csv')