### CAP 6640 
### Project 1 - Extractive Summarization
### Feb 8, 2024

### Group 4
### Andres Graterol
###                   UCF ID: 4031393
### Zachary Lyons
###                   UCF ID: 4226832
### Christopher Hinkle
###                   UCF ID: 4038573
### Nicolas Leocadio
###                   UCF ID: 3791733

In [46]:
import string 
import nltk 
import re 
import numpy as np
import networkx as nx
import csv

from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim.models import Word2Vec, LsiModel
from gensim import corpora
from scipy import spatial

# Download necessary resources from nltk:
#   - 'punkt': sentence tokenizer data needed by sent_tokenize
#   - 'stopwords': word lists read through stopwords.words('english')
nltk.download('punkt')
nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tmp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tmp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Method 1 - TextRank

#### Step 1 - Data Collection

In [31]:
# Gather lengthy articles or a collection of documents that all relate to the same topic (i.e. documents covering an earthquake)
# TextRank: Single-document summarization

def txt_file_to_string(filepath):
    """Read a UTF-8 text file and return its contents as a single line.

    Input: File path to a text file
    Output: String of the text file, with every newline replaced by a space
    """
    with open(filepath, mode='r', encoding='utf8') as handle:
        contents = handle.read()
    # Swap newline characters for spaces
    return contents.replace('\n', ' ')

# Data is located in text format, character escaped, inside the Documents folder
# TODO: This is a very short sample document to test functionality. Once we confirm this works, let's use a larger document.
document_filepath = 'Documents/Japanese_Earthquake-NationalGeographic.txt'
# Load the article as one newline-free string and echo it for inspection
document_text = txt_file_to_string(document_filepath)
print(document_text)

#### Step 2 - Data Preprocessing

In [32]:
# TextRank: remove punctuation, tokenize, and remove stopwords

def preprocess_text(text, stop_words):
    """Perform appropriate preprocessing on the text for the TextRank algorithm.

    Every sentence is lowercased, stripped of the ASCII punctuation listed in
    ``string.punctuation``, split on whitespace and filtered against
    ``stop_words``.

    Args:
        text: Raw document text.
        stop_words: Collection of (lowercase) words to discard.

    Returns:
        A ``(data, tokenized_sentences)`` tuple. ``tokenized_sentences`` is the
        list of original sentences produced by ``sent_tokenize`` and
        ``data[i]`` is the list of remaining tokens of
        ``tokenized_sentences[i]``. A sentence made up only of stop words
        and/or punctuation yields an empty list, so both lists always stay
        index-aligned.
    """
    tokenized_sentences = sent_tokenize(text, language='english')

    # Regular expression matching any ASCII punctuation character.
    # NOTE(review): typographic punctuation (em dashes, curly quotes) is not in
    # string.punctuation and therefore stays attached to the tokens.
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))

    data = []
    for sentence in tokenized_sentences:
        cleaned = punctuation_pattern.sub('', sentence.lower())
        # split() without an argument collapses runs of whitespace; split(' ')
        # would emit '' tokens for repeated spaces, and '' is not a stop word,
        # so it would end up in the vocabulary.
        data.append([word for word in cleaned.split() if word not in stop_words])
    return data, tokenized_sentences

# Obtain stopwords from nltk (stored as a set for the membership tests in preprocess_text)
stop_words = set(stopwords.words('english'))
# Preprocess the text to obtain the data we will use going forward:
#   data                -> one list of filtered tokens per sentence
#   tokenized_sentences -> the original sentences, in the same order
data, tokenized_sentences = preprocess_text(document_text, stop_words)
print(data)

[['march', '11', '2011', 'japan', 'experienced', 'strongest', 'earthquake', 'recorded', 'history'], ['earthquake', 'struck', 'north', 'pacific', '130', 'kilometers', '81', 'miles', 'east', 'sendai', 'largest', 'city', 'tohoku', 'region', 'northern', 'part', 'island', 'honshu'], ['tohoku', 'earthquake', 'caused', 'tsunami'], ['tsunami—japanese', '“harbor', 'wave”—is', 'series', 'powerful', 'waves', 'caused', 'displacement', 'large', 'body', 'water'], ['tsunamis', 'like', 'one', 'formed', 'tohoku', 'triggered', 'underwater', 'tectonic', 'activity', 'earthquakes', 'volcanic', 'eruptions'], ['tohoku', 'tsunami', 'produced', 'waves', '40', 'meters', '132', 'feet', 'high', '450000', 'people', 'became', 'homeless', 'result', 'tsunami'], ['15500', 'people', 'died'], ['tsunami', 'also', 'severely', 'crippled', 'infrastructure', 'country'], ['addition', 'thousands', 'destroyed', 'homes', 'businesses', 'roads', 'railways', 'tsunami', 'caused', 'meltdown', 'three', 'nuclear', 'reactors', 'fukushim

#### Step 3 - Feature Engineering

In [33]:
# TextRank: Word Embeddings

# Grab the maximum number of words in a sentence for padding sentence embeddings
max_sentence_length = max([len(sentence) for sentence in data])

# Train the Word2Vec model on the data and calculate embeddings for each word
#     min_count: Ignores all words with total frequency lower than this
#     vector_size: Dimensionality of the word vectors
# NOTE: If output is unsatisfactory, train for longer epochs
# NOTE(review): with vector_size=1 each word is a single scalar, so the
#               "sentence embedding" built below is the position-ordered
#               sequence of those scalars - confirm this is the intended design.
model = Word2Vec(data, min_count=1, vector_size=1, epochs=5000)

# Grab sentence embeddings by leveraging the word embeddings and sentence tokens
sentence_embeddings = [[model.wv[word][0] for word in words] for words in data]

# Pad the sentence embeddings with 0's to ensure all sentences have the same length
sentence_embeddings = [np.pad(embedding, (0, max_sentence_length - len(embedding)), 'constant') for embedding in sentence_embeddings]

# Calculate the similarity matrix
# Instantiate a square matrix of zeros, one row/column per sentence
similarity_matrix = np.zeros([len(data), len(data)])

# Populate the similarity matrix with cosine similarity scores (same as 1 - cosine distance)
for i, row in enumerate(sentence_embeddings):
    for j, col in enumerate(sentence_embeddings):
        # Cosine similarity is undefined for an all-zero vector (a sentence left
        # with no tokens after preprocessing); it would not yield a usable number
        # and a NaN edge weight breaks the PageRank step. Keep those entries at 0.
        if np.any(row) and np.any(col):
            similarity_matrix[i][j] = 1 - spatial.distance.cosine(row, col)

print(similarity_matrix)


[[0.99999999 0.70977305 0.6788758  0.90541902 0.85618414 0.77903019
  0.58480879 0.82346425 0.70122287 0.81078446]
 [0.70977305 0.99999999 0.46256435 0.78552505 0.81960516 0.91196754
  0.39324753 0.57507588 0.99913895 0.88168264]
 [0.6788758  0.46256435 0.99999998 0.61504091 0.56159318 0.52088329
  0.8675951  0.81588685 0.47054226 0.54896826]
 [0.90541902 0.78552505 0.61504091 1.         0.95161855 0.85485302
  0.52866716 0.74685248 0.77230259 0.89089428]
 [0.85618414 0.81960516 0.56159318 0.95161855 0.99999998 0.89274893
  0.48374692 0.69104607 0.81084365 0.92575996]
 [0.77903019 0.91196754 0.52088329 0.85485302 0.89274893 0.99999998
  0.44024469 0.63624942 0.90966395 0.96622476]
 [0.58480879 0.39324753 0.8675951  0.52866716 0.48374692 0.44024469
  0.99999999 0.69989115 0.40420296 0.47373378]
 [0.82346425 0.57507588 0.81588685 0.74685248 0.69104607 0.63624942
  0.69989115 0.99999999 0.57367478 0.66818031]
 [0.70122287 0.99913895 0.47054226 0.77230259 0.81084365 0.90966395
  0.40420296

#### Step 4 - Algorithm and Results


In [34]:
# TextRank: Call nx's pagerank to get scores. 

def top_n_sentences(n, scores, tokenized_sentences):
    """Get the top n sentences from pagerank scores.

    ``scores`` is indexed by sentence position (``scores[i]`` belongs to
    ``tokenized_sentences[i]``). The result is a dict mapping sentence text to
    its score, ordered from highest to lowest score and cut off after ``n``
    entries.
    """
    # Key => Sentence, Value => PageRank Score
    score_by_sentence = {}
    for position, text in enumerate(tokenized_sentences):
        score_by_sentence[text] = scores[position]

    def _score_of(pair):
        return pair[1]

    # Highest score first, then keep only the leading n pairs
    ranked_pairs = sorted(score_by_sentence.items(), key=_score_of, reverse=True)
    return dict(ranked_pairs[:n])

# Turn the similarity matrix into an nx graph, then rank its nodes with nx's pagerank
graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(graph)

# NOTE: Modify this variable to change the number of sentences in the summary
num_sent_to_extract = 4

extractive_summary = top_n_sentences(num_sent_to_extract, scores, tokenized_sentences)

# The dictionary keys are the selected sentences; print one per line
for sentence in extractive_summary:
    print(sentence)



#### Last Step - Evaluation

In [38]:
# NOTE: Evaluation will depend on the method used to implement extractive summarization
#       - ILP (Integer Linear Programming): We can use ROUGE-2 for evaluation
# Andres NOTE: This is the only section that I am unsure of. It would be cool to use ROUGE-2 to compare our TextRank algorithm to the bigram inspection


def csv_column_to_list(file_path, column_index):
    """Return one column of a CSV file as a list of single-line strings.

    Args:
        file_path: Path to a UTF-8 encoded CSV file.
        column_index: Zero-based index of the column to extract.

    Returns:
        The value found at ``column_index`` in every row that has that column,
        in file order (the first row of the file is included), with line
        breaks inside a value replaced by a single space. Rows that are too
        short to contain the column are skipped.
    """
    # newline='' hands line-ending handling to the csv module, which its
    # documentation requires for quoted fields that contain line breaks.
    with open(file_path, encoding="utf8", newline="") as file:
        return [
            # With newline='' embedded breaks arrive untranslated, so flatten
            # all three line-ending forms explicitly.
            row[column_index].replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
            for row in csv.reader(file)
            if len(row) > column_index  # Ensure the row has the desired column
        ]

csvFile = "./Dataset/CnnTestData.csv"

# Articles (column 1) and human summaries (column 2) that we are going to be evaluating.
# The slice drops the first row and keeps the 20 rows that follow it.
testDocs = csv_column_to_list(csvFile, 1)[1:21]
humanSumm = csv_column_to_list(csvFile, 2)[1:21]

In [44]:
# Get our models summarizations of the documents
modelSumms = []

for doc in testDocs:
    data, tokenized_sentences = preprocess_text(doc, stop_words)
    max_sentence_length = max([len(sentence) for sentence in data])
    model = Word2Vec(data, min_count=1, vector_size=1, epochs=5000)

    # Grab sentence embeddings by leveraging the word embeddings and sentence tokens
    sentence_embeddings = [[model.wv[word][0] for word in words] for words in data]

    # Pad the sentence embeddings with 0's to ensure all sentences have the same length
    sentence_embeddings = [np.pad(embedding, (0, max_sentence_length - len(embedding)), 'constant') for embedding in sentence_embeddings]

    # Calculate the similarity matrix
    # Instantiate a matrix of zeros with the same shape as the number of sentences
    similarity_matrix = np.zeros([len(data), len(data)])

    # Populate the similarity matrix with cosine similarity scores (same as 1 - cosine distance)
    for i, row in enumerate(sentence_embeddings):
        for j, col in enumerate(sentence_embeddings):
            similarity_matrix[i][j] = 1 - spatial.distance.cosine(row, col)

    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)
    # NOTE: Modify this variable to change the number of sentences in the summary
    num_sent_to_extract = 4

    extractive_summary = top_n_sentences(num_sent_to_extract, scores, tokenized_sentences)

    # Iterate through the dictionary to output the summary
    s = ""
    for sentence, score in extractive_summary.items():
        s = s + sentence
    
    modelSumms.append(s)

print(modelSumms[0])


Pakistan's Misbah-ul-Haq (left) and Wahab Riaz look set to play international cricket in their homeland again .Security officials display arms and ammunition seized after the terrorists' attack on the Sri Lanka team .Sri Lankan cricketers were rescued by the Pakistani air force from Gadaffi Stadium after the terror attack .Pakistan appear set to host Test-playing opposition in their home country for the first time in more than six years.


In [59]:
# With the model summaries in hand, compare each one to its human-made reference using ROUGE
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Order in which the F-scores are stored for every document
rouge_metrics = ('rouge1', 'rouge2', 'rougeL')

allScores = []
for idx, candidate in enumerate(modelSumms):
    result = scorer.score(target=humanSumm[idx], prediction=candidate)
    allScores.append([result[metric].fmeasure for metric in rouge_metrics])

# List of F-scores in the order ['rouge1', 'rouge2', 'rougeL']
print(allScores)



[[0.3114754098360656, 0.03333333333333333, 0.1639344262295082], [0.25, 0.07936507936507936, 0.18750000000000003], [0.34532374100719426, 0.10218978102189781, 0.2158273381294964], [0.1806451612903226, 0.0261437908496732, 0.0903225806451613], [0.26277372262773724, 0.02962962962962963, 0.13138686131386862], [0.34328358208955223, 0.10606060606060606, 0.23880597014925375], [0.3793103448275862, 0.2280701754385965, 0.2586206896551724], [0.3783783783783784, 0.09589041095890412, 0.22972972972972974], [0.2096774193548387, 0.06557377049180328, 0.12903225806451613], [0.271604938271605, 0.05, 0.1358024691358025], [0.411764705882353, 0.04477611940298507, 0.17647058823529413], [0.359375, 0.07936507936507937, 0.203125], [0.2987012987012987, 0.09210526315789473, 0.18181818181818182], [0.1889763779527559, 0.015999999999999997, 0.09448818897637795], [0.2967741935483871, 0.05228758169934641, 0.15483870967741936], [0.3595505617977528, 0.11363636363636365, 0.2247191011235955], [0.24778761061946902, 0.0360360

Pakistan's Misbah-ul-Haq (left) and Wahab Riaz look set to play international cricket in their homeland again .Security officials display arms and ammunition seized after the terrorists' attack on the Sri Lanka team .Sri Lankan cricketers were rescued by the Pakistani air force from Gadaffi Stadium after the terror attack .Pakistan appear set to host Test-playing opposition in their home country for the first time in more than six years.
Zimbabwe have reportedly agreed to visit Pakistan for ODI series in May .Pakistan have not played host to major cricket series since 2009 .There have been security fears since Sri Lanka were victims of terror attack .Team bus was targeted by gunmen in Lahore, and eight people were killed .
Pakistan appear set to host Test-playing opposition in their home country for the first time in more than six years. Zimbabwe have reportedly agreed to travel for a short one-day international series next month, likely to take place in Lahore and Karachi. No tourists

### Method 2 - Latent Semantic Indexing (LSI)

#### Step 1 - Data Collection

In [None]:
# Gather lengthy articles or a collection of documents that all relate to the same topic (i.e. documents covering an earthquake)
# LSI (Latent Semantic Indexing): Multi-document summarization

def txt_files_to_string(filepaths) -> list[str]:
    """Read several UTF-8 text files into a list of strings.

    Input: Iterable of file paths to text files
    Output: List with one string per file, in the same order as ``filepaths``,
            with every newline replaced by a space
    """
    document_list = []
    for filepath in filepaths:
        with open(filepath, 'r', encoding='utf8') as file:
            # Replace newline characters with spaces
            document_list.append(file.read().replace('\n', ' '))
    return document_list
#print(data)
# Data is located in text format, character escaped, inside the Documents folder
# TODO: This is a very short sample document to test functionality. Once we confirm this works, let's use a larger document.
document_filepath_1 = 'Documents/Japanese_Earthquake-NationalGeographic.txt'
document_filepath_2 = 'Documents/Japanese_Earthquake-Britannica.txt'
documents = [document_filepath_1, document_filepath_2]
# One newline-free string per file, in the order listed in `documents`
document_text_list = txt_files_to_string(documents)


#### Step 2 - Data Preprocessing

In [47]:
# LSI (Latent Semantic Indexing): Tokenize, remove stopwords, and stem the words
def preprocess_lsi_text(document_list) -> list[list[str]]:
    """Turn each raw document into one flat list of stemmed content words.

    Every document is split into sentences; each sentence is lowercased,
    stripped of the ASCII punctuation in ``string.punctuation``, split on
    whitespace, filtered against NLTK's English stop words and stemmed with
    the Porter stemmer. Sentence boundaries are not kept in the result.

    Args:
        document_list: Iterable of raw document strings.

    Returns:
        A list with one entry per input document, in input order; each entry
        is the list of stemmed tokens of that document.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # Compiled once: the pattern does not depend on the document
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))

    processed_docs = []
    for doc in document_list:
        stemmed_words = []
        for sentence in sent_tokenize(doc, language='english'):
            cleaned = punctuation_pattern.sub('', sentence.lower())
            # split() without an argument collapses runs of whitespace;
            # split(' ') would emit '' tokens for repeated spaces and those
            # would be stemmed and counted as a term.
            stemmed_words.extend(
                stemmer.stem(word) for word in cleaned.split() if word not in stop_words
            )
        processed_docs.append(stemmed_words)
    return processed_docs
# One flat list of stemmed tokens per input document
processed_docs = preprocess_lsi_text(document_text_list)

[<bound method Dictionary.doc2bow of <gensim.corpora.dictionary.Dictionary object at 0x000001755E4BBD50>>, <bound method Dictionary.doc2bow of <gensim.corpora.dictionary.Dictionary object at 0x000001755E4BBD50>>]


#### Step 3 - Feature Engineering

In [51]:
# LSI (Latent Semantic Indexing): Term Frequency
# Create a dictionary mapping each unique token to an integer id
dictionary = corpora.Dictionary(processed_docs)
# Bag of words: one sparse list of (token_id, count) pairs per document
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


LsiModel<num_terms=317, num_topics=2, decay=1.0, chunksize=20000>


#### Step 4 - Algorithm and Results

In [67]:
# LSI (Latent Semantic Indexing): Create LSI Model using Gensim
lsi_model = LsiModel(bow_corpus,num_topics=2, id2word= dictionary)
# Apply the trained model to every bag-of-words document
# (nothing is sorted at this point)
lsi_text = [lsi_model[doc] for doc in bow_corpus]

# TODO: the remaining steps are not implemented yet:
#   - Check if lsi_text is empty before accessing its elements
#   - Sort vectors by score
#   - Select top documents
#   - Sort sentence numbers in order
#   - Obtain the summary


ValueError: shapes (2,) and (0,) not aligned: 2 (dim 0) != 0 (dim 0)

#### Last Step - Evaluation

#### References
##### The following tutorials helped us implement the algorithms in the document:
##### 1. https://medium.com/data-science-in-your-pocket/text-summarization-using-textrank-in-nlp-4bce52c5b390
##### 2. https://towardsdatascience.com/document-summarization-using-latent-semantic-indexing-b747ef2d2af6 