### CAP 6640 
### Project 1 - Extractive Summarization
### Feb 8, 2024

### Group 4
### Andres Graterol
###                   UCF ID: 4031393
### Zachary Lyons
###                   UCF ID: 4226832
### Christopher Hinkle
###                   UCF ID: 4038573
### Nicolas Leocadio
###                   UCF ID: 3791733

In [None]:
import string 
import nltk 
import re 
import numpy as np
import networkx as nx

from nltk.tokenize import sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim.models import Word2Vec, LsiModel
from gensim import corpora
from scipy import spatial

# Download necessary resources from nltk
nltk.download('punkt')
nltk.download('stopwords')

### Method 1 - TextRank

#### Step 1 - Data Collection

In [None]:
# Gather lengthy articles or a collection of documents that all relate to the same topic (i.e. documents covering an earthquake)
# TextRank: Single-document summarization

def txt_file_to_string(filepath):
    '''
        Input: File path to a UTF-8 encoded text file
        Output: The file's full contents as a single string, with every
                newline character replaced by a space
    '''
    with open(filepath, mode='r', encoding='utf8') as handle:
        raw_text = handle.read()
    # Splitting on '\n' and re-joining with ' ' swaps each newline for one space
    return ' '.join(raw_text.split('\n'))

# Data is located in text format, character escaped, inside the Documents folder
# TODO: This is a very short sample document to test functionality. When we confirm this works, let's use a larger document.
document_filepath = 'Documents/Japanese_Earthquake-NationalGeographic.txt'
# Load the whole article as one string with newlines replaced by spaces
document_text = txt_file_to_string(document_filepath)
# Show the raw text that the TextRank method will summarize
print(document_text)

#### Step 2 - Data Preprocessing

In [None]:
# TextRank: remove punctuation, tokenize, and remove stopwords

def preprocess_text(text, stop_words, *, verbose=True):
    '''
        Purpose: Perform appropriate preprocessing on the text file for the TextRank algorithm
        Input:
            text: String holding the whole document
            stop_words: Collection of lowercase words to drop from every sentence
            verbose: When True (the default), print every intermediate stage
        Output: Tuple (data, tokenized_sentences)
            data: One list of lowercase, punctuation-free, non-stopword tokens per sentence
            tokenized_sentences: The untouched sentences, in the same order as data
    '''
    tokenized_sentences = sent_tokenize(text, language='english')
    if verbose:
        print(tokenized_sentences)

    sentences_to_lower = [sentence.lower() for sentence in tokenized_sentences]
    if verbose:
        print(sentences_to_lower)

    # Regular Expression to match any punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    # Remove the punctuation from the lowercase sentences
    sentences_no_punctuation = [regex.sub('', sentence) for sentence in sentences_to_lower]
    if verbose:
        print(sentences_no_punctuation)

    # split() with no argument splits on any run of whitespace, so the doubled
    # spaces left behind by removed punctuation (e.g. " - ") no longer produce
    # empty-string tokens that would slip past the stop word filter.
    # A sentence made only of stop words still yields an (empty) list so that
    # data[i] keeps lining up with tokenized_sentences[i].
    data = [
        [word for word in sentence.split() if word not in stop_words]
        for sentence in sentences_no_punctuation
    ]
    return data, tokenized_sentences

# Obtain stopwords from nltk (kept in a set for fast membership tests)
stop_words = set(stopwords.words('english'))
# Preprocess the text to obtain the data we will use going forward
# data: one token list per sentence; tokenized_sentences: the original sentences in the same order
data, tokenized_sentences = preprocess_text(document_text, stop_words)
print(data)

#### Step 3 - Feature Engineering

In [None]:
# TextRank: Word Embeddings 
 
# Grab the maximum number of words in a sentence for padding sentence embeddings
max_sentence_length = max(len(sentence) for sentence in data)

# Train the Word2Vec model on the data and calculate embeddings for each word
#     min_count: Ignores all words with total frequency lower than this
#     vector_size: Dimensionality of the word vectors
# NOTE: If output is unsatisfactory, train for more epochs
model = Word2Vec(data, min_count=1, vector_size=1, epochs=5000)

# Grab sentence embeddings by leveraging the word embeddings and sentence tokens
# (vector_size=1, so every word contributes exactly one number to its sentence)
sentence_embeddings = [[model.wv[word][0] for word in words] for words in data]

# Pad the sentence embeddings with 0's to ensure all sentences have the same length
sentence_embeddings = [np.pad(embedding, (0, max_sentence_length - len(embedding)), 'constant') for embedding in sentence_embeddings]

# Stack the padded embeddings into one (number of sentences x max_sentence_length) matrix
embedding_matrix = np.asarray(sentence_embeddings, dtype=float).reshape(len(data), max_sentence_length)

# Calculate the similarity matrix
# Cosine similarity (same as 1 - cosine distance) for every pair of sentences at once:
# dot products of all pairs divided by the product of the two vector lengths
embedding_norms = np.linalg.norm(embedding_matrix, axis=1)
norm_products = np.outer(embedding_norms, embedding_norms)
dot_products = embedding_matrix @ embedding_matrix.T

# Instantiate a matrix of zeros with the same shape as the number of sentences
similarity_matrix = np.zeros([len(data), len(data)])

# A sentence with no tokens left after preprocessing is an all-zero vector; its cosine
# similarity is 0/0, which would put NaN into the graph handed to PageRank.
# Only divide where both vectors have a non-zero length and leave the rest at 0.
np.divide(dot_products, norm_products, out=similarity_matrix, where=norm_products > 0)

print(similarity_matrix)


#### Step 4 - Algorithm and Results


In [None]:
# TextRank: Call nx's pagerank to get scores. 

def top_n_sentences(n, scores, tokenized_sentences):
    '''
        Get the top n sentences from pagerank scores
        Input:
            n: Maximum number of sentences to return (0 or less returns nothing)
            scores: Scores indexable by sentence position (scores[i] belongs to tokenized_sentences[i])
            tokenized_sentences: Sentences in document order
        Output: Dictionary of sentence => score, ordered from highest to lowest score
    '''
    # A negative n would otherwise slice [:n] and return all but the last |n| sentences
    if n <= 0:
        return {}

    # Key => Sentence
    # Value => PageRank Score
    # If the same sentence appears more than once, keep its best score instead of
    # whichever occurrence happens to come last in the document
    sentence_score_dict = {}
    for i, sentence in enumerate(tokenized_sentences):
        score = scores[i]
        if sentence not in sentence_score_dict or score > sentence_score_dict[sentence]:
            sentence_score_dict[sentence] = score

    # Filter the dictionary to contain only the top n sentences
    ranked_sentences = sorted(sentence_score_dict.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked_sentences[:n])

# Convert similarity matrix to an nx graph and call nx's pagerank
graph = nx.from_numpy_array(similarity_matrix)
# NOTE(review): similarity_matrix holds cosine similarities, which can be negative;
# confirm that pagerank behaves as intended with negative edge weights
scores = nx.pagerank(graph)

# NOTE: Modify this variable to change the number of sentences in the summary
num_sent_to_extract = 4

# extractive_summary maps each selected sentence to its PageRank score, best score first
extractive_summary = top_n_sentences(num_sent_to_extract, scores, tokenized_sentences)

# Iterate through the dictionary to output the summary
# (sentences come out in score order, not document order; the score itself is not printed)
for sentence, score in extractive_summary.items():
    print(sentence)



#### Last Step - Evaluation

In [None]:
# NOTE: Evaluation will depend on the method used to implement extractive summarization
#       - ILP (Integer Linear Programming): We can use ROUGE-2 for evaluation
# Andres NOTE: This is the only section that I am unsure of. It would be cool to use ROUGE-2 to compare our TextRank algorithm to the bigram inspection 


### Method 2 - Latent Semantic Indexing (LSI)

#### Step 1 - Data Collection

In [None]:
# Gather lengthy articles or a collection of documents that all relate to the same topic (i.e. documents covering an earthquake)
# LSI (Latent Semantic Indexing): Multi-document summarization

def txt_files_to_string(filepaths) -> list[str]:
    '''
        Input: File paths to multiple UTF-8 encoded text files
        Output: List with one string per file, in the same order as filepaths,
                with every newline character replaced by a space
    '''
    document_list = []
    for filepath in filepaths:
        # Distinct names for the path and the handle, so the loop variable is not shadowed
        with open(filepath, 'r', encoding='utf8') as file:
            document_list.append(file.read().replace('\n', ' '))  # Remove newline characters
    return document_list
#print(data)
# Data is located in text format, character escaped, inside the Documents folder
# TODO: This is a very short sample document to test functionality. When we confirm this works, let's use a larger document.
document_filepath_1 = 'Documents/Japanese_Earthquake-NationalGeographic.txt'
document_filepath_2 = 'Documents/Japanese_Earthquake-Britannica.txt'
# Both articles are loaded together for the multi-document (LSI) method
documents = [document_filepath_1, document_filepath_2]
# document_text_list holds one string per file, in the same order as documents
document_text_list = txt_files_to_string(documents)


#### Step 2 - Data Preprocessing

In [47]:
# LSI (Latent Semantic Indexing): Tokenize, remove stopwords, and stem the words
def preprocess_lsi_text(document_list) -> list[list[str]]:
    '''
        Purpose: Perform appropriate preprocessing on the documents for the LSI algorithm
        Input: List of document strings
        Output: List with one flat list of stemmed tokens per document
                (sentence boundaries are not kept)
    '''
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # Regular Expression to match any punctuation (compiled once, reused for every document)
    regex = re.compile('[%s]' % re.escape(string.punctuation))

    processed_docs = []
    for doc in document_list:
        # Tokenizer
        tokenized_sentences = sent_tokenize(doc, language='english')

        stemmed_words = []
        for sentence in tokenized_sentences:
            # LowerCase + Remove Punctuation
            cleaned_sentence = regex.sub('', sentence.lower())
            # Remove Stop words + Stemming
            # split() with no argument splits on any run of whitespace, so the doubled
            # spaces left behind by removed punctuation no longer produce empty-string
            # tokens that would be stemmed and counted as a word
            stemmed_words.extend(
                stemmer.stem(word)
                for word in cleaned_sentence.split()
                if word not in stop_words
            )

        processed_docs.append(stemmed_words)
    return processed_docs
# One flat list of stemmed tokens per input document
processed_docs = preprocess_lsi_text(document_text_list)

[<bound method Dictionary.doc2bow of <gensim.corpora.dictionary.Dictionary object at 0x000001755E4BBD50>>, <bound method Dictionary.doc2bow of <gensim.corpora.dictionary.Dictionary object at 0x000001755E4BBD50>>]


#### Step 3 - Feature Engineering

In [51]:
# LSI (Latent Semantic Indexing): Term Frequency
# Create a dictionary mapping from the stemmed tokens of all documents
dictionary = corpora.Dictionary(processed_docs)
# Bag of words: one doc2bow representation per document, in the same order as processed_docs
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


LsiModel<num_terms=317, num_topics=2, decay=1.0, chunksize=20000>


#### Step 4 - Algorithm and Results

In [67]:
# LSI (Latent Semantic Indexing): Create LSI Model using Gensim
lsi_model = LsiModel(bow_corpus,num_topics=2, id2word= dictionary)
# Apply the model to every bag-of-words document (nothing is sorted yet)
lsi_text = [lsi_model[doc] for doc in bow_corpus]

# TODO: The remaining steps are not implemented yet:
#   - Check if lsi_text is empty before accessing its elements
#   - Sort vectors by score
#   - Select top documents
#   - Sort sentence numbers in order
#   - Obtain the summary


ValueError: shapes (2,) and (0,) not aligned: 2 (dim 0) != 0 (dim 0)

#### Last Step - Evaluation

#### References
##### The following tutorials helped us implement the algorithms in the document:
##### 1. https://medium.com/data-science-in-your-pocket/text-summarization-using-textrank-in-nlp-4bce52c5b390
##### 2. https://towardsdatascience.com/document-summarization-using-latent-semantic-indexing-b747ef2d2af6 