<a href="https://colab.research.google.com/github/AngelJavierSalazar/nlp-tf-idf/blob/main/Interesting_Words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
!pip install unidecode 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [49]:
from google.colab import files
import operator
import glob, os
from collections import defaultdict

import spacy
import string
import unidecode
from collections import Counter
import pandas as pd


In [50]:
# UTILITY FUNCTIONS 

In [51]:
def tuples_to_dicts(keys, list_of_tuples):
    """Turn each tuple into a dictionary whose keys are taken, in order, from `keys`.

    Keys and values are paired positionally with zip, so the pairing stops
    at the shorter of the two sequences.
    """
    dicts = []
    for values in list_of_tuples:
        dicts.append(dict(zip(keys, values)))
    return dicts

In [52]:
#REFACTORED CLASS DOCUMENT FOR GOOGLE COLAB WITH NLP PROCESSING FUNCTIONS

In [61]:
class Document:
    """A text file processed with spaCy to find its most frequent nouns.

    The constructor reads the file, splits it into sentences and keeps a
    normalised (ASCII, lowercase, punctuation-free) spaCy version of the text.
    `determine_most_common_words` must be called before
    `get_most_common_words`; `assign_sentences_to_words` then records, for
    each of those words, the indexes of the sentences where it appears.
    """

    # Translation table that replaces every ASCII punctuation character with a space
    _PUNCTUATION_TO_SPACES = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    # The spaCy pipeline is loaded once and shared by all the documents
    _nlp = None

    @classmethod
    def _get_nlp(cls):
        """Return the shared English spaCy pipeline, loading it on first use."""
        if cls._nlp is None:
            cls._nlp = spacy.load("en_core_web_sm")
        return cls._nlp

    @classmethod
    def _normalize(cls, text):
        """Map uncommon characters to common ones, lowercase, and blank out punctuation."""
        return unidecode.unidecode(text).lower().translate(cls._PUNCTUATION_TO_SPACES)

    @staticmethod
    def _nouns(processed_text):
        """Return the text of every token tagged as a noun, in order of appearance."""
        return [chain.text for chain in processed_text if chain.pos_ == 'NOUN']

    def __init__(self, file_path):
        """Read `file_path` and prepare its sentences and its normalised text."""
        # The name of the file is obtained
        self._file_name = os.path.basename(file_path)
        nlp = self._get_nlp()
        # The content of the text file is stored; the file is closed as soon as it is read
        with open(file_path, 'r') as text_file:
            raw_text = text_file.read()
        # Spacy is applied to the raw text and each sentence is stored on a list
        self._sentences = [str(sentence).strip() for sentence in nlp(raw_text).sents]
        # Spacy is applied again to the normalised text, which is the one used to count words
        self._processed_text = nlp(self._normalize(raw_text))

    def get_most_common_words(self):
        """Return the list of word dictionaries built by `determine_most_common_words`.

        Raises AttributeError if `determine_most_common_words` was never called.
        """
        return self._common_words_and_freq

    def determine_most_common_words(self, amount_of_words):
        """Store the `amount_of_words` most frequent nouns of the document.

        Each word is a dictionary with the keys 'value', 'frequency', 'file'
        and 'sentences' (initially empty). Only the first call has an effect.
        """
        # If this method is invoked more than once, the value of _common_words_and_freq is calculated only once
        if hasattr(self, '_common_words_and_freq'):
            return
        # The frequency of each noun is counted and the 'N' most frequent ones are taken
        most_common = Counter(self._nouns(self._processed_text)).most_common(amount_of_words)
        # Each word carries the name of its document and an empty list of sentence indexes
        self._common_words_and_freq = [
            {'value': value, 'frequency': frequency, 'file': self._file_name, 'sentences': []}
            for value, frequency in most_common
        ]

    def assign_sentences_to_words(self):
        """Record, for each common word, the indexes of the sentences containing it.

        Nothing is done if the common words were not determined yet, or if
        the sentences were already assigned by a previous call.
        """
        if not hasattr(self, '_common_words_and_freq') or getattr(self, '_sentences_assigned', False):
            return
        nlp = self._get_nlp()
        for sentence_index, sentence in enumerate(self._sentences):
            # The nouns of the sentence are computed once and reused for every word
            sentence_nouns = set(self._nouns(nlp(self._normalize(sentence))))
            for word in self._common_words_and_freq:
                if word['value'] in sentence_nouns:
                    # If the word exists on the sentence, the sentence's location on the text is stored
                    word['sentences'].append(sentence_index)
        self._sentences_assigned = True
                        

In [54]:
#REFACTORED TABLE CLASS FOR GOOGLE COLAB WITH FUNCTIONS TO MERGE WORDS ACROSS DOCS

In [62]:
class Table:
    """Table of entries that merges the words found on different documents.

    Each input word is a dictionary with the keys 'value', 'frequency',
    'file' and 'sentences'. Each entry of the table has the keys 'value',
    'frequency' (sum over the documents), 'files' and 'sentences_by_file'
    (one list of sentence indexes per file, in the same order as 'files').
    """

    def __init__(self, combined_document):
        """Build the table from an iterable of word dictionaries."""
        self._entries = []
        # Index from word value to its entries, so lookups do not scan the whole table
        self._entries_by_value = defaultdict(list)
        for word in combined_document:
            if self.word_exists_in_entries(word):
                # If the current word is already stored on the table of entries, the values of both are merged
                self.update_word_information(word)
            else:
                # If not, the new word is added to the table of entries
                self.insert_word_information(word)

    def get_entries(self):
        """Return the list of entries, in order of first appearance."""
        return self._entries

    def word_exists_in_entries(self, new_word):
        """Return True if the word already exists on the table of entries, and False otherwise."""
        return new_word['value'] in self._entries_by_value

    def insert_word_information(self, new_word):
        """Add the new word to the table of entries as a new entry."""
        entry = {
            'value': new_word['value'],
            'frequency': new_word['frequency'],
            'files': [new_word['file']],
            'sentences_by_file': [new_word['sentences']]
        }
        self._entries.append(entry)
        self._entries_by_value[new_word['value']].append(entry)

    def update_word_information(self, new_word):
        """Merge the new word into every stored entry that has the same value.

        The frequency is added up, and the file and its list of sentences are appended.
        """
        # .get is used so that looking up an unknown word does not add it to the index
        for entry in self._entries_by_value.get(new_word['value'], ()):
            entry['frequency'] += new_word['frequency']
            entry['files'].append(new_word['file'])
            entry['sentences_by_file'].append(new_word['sentences'])

In [56]:
# THIS SECTION DETERMINES THE WORD FREQUENCIES

In [63]:
# Number of most common words (nouns) to retrieve from each document
words_to_retrieve = 10

In [64]:
# Location of the input files
# NOTE(review): this looks like a Google Drive folder; presumably Drive has to be mounted before this cell runs
files_path = '/content/drive/MyDrive/sample_docs/'
# The working directory is moved to the input folder, so the pattern below is relative to it
os.chdir(files_path)
# All the file names with extension .txt are retrieved, in alphabetical order
file_names = sorted(glob.glob('*.txt'))
print(file_names)
# The list that will collect the most common words of every document starts empty
overall_most_common_words = []



[]
['doc1.txt', 'doc2.txt', 'doc3.txt', 'doc4.txt', 'doc5.txt', 'doc6.txt']


[]

In [65]:
# The list is filled with Document objects, one per input file and in the same order as file_names.
# os.path.join builds the path whether or not files_path ends with a separator.
documents = [Document(os.path.join(files_path, file_name)) for file_name in file_names]



In [66]:
# For every file, the significant words are determined, their frequency and the sentences where they appear.
# The list is reset first: otherwise re-running this cell would append every word a second time
# and the merged frequencies would be doubled.
overall_most_common_words = []
for document_index, document in enumerate(documents):
    print('Processing file \'' + file_names[document_index] + '\'...')
    # The words must be determined before the sentences can be assigned to them;
    # both methods do nothing when they are called again on the same document
    document.determine_most_common_words(words_to_retrieve)
    document.assign_sentences_to_words()
    # All the words retrieved from every file, are stored in one list
    overall_most_common_words += document.get_most_common_words()
# The table of entries is generated from overall_most_common_words in the next cell

Processing file 'doc1.txt'...
Processing file 'doc2.txt'...
Processing file 'doc3.txt'...
Processing file 'doc4.txt'...
Processing file 'doc5.txt'...
Processing file 'doc6.txt'...


In [67]:
# The words of all the documents are merged into a single table: one entry per distinct word,
# with its frequencies added up and its files and sentence indexes collected
table = Table(overall_most_common_words)
table_of_entries = table.get_entries()
print('These are the results of word frequencies:', table_of_entries)

These are the results of word frequencies: [{'value': 'people', 'frequency': 66, 'files': ['doc1.txt', 'doc2.txt', 'doc3.txt', 'doc4.txt', 'doc5.txt'], 'sentences_by_file': [[5, 14, 15, 22, 26, 36, 41, 46, 48, 57, 58, 113, 124], [20, 35, 44, 59, 99, 152, 174, 190, 199, 212], [3, 4, 13, 16, 18, 39, 42, 49, 52, 62, 87, 129], [9, 21, 45, 51, 62, 64, 74, 76, 80, 84, 85, 86, 90, 100, 103, 104, 105, 113, 121], [9, 11, 14, 17, 62, 97, 132, 163, 182]]}, {'value': 'today', 'frequency': 11, 'files': ['doc1.txt'], 'sentences_by_file': [[0, 6, 21, 28, 39, 48, 49, 60, 101, 127, 134]]}, {'value': 'generation', 'frequency': 11, 'files': ['doc1.txt'], 'sentences_by_file': [[38, 39, 74, 78, 82, 85, 87, 88, 92, 93, 108]]}, {'value': 'time', 'frequency': 60, 'files': ['doc1.txt', 'doc2.txt', 'doc3.txt', 'doc4.txt', 'doc5.txt', 'doc6.txt'], 'sentences_by_file': [[18, 30, 38, 39, 61, 62, 102, 103, 117, 133], [3, 27, 28, 56, 57, 93, 95, 101, 102, 109, 113, 114, 115, 129, 190, 199], [2, 3, 29, 42, 67, 116, 1

# Inspecting and ranking the results

In [68]:
import pprint

In [69]:
# Each entry of the table is printed on its own line
for entries in table_of_entries:
  print(entries)


{'value': 'people', 'frequency': 66, 'files': ['doc1.txt', 'doc2.txt', 'doc3.txt', 'doc4.txt', 'doc5.txt'], 'sentences_by_file': [[5, 14, 15, 22, 26, 36, 41, 46, 48, 57, 58, 113, 124], [20, 35, 44, 59, 99, 152, 174, 190, 199, 212], [3, 4, 13, 16, 18, 39, 42, 49, 52, 62, 87, 129], [9, 21, 45, 51, 62, 64, 74, 76, 80, 84, 85, 86, 90, 100, 103, 104, 105, 113, 121], [9, 11, 14, 17, 62, 97, 132, 163, 182]]}
{'value': 'today', 'frequency': 11, 'files': ['doc1.txt'], 'sentences_by_file': [[0, 6, 21, 28, 39, 48, 49, 60, 101, 127, 134]]}
{'value': 'generation', 'frequency': 11, 'files': ['doc1.txt'], 'sentences_by_file': [[38, 39, 74, 78, 82, 85, 87, 88, 92, 93, 108]]}
{'value': 'time', 'frequency': 60, 'files': ['doc1.txt', 'doc2.txt', 'doc3.txt', 'doc4.txt', 'doc5.txt', 'doc6.txt'], 'sentences_by_file': [[18, 30, 38, 39, 61, 62, 102, 103, 117, 133], [3, 27, 28, 56, 57, 93, 95, 101, 102, 109, 113, 114, 115, 129, 190, 199], [2, 3, 29, 42, 67, 116, 160, 161, 162, 163, 164, 165, 166], [0, 72, 77, 

In [70]:
# Quick check of the container type of the table of entries before sorting it
print(type(table_of_entries))

<class 'list'>


In [71]:
# The entries are ordered from the most frequent word to the least frequent one;
# the original table is left untouched
sorted_table_of_entries = sorted(
    table_of_entries,
    key=operator.itemgetter('frequency'),
    reverse=True,
)



In [72]:
# The sorted entries are loaded into a DataFrame
df = pd.DataFrame(sorted_table_of_entries)

# The first ten rows are displayed
df.head(10)

Unnamed: 0,value,frequency,files,sentences_by_file
0,people,66,"[doc1.txt, doc2.txt, doc3.txt, doc4.txt, doc5....","[[5, 14, 15, 22, 26, 36, 41, 46, 48, 57, 58, 1..."
1,time,60,"[doc1.txt, doc2.txt, doc3.txt, doc4.txt, doc5....","[[18, 30, 38, 39, 61, 62, 102, 103, 117, 133],..."
2,country,60,"[doc1.txt, doc2.txt, doc3.txt, doc4.txt, doc5....","[[2, 33, 41, 81, 94, 112], [6, 16, 17, 18, 22,..."
3,government,39,"[doc2.txt, doc3.txt, doc4.txt, doc5.txt]","[[13, 19, 78, 79, 120, 122, 129, 177, 196], [1..."
4,war,30,"[doc1.txt, doc5.txt]","[[3, 49, 55, 98, 99, 103, 106], [12, 14, 19, 2..."
5,promise,29,[doc2.txt],"[[6, 8, 9, 21, 62, 74, 75, 76, 77, 78, 82, 83,..."
6,care,26,"[doc1.txt, doc2.txt, doc3.txt]","[[55, 83, 85, 87, 107, 131], [30, 44, 45, 51, ..."
7,corruption,20,[doc4.txt],"[[51, 56, 57, 62, 65, 67, 68, 69, 71, 73, 74, ..."
8,troops,20,[doc5.txt],"[[21, 35, 44, 72, 74, 82, 83, 91, 102, 104, 10..."
9,weapons,20,[doc6.txt],"[[1, 4, 15, 16, 18, 19, 20, 22, 23, 35, 37, 49..."
