# Sparse vectors

## Objective
Project:
1. Generate TF-IDF matrix from Mahabharat text.

    a. Section as a document

    b. paragraph as a document

    c. Calculate similarity of common words from the TFIDF matrix

2. Generate term-term matrix weighted by PMI with co-occurance criteria as

    a. section as a document

    b. paragraph as a document
    
    c. calculate similarity of common words from the PMID matrix
    


# Text description


Experiment text has been taken from [project gutenberg](https://www.gutenberg.org/ebooks/search/?query=The+Mahabharata+of+Krishna-Dwaipayana+Vyasa) 

# Text preprocessing

In [5]:
# get structured text from raw_text
import ujson
import time

for volume in range(1,5):    
    with open(f"./data/mahabharat_gutenberg_vol{volume}.txt") as fp:
        begin, section_begin = False, False
        para = ""
        sections = []
        section = None

        with open("./data/mahabharat_gutenberg.jsonl", "a+") as jsonl_fp:

            for i, line in enumerate(fp.readlines()):
                # ignore introductory sections
                if not begin and line.strip() != "THE MAHABHARATA":
                    continue

                begin = True

                # collect all sections as list of paras such that corpus is list of sections, sections is list of paragraphs
                if line.startswith("SECTION "):
                    # write previous section to file
                    if section:
                        #
                        # split section text to paragraphs.
                        paragraphs = section["text"].split("\n\n")
                        paragraphs = [para.strip().replace("\n", " ") for para in paragraphs if
                                      para and "Parva continued" not in para]

                        section["paragraphs"] = paragraphs
                        # print(f"{volume}:{section['SECTION_ID']}:{len(paragraphs)}")
                        del section["text"]
                        sections.append(section)
                        jsonl_fp.write(f"{ujson.dumps(section)}\n")
                        # break
                    section_begin = True
                    section = {"volume":volume, "SECTION_ID": line.strip(), "text": ""}
                else:
                    # add line to section text
                    if section:
                        section["text"] += line

                # ignore after footnotes
                if line.strip() == "FOOTNOTES":
                    break


In [6]:
# get documents from raw_text

def get_sections_from_text(filepath: str) -> list:
    section_wise_corpus = []
    with open(filepath) as fp:
        for line in fp.readlines():
            section = ujson.loads(line)
            section_text = "".join(section["paragraphs"])
            section_wise_corpus.append(section_text)
    return section_wise_corpus


def get_paragraphs_from_text(filepath: str) -> list:
    para_wise_corpus = []
    with open(filepath) as fp:
        for line in fp.readlines():
            section = ujson.loads(line)
            paragraph_texts = section["paragraphs"]
            para_wise_corpus.extend(paragraph_texts)
    return para_wise_corpus

def get_sentences_from_text(filepath:str)->list:
    sent_wise_corpus = []
    with open(filepath) as fp:
        for line in fp.readlines():
            section = ujson.loads(line)
            section_text = "".join(section["paragraphs"])
            section_doc = nlp(section_text)
            section_sents = [sent.lemma_ for sent in section_doc]
            sent_wise_corpus.extend(section_sents)
    return sent_wise_corpus



# TD-IDF

In [13]:
# similarity related functions
# get top 10 similar words
import numpy as np
import re

import spacy
nlp = spacy.load("en_core_web_sm")

al_regex = re.compile('[^a-zA-Z]')

class SpacyLemmaTokenizer:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
    
    def __call__(self, text):
        text = text.encode('utf-8','ignore').decode("utf-8")
        lemmas =  [token.lemma_ for token in self.nlp(text) if not (token.is_stop or token.is_punct)]
        # select only nouns
        # lemmas =  [token.lemma_ for token in self.nlp(text) if token.pos_ in ["VERB"]]
        
        lemmas  = [al_regex.sub('', lemma) for lemma in lemmas]
        lemmas  = [lemma.strip() for lemma in lemmas if lemma.strip()]
        return lemmas


def get_top_similar_words(word, similarity_matrix, vectorizer, top_n=10):
    word_index = vectorizer.vocabulary_[word]
    word_similarity = similarity_matrix[word_index]
    top_n_similar_words = np.argsort(word_similarity)[::-1][:top_n+1]
    # omit the word itself
    return [vectorizer.get_feature_names_out()[i] for i in top_n_similar_words[1:]]


In [14]:
#document: section

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


vectorizer = TfidfVectorizer(tokenizer=SpacyLemmaTokenizer())

# get document-term matrix via vectorizer
SECTION_WISE_CORPUS = get_sections_from_text("./data/mahabharat_gutenberg.jsonl")

document_term_matrix = vectorizer.fit_transform(SECTION_WISE_CORPUS)

# transpose X to get term-document matrix
term_document_matrix = document_term_matrix.T.toarray()
print(term_document_matrix.shape)

# get similarity of words
s_time= time.time()
section_doc_similarity_matrix = cosine_similarity(term_document_matrix)
print(f"cosine_similarity took:{time.time() - s_time} secs")
# print(section_doc_similarity_matrix.shape)


for word in ["krishna", "mace", "bhishma", "drona", "pandu"]:
    print(f"{word}: {get_top_similar_words(word, section_doc_similarity_matrix, vectorizer)}")


(16045, 4998)
cosine_similarity took:7.4619505405426025 secs
krishna: ['vasudeva', 'kesava', 'gobardhana', 'armsbhishma', 'valin', 'brindavana', 'languishe', 'vyahritis', 'predestiny', 'absurd']
mace: ['impetuously', 'spike', 'officiousness', 'gloved', 'uncharged', 'womenon', 'rush', 'kapittha', 'tumult', 'apple']
bhishma: ['say', 'kuru', 'o', 'pandu', 'son', 'kurus', 'great', 'repel', 'santanu', 'excellentindeed']
drona: ['pupil', 'bharadwaja', 'aswatthaman', 'prishata', 'weapon', 'aim', 'unafflicted', 'hiranyadhanus', 'defied', 'reproachingly']
pandu: ['kunti', 'dhritarashtra', 'son', 'vaisampayana', 'kuru', 'bhishma', 'kurus', 'citizen', 'slope', 'madri']


In [None]:
#document: paragraph
#TFIDF based on paragraphs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer(tokenizer=SpacyLemmaTokenizer())


# get document term matrix via vectorizer
PARA_WISE_CORPUS = get_paragraphs_from_text("./data/mahabharat_gutenberg.jsonl")

document_term_matrix = vectorizer.fit_transform(PARA_WISE_CORPUS)

# transpose X to get term-document matrix
term_document_matrix = document_term_matrix.T.toarray()


# get similarity of words
s_time= time.time()
para_doc_similarity_matrix = cosine_similarity(term_document_matrix)
print(f"cosine_similarity took:{time.time() - s_time} secs")

for word in ["krishna", "mace", "bhishma", "drona", "pandu"]:
    print(f"{word}: {get_top_similar_words(word, para_doc_similarity_matrix, vectorizer)}")
# get_top_similar_words("", similarity_matrix, vectorizer)
# OOM error

In [None]:
# document: sentence

#document: paragraph
#TFIDF based on sections without lemmatization

# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# 
# vectorizer = TfidfVectorizer()
# 
# para_wise_corpus = get_sentences_from_text("/home/ankush/workplace/nlp_projects/nlp-notebooks/vector_semantics/data/mahabharat_gutenberg.jsonl")
# 
# # get document term matrix via vectorizer
# document_term_matrix = vectorizer.fit_transform(para_wise_corpus)
# 
# # transpose X to get term document matrix
# term_document_matrix = document_term_matrix.T.toarray()
# print(term_document_matrix.shape)
# print(vectorizer.get_feature_names_out())
# 
# 
# # get similarity of words
# 
# similarity_matrix = cosine_similarity(term_document_matrix)
# print(similarity_matrix.shape)
# 
# get_top_similar_words("drona", similarity_matrix, vectorizer)

# MemoryError: Unable to allocate 42.8 GiB for an array with shape (10579, 542676) and data type float64
# TODO: how to handle out of memory matrices

# PMI: Pointwise Mutual Information

In [9]:
# convert co-occurence matrix to PMI matrix
import numpy as np
import math

np.set_printoptions(formatter={'float_kind':'{:f}'.format})

def convert_coccurence_matrix_to_pmi(Xc):    
    # Step 1: Calculate co-occurrence probabilities
    total_word_occurrences = np.sum(Xc)
    print(f"total_word_occurrences:{total_word_occurrences}")
    
    co_occurrence_probs = Xc / total_word_occurrences
    #print(f"co_occurrence_probs: {co_occurrence_probs}")
    
    # Step 2: Calculate word probabilities
    # calculate word probabilities as marginal probabilities i.e sum of probabilities across all the words
    word_probs = co_occurrence_probs.sum(axis=1)    
    # print(f"word_probs: {word_probs}")
    
    # Step 3: Calculate PMI matrix
    pmi_matrix = np.zeros(Xc.shape, dtype=float)
    # print(f"Xc.shape:{Xc.shape}")
    # print(f"pmi_matrix.shape:{pmi_matrix.shape}")
    
    for i in range(Xc.shape[0]):
        for j in range(Xc.shape[1]):
            p_a = word_probs[i]
            p_b = word_probs[j]
            p_ab = co_occurrence_probs[i, j]
            if p_a > 0 and p_b > 0 and p_ab > 0:
                pmi_matrix[i][j] = math.log2(p_ab / (p_a * p_b))

    return pmi_matrix

def optimized_convert_coccurence_matrix_to_pmi(Xc):
    total_word_occurrences = np.sum(Xc)

    # Step 1: Calculate co-occurrence probabilities
    co_occurrence_probs = Xc / total_word_occurrences
    # print(f"co_occurrence_probs: {co_occurrence_probs}")

    # Step 2: Calculate word probabilities
    p_xi = co_occurrence_probs.sum(axis=1)
    
    # divide each row by the sum of the row
    co_occurrence_probs = co_occurrence_probs / p_xi

    # divide each column by the sum of the column
    co_occurrence_probs = co_occurrence_probs / p_xi.T
    # take log2 of the matrix
    co_occurrence_probs = np.log2(co_occurrence_probs)
    return co_occurrence_probs


In [10]:
#example PMI calculation of small data

from sklearn.feature_extraction.text import CountVectorizer
# import one hot encoder
from sklearn.preprocessing import OneHotEncoder
docs = ['this this this book',
        'this cat good',
        'cat good shit']



count_model = CountVectorizer(binary=True) 
document_term_matrix = count_model.fit_transform(docs)
print(count_model.get_feature_names_out())
# print(X)
term_term_matrix = (document_term_matrix.T * document_term_matrix) # this is co-occurrence matrix in sparse csr format
term_term_matrix.setdiag(0) # sometimes you want to fill same word cooccurence to 0
print(term_term_matrix.todense())
convert_coccurence_matrix_to_pmi(term_term_matrix)



['book' 'cat' 'good' 'shit' 'this']
[[0 0 0 0 1]
 [0 0 2 1 1]
 [0 2 0 1 1]
 [0 1 1 0 0]
 [1 1 1 0 0]]
total_word_occurrences:14


array([[0.000000, 0.000000, 0.000000, 0.000000, 2.222392],
       [0.000000, 0.000000, 0.807355, 0.807355, 0.222392],
       [0.000000, 0.807355, 0.000000, 0.807355, 0.222392],
       [0.000000, 0.807355, 0.807355, 0.000000, 0.000000],
       [2.222392, 0.222392, 0.222392, 0.000000, 0.000000]])

In [11]:
# Calculate word vectors using PMI wighting
#document: section

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity



# print(section_wise_corpus[0])
# get document-term matrix via Count vectorizer

count_vectorizer = CountVectorizer(tokenizer=SpacyLemmaTokenizer())
SECTION_WISE_CORPUS = get_sections_from_text("./data/mahabharat_gutenberg.jsonl")

s_time = time.time()
document_term_matrix = count_vectorizer.fit_transform(SECTION_WISE_CORPUS)
print(f"fit_transform took: {time.time()- s_time}")

# get term-term matrix
term_term_matrix = (document_term_matrix.T * document_term_matrix) 
term_term_matrix.setdiag(0)

print(f"term_term_matrix.shape:{term_term_matrix.shape}")

# list all features ordered by frequency
print(count_vectorizer.get_feature_names_out())

# convert couccurence_matrix to PMI matrix
term_pmi_matrix = optimized_convert_coccurence_matrix_to_pmi(term_term_matrix)
term_pmi_matrix[term_pmi_matrix == -np.inf] = 0

# get similarity of words

section_doc_pmi_similarity_matrix = cosine_similarity(term_pmi_matrix)
# print(section_doc_pmi_similarity_matrix.shape)

for word in ["krishna", "mace", "bhishma", "drona", "pandu"]:
    print(f"{word}: {get_top_similar_words(word, section_doc_pmi_similarity_matrix, count_vectorizer)}")
# get_top_similar_words("", similarity_matrix, vectorizer)



term_term_matrix.shape:(16045, 16045)
['a' 'abandon' 'abandonest' ... 'zealously' 'zodiac' 'zone']




krishna: ['wonderful', 'vasudeva', 'draupadi', 'compose', 'arjuna', 'pandava', 'vyasa', 'pandavas', 'poet', 'aranya']
mace: ['discus', 'host', 'huge', 'shower', 'roar', 'sword', 'bhima', 'fight', 'tempestuous', 'rush']
bhishma: ['yudhishthira', 'pandu', 'kurus', 'vyasa', 'pandava', 'bharata', 'history', 'prince', 'arrow', 'drona']
drona: ['kripa', 'aswatthaman', 'arjuna', 'karna', 'kauravas', 'duryodhana', 'pandava', 'pandavas', 'chariot', 'panchala']
pandu: ['kunti', 'vaisampayana', 'pandava', 'kurus', 'hath', 'dhritarashtra', 'bhimasena', 'kuru', 'arjuna', 'vidura']


In [12]:
# Calculate word vectors using PMI wighting
#document: paragraph
import time
import ujson
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# get document-term matrix via Count vectorizer
count_vectorizer = CountVectorizer(tokenizer=SpacyLemmaTokenizer()) 
s_time = time.time()
PARA_WISE_CORPUS = get_paragraphs_from_text("./data/mahabharat_gutenberg.jsonl")
document_term_matrix = count_vectorizer.fit_transform(PARA_WISE_CORPUS)
print(f"document_matrix formation took:{time.time() - s_time} secs")

# get term-term matrix
term_term_matrix = (document_term_matrix.T * document_term_matrix)
term_term_matrix.setdiag(0)

print(f"term_term_matrix.shape:{term_term_matrix.shape}")

# list all features ordered by frequency
print(count_vectorizer.get_feature_names_out())

# convert couccurence_matrix to PMI matrix
s_time = time.time()
term_pmi_matrix = optimized_convert_coccurence_matrix_to_pmi(term_term_matrix)
term_pmi_matrix[term_pmi_matrix == -np.inf] = 0
print(f"convert_coccurence_matrix_to_pmi took:{time.time() - s_time} secs")


s_time = time.time()
para_doc_pmi_similarity_matrix = cosine_similarity(term_pmi_matrix)
print(f"cosine_similarity took:{time.time() - s_time} secs")

for word in ["krishna", "mace", "bhishma", "drona", "pandu"]:
    print(f"{word}: {get_top_similar_words(word, para_doc_pmi_similarity_matrix, count_vectorizer)}")
# get_top_similar_words("", similarity_matrix, vectorizer)



document_matrix formation took:509.12226700782776 secs
term_term_matrix.shape:(15347, 15347)
['abandon' 'abandonest' 'abandonment' ... 'zealously' 'zodiac' 'zone']




convert_coccurence_matrix_to_pmi took:4.455509424209595 secs




cosine_similarity took:16.46736788749695 secs
krishna: ['vasudeva', 'arjuna', 'pandava', 'madhava', 'draupadi', 'subhadra', 'hero', 'pandavas', 'vrishni', 'khandava']
mace: ['discus', 'going', 'swah', 'pubescen', 'abdoman', 'baffle', 'trice', 'sugriva', 'overspread', 'lxiv']
bhishma: ['kurus', 'address', 'yudhishthira', 'santanu', 'drona', 'kuru', 'arjuna', 'continue', 'bharata', 'old']
drona: ['kripa', 'arjuna', 'aswatthaman', 'karna', 'warrior', 'duryodhana', 'encounter', 'weapon', 'arrow', 'fight']
pandu: ['kunti', 'pandava', 'vaisampayana', 'arjuna', 'dhritarashtra', 'kurus', 'kuru', 'yudhishthira', 'duryodhana', 'pritha']


# References:
1. [(Chapter 6- Vector Semantics and Embeddings) in Speech and Language Processing. Daniel Jurafsky & James H. Martin.](https://web.stanford.edu/~jurafsky/slp3/6.pdf)
2. [sklearn.feature_extraction.text](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text)