In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt_tab')
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt_tab to /home/don/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Sample paragraph used as the input text for the tokenising and stemming cells below.
quote = 'The first step to get a bag of words vector is to split the text into words (tokens) and then reduce words to their base forms. For example, “running” will transform into “run”. This process is called stemming. We can use the NLTK Python package for it.'

In [3]:
# Split the quote into tokens; punctuation marks come back as separate tokens.
tokens = word_tokenize(quote)
print(tokens)

['The', 'first', 'step', 'to', 'get', 'a', 'bag', 'of', 'words', 'vector', 'is', 'to', 'split', 'the', 'text', 'into', 'words', '(', 'tokens', ')', 'and', 'then', 'reduce', 'words', 'to', 'their', 'base', 'forms', '.', 'For', 'example', ',', '“', 'running', '”', 'will', 'transform', 'into', '“', 'run', '”', '.', 'This', 'process', 'is', 'called', 'stemming', '.', 'We', 'can', 'use', 'the', 'NLTK', 'Python', 'package', 'for', 'it', '.']


In [4]:
stemmer = SnowballStemmer(language = "english")
# Variant 1: stem every token with map(), producing a new list.
stemmed_words = list(map(lambda x: stemmer.stem(x), tokens))
# Variant 2: stem with an explicit index loop. Note that this overwrites
# `tokens` in place, so after this cell `tokens` holds stems, not raw tokens.
for i in range(len(tokens)):
    tokens[i] = stemmer.stem(tokens[i])
# Both variants must agree. The previous check compared `stemmed_words` with
# itself and therefore could never fail.
print(stemmed_words==tokens)

True


In [5]:
import collections
# Bag of words for the quote: each distinct stem mapped to its occurrence count.
bag_of_words = collections.Counter(stemmed_words)
print(bag_of_words)

Counter({'.': 4, 'the': 3, 'to': 3, 'word': 3, 'is': 2, 'into': 2, 'for': 2, '“': 2, 'run': 2, '”': 2, 'first': 1, 'step': 1, 'get': 1, 'a': 1, 'bag': 1, 'of': 1, 'vector': 1, 'split': 1, 'text': 1, '(': 1, 'token': 1, ')': 1, 'and': 1, 'then': 1, 'reduc': 1, 'their': 1, 'base': 1, 'form': 1, 'exampl': 1, ',': 1, 'will': 1, 'transform': 1, 'this': 1, 'process': 1, 'call': 1, 'stem': 1, 'we': 1, 'can': 1, 'use': 1, 'nltk': 1, 'python': 1, 'packag': 1, 'it': 1})


In [6]:
def createMap(collections) -> dict[str, int]:
    """Count how often each item occurs in an iterable.

    This is a hand-rolled equivalent of ``collections.Counter`` that returns a
    plain ``dict``.

    Args:
        collections: Iterable of hashable items (for example stemmed tokens).
            Inside this function the parameter shadows the ``collections``
            module; the name is kept so existing keyword callers still work.

    Returns:
        A dict mapping every distinct item to its number of occurrences, in
        first-seen order.
    """
    mapCount = {}
    for item in collections:
        # get() covers both the first occurrence (default 0) and later ones,
        # so no separate membership test is needed.
        mapCount[item] = mapCount.get(item, 0) + 1
    return mapCount
# Sanity check: the hand-rolled counter should agree with collections.Counter.
print(createMap(stemmed_words)==bag_of_words)

True


In [7]:
# Three one-word toy documents forming a miniature corpus.
doc_1, doc_2, doc_3 = "Bob", "Alice", "Frank"
docs_collection = [doc_1, doc_2, doc_3]

# Tokenise and stem each document, keeping one list of stems per document.
stemmed_collections = []
for doc in docs_collection:
    tokens = word_tokenize(doc)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    stemmed_collections.append(stemmed_tokens)

# One bag of words (stem -> count) per document.
bow_collections = list(map(collections.Counter, stemmed_collections))
print(bow_collections)

[Counter({'bob': 1}), Counter({'alic': 1}), Counter({'frank': 1})]


In [8]:
def createSegment(segTemp, newDic):
    """Add every key of ``newDic`` to the vocabulary template ``segTemp``.

    New keys are inserted with the value 0; keys that already exist keep their
    current value. ``segTemp`` is updated in place and also returned.

    Note: a later cell redefines ``createSegment`` to take a list of dicts as
    its second argument; this version takes a single dict.
    """
    for term in newDic.keys():
        segTemp.setdefault(term, 0)
    return segTemp

# Build the shared vocabulary template: every stem seen in any document,
# each mapped to 0.
segTemp = {}
for bow in bow_collections:
    segTemp = createSegment(segTemp, bow)
print(segTemp)

{'bob': 0, 'alic': 0, 'frank': 0}


In [9]:
# Turn every bag of words into a count vector over the shared vocabulary.
vec_collection = []
for count, bow in enumerate(bow_collections):
    # Start from a fresh copy of the all-zero template so the documents do not
    # share (and overwrite) one dict.
    simple_vec = segTemp.copy()
    # Overwrite the zeros with this document's counts.
    simple_vec.update(bow)
    print("Vec " ,count, ": ", simple_vec, "\n")
    vec_collection.append(simple_vec)
print(vec_collection)



Vec  0 :  {'bob': 1, 'alic': 0, 'frank': 0} 

Vec  1 :  {'bob': 0, 'alic': 1, 'frank': 0} 

Vec  2 :  {'bob': 0, 'alic': 0, 'frank': 1} 

[{'bob': 1, 'alic': 0, 'frank': 0}, {'bob': 0, 'alic': 1, 'frank': 0}, {'bob': 0, 'alic': 0, 'frank': 1}]


In [10]:
import math

def tokenizeStemBow(docs_collection):
    """Convert raw text documents into bags of stemmed words.

    Each document is tokenised with ``word_tokenize``, every token is stemmed
    with the notebook-level ``stemmer``, and the stems are counted.

    Args:
        docs_collection: Iterable of document strings.

    Returns:
        A list with one ``collections.Counter`` (stem -> count) per document,
        in input order.
    """
    bags = []
    for text in docs_collection:
        stems = [stemmer.stem(token) for token in word_tokenize(text)]
        bags.append(collections.Counter(stems))
    return bags

def createSegment(segTemp, newDic):
    """Extend the vocabulary template with the keys of several dicts.

    Args:
        segTemp: Dict used as the vocabulary template; updated in place.
        newDic: Iterable of dicts (for example bags of words) whose keys are
            added.

    Returns:
        ``segTemp`` itself. Keys that were missing are added with value 0;
        existing keys keep their value.
    """
    for bag in newDic:
        for term in bag.keys():
            segTemp.setdefault(term, 0)
    return segTemp

    
def createEmbedding(bow_collection, collection):
    """Project each bag of words onto the vocabulary template.

    Args:
        bow_collection: Iterable of dicts mapping term -> count.
        collection: Vocabulary template (term -> 0). It is copied for each
            document and never modified.

    Returns:
        A list with one vector per bag: a copy of the template in which the
        bag's terms carry the bag's counts.
    """
    def embed(bag):
        vector = collection.copy()
        for term, count in bag.items():
            vector[term] = count
        return vector

    return [embed(bag) for bag in bow_collection]

def computeTF(collection) -> list[dict[str,float]]:
    """Compute term frequencies for every document vector.

    Each positive count is divided by the sum of all counts in that document.
    Entries that are not positive, and documents whose counts sum to zero or
    less, are left as they are.

    Args:
        collection: List of dicts mapping term -> count.

    Returns:
        A new list of copied dicts; the input vectors are not modified.
    """
    normalised = []
    for doc in collection:
        freqs = doc.copy()
        total = sum(freqs.values())
        if total > 0:
            for term in freqs:
                count = freqs[term]
                if count > 0:
                    freqs[term] = count / total
        normalised.append(freqs)
    return normalised

def computeIDF(collection) -> dict[str, float]:
    """Compute the inverse document frequency of every vocabulary term.

    The vocabulary is taken from the keys of the first document vector. For
    each term, ``idf = log(N / df)`` where ``N`` is the number of documents and
    ``df`` is the number of documents in which the term's count is positive.

    Args:
        collection: List of dicts mapping term -> count.

    Returns:
        A single dict mapping term -> idf (one value per term, not one dict per
        document). An empty collection yields ``{}``. A term that occurs in no
        document gets ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    N = len(collection)
    if N == 0:
        return {}

    idf = {}
    for key in collection[0]:
        # get() tolerates vectors that lack the key and treats it as count 0.
        doc_freq = sum(1 for doc in collection if doc.get(key, 0) > 0)
        # df == 0 would divide by zero; such a term carries no weight.
        idf[key] = math.log(N / doc_freq) if doc_freq else 0.0
    return idf

def computeTFIDF(collection) -> list[dict[str,float]]:
    """Compute TF-IDF weights for every document vector.

    Each weight is the term frequency from ``computeTF`` multiplied by the
    inverse document frequency from ``computeIDF``. Only the terms present in
    the first document vector are weighted, matching the vocabulary that
    ``computeIDF`` uses.

    Args:
        collection: List of dicts mapping term -> count.

    Returns:
        A new list of dicts mapping term -> tf-idf weight. The input vectors
        are left untouched, so the raw counts remain available to the caller.
        An empty collection yields ``[]``.
    """
    if not collection:
        return []

    tf_list = computeTF(collection)
    idf = computeIDF(collection)

    weighted = []
    for doc, tf in zip(collection, tf_list):
        # Work on a copy so the caller's count vectors are not overwritten.
        vec = doc.copy()
        for key in collection[0]:
            vec[key] = tf[key] * idf[key]
        weighted.append(vec)
    return weighted

def computeWeights(weight: str, collection):
    """Apply the weighting scheme selected by ``weight`` to ``collection``.

    Args:
        weight: ``"tf"``, ``"idf"`` or ``"tfidf"``.
        collection: List of dicts mapping term -> count.

    Returns:
        The result of ``computeTF``, ``computeIDF`` or ``computeTFIDF``
        respectively, or ``None`` for any other ``weight`` value.
    """
    if weight == "tf":
        return computeTF(collection)
    if weight == "idf":
        return computeIDF(collection)
    if weight == "tfidf":
        return computeTFIDF(collection)
    # Unknown scheme names are not an error; the caller receives None.
    return None



In [11]:
def tokenizeSegmentEmbedding(bow_collection):
    """Run the full text -> count-vector pipeline.

    Despite its name, ``bow_collection`` is passed straight to
    ``tokenizeStemBow`` and is therefore expected to hold raw document strings.

    Returns:
        A list with one dict per document, mapping every term of the shared
        vocabulary to that document's count.
    """
    bags = tokenizeStemBow(bow_collection)
    vocabulary = createSegment({}, bags)
    return createEmbedding(bags, vocabulary)

def computeRelevance(rel: str, query: list[dict[str,int]], collection: list[dict[str, float]])-> "list[float] | None":
    """Score every document against the query with the chosen relevance model.

    Args:
        rel: Name of the relevance model. Only ``"sow"`` (sum of weights) is
            implemented.
        query: List whose first element is the query's term -> count vector.
        collection: List of weighted document vectors (term -> weight).

    Returns:
        One score per document (a list of numbers, not a list of dicts), or
        ``None`` when ``rel`` names an unknown model.
    """
    if rel == "sow":
        return sumOfWeights(query, collection)
    else:
        return None
        
def sumOfWeights(query, collection):
    """Score each document as the query-weighted sum of its term weights.

    For every document the score is ``sum(query_count * doc_weight)`` over the
    terms of the query vector.

    Args:
        query: List whose first element is the query's term -> count vector.
            An empty list is treated as a query without terms.
        collection: List of document vectors mapping term -> weight.

    Returns:
        A list with one score per document, in input order.
    """
    query_vec = query[0] if query else {}
    relevance = []
    for doc in collection:
        # A query term absent from the document contributes nothing instead of
        # raising KeyError.
        rel_score = sum(value * doc.get(key, 0) for key, value in query_vec.items())
        relevance.append(rel_score)
    return relevance

def computeScore(rel, query, collection):
    """Rank documents against a free-text query using TF-IDF weights.

    The query string is prepended to the documents before weighting, so it
    takes part in the vocabulary and in the document frequencies.

    Args:
        rel: Relevance model name forwarded to ``computeRelevance``.
        query: Query text.
        collection: List of document texts.

    Returns:
        Whatever ``computeRelevance`` returns: for ``"sow"`` a list of scores
        whose element 0 belongs to the query itself and whose elements 1..n
        belong to the documents, in order.
    """
    documents = [query] + collection
    query_vectors = tokenizeSegmentEmbedding([query])
    document_vectors = tokenizeSegmentEmbedding(documents)
    document_weights = computeWeights("tfidf", document_vectors)
    return computeRelevance(rel, query_vectors, document_weights)

# computeScore prepends the query to the corpus, so the first printed score is
# the query scored against itself and the remaining three are the documents.
relevance_score = computeScore("sow", "Who is batman", ["My name is Bob", "My name is Alice", "I am Batman"])
print(relevance_score)

[0.7890412047105388, 0.07192051811294521, 0.07192051811294521, 0.23104906018664842]


In [17]:
# Second example: three short documents and a query about football.
query = "Who likes football?"
text1 = "My name is alice and i like ping pong."
text2 = "My name is franc and i like swimming."
text3 = "My name is bob and i like football"
collection = [text1, text2, text3]
relevance_score = computeScore("sow", query, collection)
# relevance_score[0] is the query's own score (computeScore prepends the query
# to the corpus), hence the i+1 offset when reading the document scores.
for i in range(len(collection)):
    print("Relevance score for doc", i+1, " : ", relevance_score[i+1])

Relevance score for doc 1  :  0.0
Relevance score for doc 2  :  0.0
Relevance score for doc 3  :  0.08664339756999316
