<a href="https://colab.research.google.com/github/Bateyjosue/NLP_Fellowship/blob/main/tf-idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## In-class practicals
Create a class using code above

In [4]:
import numpy as np
import math

In [9]:
class TDIDF:
    def __init__(self):
        
        self.word_to_index = {}
        self.index_to_word = {}
        self.idf = {}
        self.num_documents = 0
        

    def term_frequency(self,sentence):
        word_dict = {}
        for word in sentence.split():
          word_dict[word] = word_dict.get(word, 0) + 1
        return word_dict

    def fit(self,data):

        # Enter code here. It will involve getting the idf and the word_to_index. Returns nothing
        word_dict = {}
        global_freq = {}
        for sentence in data:
          for word in sentence.split():
            word_dict[word] = word_dict.get(word, 0) + 1

          for word in word_dict:
            global_freq[word] = global_freq.get(word, 0) + 1

        for word, freq in global_freq.items():
          self.idf[word] = math.log((1 + len(data)/(1 + freq)))
          document = list(global_freq.keys())
          for position in range(len(document)):
            word = document[position]
            self.word_to_index[word] = position
            self.index_to_word[position] = word



    def transform(self, data):
        if isinstance(data,list):
            return self._transform_document(data)
        elif isinstance(data,str):
            return self._tranform_sentence(data)

    def _transform_document(self,data):
        #used for processing multiple sentences
        sentence_arrays = []
        sentence_tf_idf = {}
        document_freq = {}

        for sent in data:
          tokens = [token for foken in sent.split()]
          word_array = np.zeros(len(self.word_to_index))
          for word in tokens:
            document_freq[word] = document_freq.get(word, 0) + 1
          
          total_words = sum(document_freq.values())
          averaged_freq = {k:(float(v)/total_words) for k,v in document_freq.items()}

          for term, tf in averaged_freq.items():
            sentence_tf_idf[term] = tf * self.idf.get(term, 0)

          for token in tokens:
            if token in self.word_of_index:
              token_index = self.word_to_index[token]
              word_array[token_index] = sentence_tf_idf[token]
          
          sentence_arrays.append(word_array)
        return np.matrix(sentence_arrays)

    def _tranform_sentence(self,data):
        # given a sentence get the average frequency and multiply it with the idf. 
        # Then place the values in the word array
        document_freq = {}
        sentence_tf_idf = {}
        word_array = np.zeros(len(self.word_to_index))
        for word in data.split():
          document_freq[word] = document_freq.get(word, 0) + 1
        
        total_words = sum(document_freq.values())
        averaged_freq = {k:(float(v)/total_words) for k,v in document_freq.items()}
        
        for term, tf in averaged_freq.items():
          sentence_tf_idf[term] = tf * self.idf.get(term, 0)

        for token in data.split():
          if token in self.word_to_index:
            token_index = self.word_to_index[token]
            word_array[token_index] = sentence_tf_idf[token]
        return word_array

    def _compute_sentence_tf_idf(self, sentence):
        """
        Computes the tf_idf for a single sentence(document).
        """
        sentence_tf_idf = {}
        # Gets the document frequency by using the helper method
        document_frequency = self.term_frequency(sentence, self.ignore_tokens, self.lower_case)
        # Gets the total number of words in sentence
        total_words = sum(document_frequency.values())
        # Find individual term frequency value averaged by total number of words.
        averaged_frequency = {k:(float(v)/total_words) for k,v in document_frequency.items()}
        
        for term, tf in averaged_frequency.items():
            # Out of vocabulary words are simply zeroed. They are going to be removed later either way.
            # Computes the tfidf for each word by using word tf times the term idf
            sentence_tf_idf[term] = tf*self.idf.get(term, 0)
        return sentence_tf_idf

In [10]:
# Testing your code
tdidf = TDIDF()
tdidf.fit(['this is a list of sentences', 'second sentence in list of sentences', 'a word for complexity'])
print(tdidf.word_to_index)
print(tdidf.idf)
print(tdidf.transform("this is a sentence with two words sentence in")) # it should pick either sentence or array of sentences

{'this': 0, 'is': 1, 'a': 2, 'list': 3, 'of': 4, 'sentences': 5, 'second': 6, 'sentence': 7, 'in': 8, 'word': 9, 'for': 10, 'complexity': 11}
{'this': 0.5596157879354227, 'is': 0.5596157879354227, 'a': 0.5596157879354227, 'list': 0.5596157879354227, 'of': 0.5596157879354227, 'sentences': 0.5596157879354227, 'second': 0.6931471805599453, 'sentence': 0.6931471805599453, 'in': 0.6931471805599453, 'word': 0.9162907318741551, 'for': 0.9162907318741551, 'complexity': 0.9162907318741551}
[0.06217953 0.06217953 0.06217953 0.         0.         0.
 0.         0.15403271 0.07701635 0.         0.         0.        ]


# Assignment


*   Compare results with sklearn package: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
*   Use the code above on the cleaned text