## Prerequisites

*Imports*

In [3]:
import math

## Model Expert AO Class

In [6]:
# ======================================================================================================================
# Class - ModelExpertAO
# ======================================================================================================================
class ModelExpertAO:
    """Stateless TF-IDF helpers for documents given as lists of tokens.

    Every method is a ``staticmethod``; the class only acts as a namespace.
    """

    @staticmethod
    def get_TF_v2(bag_of_words, words_freq_dict):
        """Return the term frequency of every word for one document.

        Args:
            bag_of_words: The document's tokens. Only its length is used,
                as the normalising denominator.
            words_freq_dict: Mapping ``word -> occurrence count`` for the
                same document.

        Returns:
            A dict mapping every key of ``words_freq_dict`` to
            ``count / len(bag_of_words)``. For an empty document every
            frequency is ``0.0``.
        """
        words_count = len(bag_of_words)
        if words_count == 0:
            # An empty document has no terms; avoid dividing by zero.
            return {w: 0.0 for w in words_freq_dict}
        return {w: frq / float(words_count) for w, frq in words_freq_dict.items()}

    @staticmethod
    def get_IDF_v2(words_freq_dict_list):
        """Return the inverse document frequency of every known word.

        Args:
            words_freq_dict_list: One ``word -> count`` dict per document.

        Returns:
            A dict mapping each word found in any of the dicts to
            ``log(number_of_documents / number_of_documents_containing_it)``
            (natural logarithm). A word whose count is zero in every
            document gets ``0.0``. An empty input list yields ``{}``.
        """
        docs_count = len(words_freq_dict_list)
        doc_freq = {}
        for freq_dict in words_freq_dict_list:
            for w, frq in freq_dict.items():
                # Register the word even when it is absent from the first
                # dict, so vocabularies that differ per document are handled.
                doc_freq.setdefault(w, 0)
                if frq > 0:
                    doc_freq[w] += 1
        # A document frequency of zero would divide by zero; such a word
        # carries no information, so its IDF is reported as 0.0.
        return {
            w: math.log(docs_count / float(df)) if df else 0.0
            for w, df in doc_freq.items()
        }

    @staticmethod
    def get_TF_IDF_v2(tf, idf):
        """Multiply term frequencies by their inverse document frequencies.

        Args:
            tf: Mapping ``word -> term frequency`` for one document.
            idf: Mapping ``word -> inverse document frequency``.

        Returns:
            A dict with the keys of ``tf`` and values ``tf[w] * idf[w]``.

        Raises:
            KeyError: If a word of ``tf`` is missing from ``idf``.
        """
        return {w: frq * idf[w] for w, frq in tf.items()}

    @staticmethod
    def get_TF_IDF_of_sanitized_words(docs_list: list):
        """Compute TF-IDF scores for a list of tokenized documents.

        Each intermediate result (input, vocabulary, per-document counts,
        TF, IDF, TF-IDF) is printed to standard output.

        Args:
            docs_list: List of documents, each a list of hashable tokens.

        Returns:
            A list with one dict per document, mapping every word of the
            combined vocabulary to its TF-IDF score in that document.
            An empty ``docs_list`` yields ``[]``.
        """
        print(f"\n{docs_list}\n")
        # Vocabulary shared by all documents.
        all_words = set().union(*docs_list)
        print(f"\n{all_words}\n")
        # Per-document occurrence counts over the whole vocabulary.
        words_count_in_docs = []
        for i, doc in enumerate(docs_list):
            counts = dict.fromkeys(all_words, 0)
            for w in doc:
                counts[w] += 1
            words_count_in_docs.append(counts)
            print(f"\n== Document-{i} ==\n{counts}\n")
        # TF
        all_TF = [
            ModelExpertAO.get_TF_v2(doc, counts)
            for doc, counts in zip(docs_list, words_count_in_docs)
        ]
        print(f"\n== TF ==\n{all_TF}\n")
        # IDF
        IDF = ModelExpertAO.get_IDF_v2(words_count_in_docs)
        print(f"\n== IDF ==\n{IDF}\n")
        # TF-IDF
        all_TF_IDF = [ModelExpertAO.get_TF_IDF_v2(tf, IDF) for tf in all_TF]
        print(f"\n== TF-IDF ==\n{all_TF_IDF}\n")
        return all_TF_IDF

In [7]:
# Demo: TF-IDF over two short, already-sanitized documents.
# Each document is a bag of words, so `docs` is a list of token lists.
docs = [
    "ce fain e afara".split(),
    "ce e asta".split(),
]
# Left as the final bare expression so the notebook displays the result.
ModelExpertAO.get_TF_IDF_of_sanitized_words(docs)


[['ce', 'fain', 'e', 'afara'], ['ce', 'e', 'asta']]


{'ce', 'asta', 'fain', 'e', 'afara'}


== Document-0 ==
{'ce': 1, 'asta': 0, 'fain': 1, 'e': 1, 'afara': 1}


== Document-1 ==
{'ce': 1, 'asta': 1, 'fain': 0, 'e': 1, 'afara': 0}


== TF ==
[{'ce': 0.25, 'asta': 0.0, 'fain': 0.25, 'e': 0.25, 'afara': 0.25}, {'ce': 0.3333333333333333, 'asta': 0.3333333333333333, 'fain': 0.0, 'e': 0.3333333333333333, 'afara': 0.0}]


== IDF ==
{'ce': 0.0, 'asta': 0.6931471805599453, 'fain': 0.6931471805599453, 'e': 0.0, 'afara': 0.6931471805599453}


== TF-IDF ==
[{'ce': 0.0, 'asta': 0.0, 'fain': 0.17328679513998632, 'e': 0.0, 'afara': 0.17328679513998632}, {'ce': 0.0, 'asta': 0.23104906018664842, 'fain': 0.0, 'e': 0.0, 'afara': 0.0}]



[{'ce': 0.0,
  'asta': 0.0,
  'fain': 0.17328679513998632,
  'e': 0.0,
  'afara': 0.17328679513998632},
 {'ce': 0.0, 'asta': 0.23104906018664842, 'fain': 0.0, 'e': 0.0, 'afara': 0.0}]