# Реализация всего TFIDF

In [None]:
from typing import List
from math import log

class CountVectorizer:
    """Bag-of-words vectorizer that turns sentences into token-count vectors.

    Sentences are tokenized by splitting on whitespace and lower-casing each
    token. The vocabulary maps every distinct token to a column index, assigned
    in order of first appearance in the fitted corpus.
    """

    def __init__(self):
        # token -> column index in the count matrix
        self.vocabulary = dict()

    def _transform_sentence(self, sen: str) -> List[str]:
        """Normalize a sentence into a list of lower-cased tokens.

        Args:
            sen (str): sentence to tokenize

        Returns:
            List[str]: whitespace-separated tokens, lower-cased
        """
        return [token.lower() for token in sen.split()]

    def __compute_vocabular(self, all_words: List[str]) -> None:
        """Rebuild the vocabulary from scratch.

        Args:
            all_words (List[str]): every token of the corpus, in corpus order
        """
        # dict.fromkeys de-duplicates in O(n) while keeping first-seen order.
        # A fresh dict is assigned so that re-fitting does not keep tokens
        # (and colliding indices) from a previously fitted corpus.
        self.vocabulary = {
            word: idx for idx, word in enumerate(dict.fromkeys(all_words))
        }

    def fit_transform(self, corpus: List[str]) -> List[List[int]]:
        """Learn the vocabulary from the corpus and return its count matrix.

        Any previously learned vocabulary is discarded.

        Args:
            corpus (List[str]): sentences (documents) to fit on

        Returns:
            List[List[int]]: one row per sentence, one column per vocabulary
                token; each cell is the number of occurrences of that token
                in that sentence
        """
        tokenized = [self._transform_sentence(sen) for sen in corpus]
        self.__compute_vocabular(
            [token for sentence in tokenized for token in sentence]
        )

        width = len(self.vocabulary)
        matrix = [[0] * width for _ in tokenized]
        for row, sentence in zip(matrix, tokenized):
            for word in sentence:
                row[self.vocabulary[word]] += 1

        return matrix

    def transform(self, text: str) -> List[int]:
        """Convert a single sentence into a count vector.

        Tokens that are not in the fitted vocabulary are ignored.

        Args:
            text (str): sentence to vectorize

        Returns:
            List[int]: occurrence count for every vocabulary token
        """
        vector = [0] * len(self.vocabulary)
        for word in self._transform_sentence(text):
            idx = self.vocabulary.get(word)
            # Out-of-vocabulary tokens have no column; skip them instead of
            # failing with KeyError.
            if idx is not None:
                vector[idx] += 1
        return vector

    def get_feature_names(self) -> List[str]:
        """Return the vocabulary tokens in column order.

        Returns:
            List[str]: list of tokens
        """
        return list(self.vocabulary)


In [5]:
class TfIdfTransformer:
    """Turn a term-count matrix into a TF-IDF matrix.

    TF is the term count divided by the total number of terms in the document
    (rounded to 3 digits). IDF is ``log((N + 1) / (df + 1)) + 1`` where ``N`` is
    the number of documents and ``df`` the number of documents containing the
    term (rounded to 1 digit). The product is rounded to 3 digits.
    """

    def __init__(self):
        self.D = None  # number of documents + 1; set by fit_transform
        self.n = None  # number of columns (terms); set by fit_transform

    def _tf_transform(self, count_matrix):
        """Compute term frequencies for every row of the count matrix.

        Args:
            count_matrix: list of rows of term counts

        Returns:
            list of rows; each count is divided by its row total and rounded
            to 3 digits. Rows whose total is 0 are returned as an unchanged
            copy to avoid dividing by zero.
        """
        res_matrix = []
        for count_list in count_matrix:
            doc_terms_count = sum(count_list)
            if doc_terms_count == 0:
                # Copy so the result never aliases the caller's row.
                res_matrix.append(list(count_list))
            else:
                res_matrix.append(
                    [round(k / doc_terms_count, 3) for k in count_list]
                )
        return res_matrix

    def _idf_transform(self, count_matrix):
        """Compute the smoothed inverse document frequency of every column.

        Reads ``self.D`` and ``self.n``, which ``fit_transform`` sets.

        Args:
            count_matrix: list of rows of term counts

        Returns:
            list with one IDF value per column, rounded to 1 digit
        """
        # Start every document frequency at 1 (smoothing), matching the +1
        # already included in self.D.
        counter_list = [1] * self.n
        for line in count_matrix:
            for j in range(self.n):
                if line[j] > 0:
                    counter_list[j] += 1

        return [round(log(self.D / el) + 1, 1) for el in counter_list]

    def fit_transform(self, count_matrix):
        """Compute the TF-IDF matrix for a term-count matrix.

        Args:
            count_matrix: list of rows of term counts; the column count is
                taken from the first row

        Returns:
            list of rows of TF-IDF values rounded to 3 digits; an empty list
            for an empty input matrix
        """
        self.D = len(count_matrix) + 1
        # An empty matrix has no first row to take the width from.
        self.n = len(count_matrix[0]) if count_matrix else 0
        res_tf = self._tf_transform(count_matrix)
        res_idf = self._idf_transform(count_matrix)

        return [
            [round(tf * idf, 3) for tf, idf in zip(tf_line, res_idf)]
            for tf_line in res_tf
        ]
    
class TfIdfVectorizer(CountVectorizer):
    """Vectorizer that produces TF-IDF rows directly from raw sentences.

    Counting is inherited from CountVectorizer; the resulting count matrix is
    passed through an internal TfIdfTransformer.
    """

    def __init__(self):
        super().__init__()
        # Converts the count matrix produced by the parent class into TF-IDF.
        self._transformer = TfIdfTransformer()

    def fit_transform(self, corpus):
        """Fit on the corpus and return its TF-IDF matrix.

        Args:
            corpus: list of sentences

        Returns:
            The TfIdfTransformer output for the corpus count matrix.
        """
        count_matrix = super().fit_transform(corpus)
        tfidf_rows = self._transformer.fit_transform(count_matrix)
        return tfidf_rows

# Demo: a two-sentence corpus used to exercise TfIdfVectorizer.
corpus = [
    'Crock Pot Pasta Never boil pasta again',
    'Pasta Pomodoro Fresh ingredients Parmesan to taste'
]

# Fit the vectorizer on the corpus and compute its TF-IDF matrix.
vectorizer = TfIdfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
# Bare expression: in a notebook cell this displays the matrix as the cell output.
tfidf_matrix

7
7


[[0.2, 0.2, 0.286, 0.2, 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.143, 0.0, 0.0, 0.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]]