# TF-IDF: Term Frequency - Inverse Document Frequency

# 1) TF-IDF from Scratch

In [42]:
import numpy as np
import pandas as pd
# Toy corpus; it is already preprocessed (stop words removed).
text_data = ["good movie", "bad movie", "good bad movie"]

# TF(t, d) = (number of times term t occurs in document d) / (number of words in d)
# IDF(t)   = ln(N / df(t)), where N is the number of documents in the corpus and
#            df(t) is the number of documents that contain t at least once.
# TF-IDF(t, d) = TF(t, d) * IDF(t)

# Sentence -> word tokenization.
# split() with no argument collapses runs of whitespace, so repeated, leading
# or trailing spaces do not produce empty-string "words".
word_data = [sentence.split() for sentence in text_data]

# Vocabulary of the corpus.
# Sorted because the iteration order of a set of strings can change from one
# interpreter run to the next, which would shuffle the DataFrame columns.
vocab = sorted({word for sentence in word_data for word in sentence})

# Term frequency: one row per sentence, one column per vocabulary word.
# An empty sentence gets all-zero frequencies instead of dividing by zero.
tf_data = [
    [sentence.count(word) / len(sentence) if sentence else 0.0 for word in vocab]
    for sentence in word_data
]
#print(tf_data)

# Inverse document frequency: one value per vocabulary word.
# n_appearance is never 0 because every vocabulary word comes from the corpus.
no_of_documents = len(text_data)
idf_data = []
for word in vocab:
    n_appearance = sum(word in sentence for sentence in word_data)
    idf_data.append(np.log(no_of_documents / n_appearance))
#print(idf_data)

# TF-IDF for each sentence: element-wise product of its TF row with the IDF vector.
tfidf_data = [
    [tf * idf for tf, idf in zip(tf_sentence, idf_data)]
    for tf_sentence in tf_data
]
#print(tfidf_data)

df = pd.DataFrame(tfidf_data, columns=vocab)
df['full_sentence'] = text_data
df

Unnamed: 0,good,bad,movie,full_sentence
0,0.202733,0.0,0.0,good movie
1,0.0,0.202733,0.0,bad movie
2,0.135155,0.135155,0.0,good bad movie


# 2) TF-IDF with scikit-learn

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same toy corpus, this time vectorized by scikit-learn.
text_data = ["good movie", "bad movie", "good bad movie"]

# With its default settings TfidfVectorizer uses a smoothed IDF,
# ln((1 + N) / (1 + df)) + 1, and L2-normalizes every row, which is why these
# numbers differ from a plain tf * ln(N / df) computation.
vec = TfidfVectorizer()

# Learn the vocabulary and IDF weights, and build the document-term matrix.
tfidf_matrix = vec.fit_transform(text_data)
feature_names = vec.get_feature_names_out()

# Dense table: one row per sentence, one column per vocabulary word.
df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
df['full_sentence'] = text_data
df

Unnamed: 0,bad,good,movie,full_sentence
0,0.0,0.789807,0.613356,good movie
1,0.789807,0.0,0.613356,bad movie
2,0.619805,0.619805,0.481334,good bad movie
