<a href="https://colab.research.google.com/github/Celaena24/NLP/blob/main/Model2_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
paragraph = """She likes to play outside.
My favorite color is red.
Let’s go to the playground.
Does his sister have a sister?
She goes to school to study.
Let's play!
yeah"""

**TF-IDF from scratch**


Note: The formula used here is the same used by scikit-learn so that we can match results.

In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# getting the corpus ready
corpus = paragraph.split("\n")
corpus = [re.sub('[^a-zA-Z0-9]', ' ', document.lower()) for document in corpus]
corpus

['she likes to play outside ',
 'my favorite color is red ',
 'let s go to the playground ',
 'does his sister have a sister ',
 'she goes to school to study ',
 'let s play ',
 'yeah']

In [None]:
# Creating the vocabulary excluding stop words
words = []
for sent in corpus:
  for word in sent.split():
    if word not in stop_words:
      words.append(word)
vocab = set(words)
vocab

{'color',
 'favorite',
 'go',
 'goes',
 'let',
 'likes',
 'outside',
 'play',
 'playground',
 'red',
 'school',
 'sister',
 'study',
 'yeah'}

In [None]:
# Creating frequency_matrix
l = []
for document in corpus:
  doc_dict = dict.fromkeys(vocab, 0)
  for word in document.split():
    if word in vocab:
      doc_dict[word] += 1
  l.append(doc_dict)

frequency_matrix = pd.DataFrame(l, index=corpus)
frequency_matrix

Unnamed: 0,favorite,likes,red,go,sister,study,goes,outside,school,playground,yeah,let,color,play
she likes to play outside,0,1,0,0,0,0,0,1,0,0,0,0,0,1
my favorite color is red,1,0,1,0,0,0,0,0,0,0,0,0,1,0
let s go to the playground,0,0,0,1,0,0,0,0,0,1,0,1,0,0
does his sister have a sister,0,0,0,0,2,0,0,0,0,0,0,0,0,0
she goes to school to study,0,0,0,0,0,1,1,0,1,0,0,0,0,0
let s play,0,0,0,0,0,0,0,0,0,0,0,1,0,1
yeah,0,0,0,0,0,0,0,0,0,0,1,0,0,0


**Term frequency = No. of repetitions of words in a sentence / No. of words in a sentence**

In [None]:
# Creating tf (term frequency) matrix
def calculate_Tf(matrix):
  tf = matrix
  for i in range(len(corpus)):
    tf.iloc[i, :] = matrix.iloc[i, :] / matrix.sum(axis=1)[i]
  return tf

tf = calculate_Tf(frequency_matrix)
tf

Unnamed: 0,favorite,likes,red,go,sister,study,goes,outside,school,playground,yeah,let,color,play
she likes to play outside,0.0,0.333333,0.0,0.0,0,0.0,0.0,0.333333,0.0,0.0,0,0.0,0.0,0.333333
my favorite color is red,0.333333,0.0,0.333333,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.333333,0.0
let s go to the playground,0.0,0.0,0.0,0.333333,0,0.0,0.0,0.0,0.0,0.333333,0,0.333333,0.0,0.0
does his sister have a sister,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
she goes to school to study,0.0,0.0,0.0,0.0,0,0.333333,0.333333,0.0,0.333333,0.0,0,0.0,0.0,0.0
let s play,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.5,0.0,0.5
yeah,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0


**IDF = log( (1+N)/(1+n) ) + 1,**

where where N is the total number of documents (sentences) and n is the number of documents containing the word.

In [None]:
#Calculating idf (Inverse Document Frequency)
def calculate_IDF(matrix):
  N = len(corpus)
  idf_dict = {}
  for word in vocab:
    df = (matrix[word] != 0).sum()
    idf = (1+N) / (1+df)
    idf_dict[word] = np.log(idf)+1
  return pd.Series(idf_dict)

idf = calculate_IDF(frequency_matrix)
idf

favorite      2.386294
likes         2.386294
red           2.386294
go            2.386294
sister        2.386294
study         2.386294
goes          2.386294
outside       2.386294
school        2.386294
playground    2.386294
yeah          2.386294
let           1.980829
color         2.386294
play          1.980829
dtype: float64

In [None]:
#Calculating TF-IDF (Term Frequency - Inverse Document Frequency)
tf_idf = tf*idf
tf_idf

Unnamed: 0,favorite,likes,red,go,sister,study,goes,outside,school,playground,yeah,let,color,play
she likes to play outside,0.0,0.795431,0.0,0.0,0.0,0.0,0.0,0.795431,0.0,0.0,0.0,0.0,0.0,0.660276
my favorite color is red,0.795431,0.0,0.795431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.795431,0.0
let s go to the playground,0.0,0.0,0.0,0.795431,0.0,0.0,0.0,0.0,0.0,0.795431,0.0,0.660276,0.0,0.0
does his sister have a sister,0.0,0.0,0.0,0.0,2.386294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
she goes to school to study,0.0,0.0,0.0,0.0,0.0,0.795431,0.795431,0.0,0.795431,0.0,0.0,0.0,0.0,0.0
let s play,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.990415,0.0,0.990415
yeah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.386294,0.0,0.0,0.0


**Euclidean Normalization**

In [344]:
# Normalizing tf-idf and getting the final values
def normalize_TF_IDF(tf_idf):
  n_tf_idf = tf_idf
  for i in range(len(corpus)):
    n_tf_idf.iloc[i, :] = (n_tf_idf.iloc[i, :]) / (np.sqrt(np.sum(n_tf_idf.iloc[i, :]**2)))
  return n_tf_idf

normalize_TF_IDF(tf_idf)

Unnamed: 0,color,does,favorite,goes,let,likes,outside,play,playground,red,school,sister,study,yeah
she likes to play outside,0.0,0.0,0.0,0.0,0.0,0.609819,0.609819,0.506202,0.0,0.0,0.0,0.0,0.0,0.0
my favorite color is red,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0
let s go to the playground,0.0,0.0,0.0,0.0,0.638709,0.0,0.0,0.0,0.769449,0.0,0.0,0.0,0.0,0.0
does his sister have a sister,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.894427,0.0,0.0
she goes to school to study,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.57735,0.0
let s play,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0
yeah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


**TF-IDF using scikit**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(corpus)

In [None]:
feat_names = cv.get_feature_names_out()
feat_names

array(['color', 'does', 'favorite', 'goes', 'let', 'likes', 'outside',
       'play', 'playground', 'red', 'school', 'sister', 'study', 'yeah'],
      dtype=object)

In [None]:
cv.idf_

array([2.38629436, 2.38629436, 2.38629436, 2.38629436, 1.98082925,
       2.38629436, 2.38629436, 1.98082925, 2.38629436, 2.38629436,
       2.38629436, 2.38629436, 2.38629436, 2.38629436])

In [None]:
tf_idf = pd.DataFrame(X.toarray(), index=corpus, columns=feat_names)
tf_idf

Unnamed: 0,color,does,favorite,goes,let,likes,outside,play,playground,red,school,sister,study,yeah
she likes to play outside,0.0,0.0,0.0,0.0,0.0,0.609819,0.609819,0.506202,0.0,0.0,0.0,0.0,0.0,0.0
my favorite color is red,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0
let s go to the playground,0.0,0.0,0.0,0.0,0.638709,0.0,0.0,0.0,0.769449,0.0,0.0,0.0,0.0,0.0
does his sister have a sister,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.894427,0.0,0.0
she goes to school to study,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.57735,0.0
let s play,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0
yeah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


**FINAL RESULTS**


The results we got from implementing the algorithm from scratch turned out to be almost the same as the ones we got using the scikit learn library.

Explanation for the minor differences - the vocab had slightly different words because the stop words we used (from nltk library) were different from the stopwords used by scikit.
