<a href="https://colab.research.google.com/github/19K41A0503/NLP/blob/main/NLP1_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# The two example documents used by every TF-IDF and similarity cell below.
documentA = (
    'Text Simplification is the task of reducing the complexity of the '
    'vocabulary and sentence structure of text while retaining its original '
    'meaning, with the goal of improving readability and understanding.'
)
documentB = (
    'Sentiment Analysis is the process of determining whether a piece of '
    'writing is positive, negative or neutral. A sentiment analysis system '
    'for text analysis combines natural language processing (NLP) and '
    'machine learning techniques to assign weighted sentiment scores to the '
    'entities, topics, themes and categories within a sentence or phrase.'
)

In [11]:
# Naive single-space tokenisation: case and punctuation are kept as-is, so
# 'Text' / 'text' and 'meaning,' are treated as distinct tokens.
bagOfWordsA, bagOfWordsB = (doc.split(' ') for doc in (documentA, documentB))

In [12]:
# Shared vocabulary: every distinct token that occurs in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

In [13]:
# Both count dicts start from the full shared vocabulary so they end up with
# identical key sets (a word absent from a document simply stays at 0).
numOfWordsA = dict.fromkeys(uniqueWords, 0)
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for counts, bag in ((numOfWordsA, bagOfWordsA), (numOfWordsB, bagOfWordsB)):
    for word in bag:
        counts[word] += 1

In [14]:
import nltk
# Fetch NLTK's 'stopwords' corpus into the local nltk_data directory. The call
# returns True, which is what this cell displays as its output.
nltk.download('stopwords')
# NOTE(review): nothing in the visible cells reads the stop-word list yet —
# presumably meant for later filtering via stopwords.words('english'); confirm
# it is still needed or drop this cell.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Term Frequency (TF)

In [15]:
def computeTF(wordDict, bagOfWords):
    """Return the term frequency of every word in ``wordDict``.

    Args:
        wordDict: mapping of word -> raw occurrence count within one document.
        bagOfWords: the document's token list; only its length is used, as the
            normalising denominator.

    Returns:
        dict mapping each key of ``wordDict`` to ``count / len(bagOfWords)``.
        For an empty ``bagOfWords`` every frequency is 0.0 (previously this
        raised ZeroDivisionError).
    """
    bagOfWordsCount = float(len(bagOfWords))
    if bagOfWordsCount == 0:
        # An empty document contains no terms, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / bagOfWordsCount for word, count in wordDict.items()}

In [16]:
# Term frequencies of each document over the shared vocabulary.
tfA = computeTF(wordDict=numOfWordsA, bagOfWords=bagOfWordsA)
tfB = computeTF(wordDict=numOfWordsB, bagOfWords=bagOfWordsB)
print(tfA, tfB, sep='\n')

{'A': 0.0, 'techniques': 0.0, 'Simplification': 0.03333333333333333, 'within': 0.0, 'is': 0.03333333333333333, 'learning': 0.0, 'scores': 0.0, '(NLP)': 0.0, 'sentence': 0.03333333333333333, 'natural': 0.0, 'writing': 0.0, 'the': 0.13333333333333333, 'topics,': 0.0, 'original': 0.03333333333333333, 'and': 0.06666666666666667, 'retaining': 0.03333333333333333, 'understanding.': 0.03333333333333333, 'while': 0.03333333333333333, 'Text': 0.03333333333333333, 'system': 0.0, 'Analysis': 0.0, 'process': 0.0, 'sentiment': 0.0, 'reducing': 0.03333333333333333, 'to': 0.0, 'a': 0.0, 'determining': 0.0, 'task': 0.03333333333333333, 'complexity': 0.03333333333333333, 'meaning,': 0.03333333333333333, 'or': 0.0, 'analysis': 0.0, 'weighted': 0.0, 'its': 0.03333333333333333, 'neutral.': 0.0, 'with': 0.03333333333333333, 'phrase.': 0.0, 'assign': 0.0, 'combines': 0.0, 'vocabulary': 0.03333333333333333, 'positive,': 0.0, 'for': 0.0, 'readability': 0.03333333333333333, 'machine': 0.0, 'processing': 0.0, '

Inverse Document Frequency (IDF)

In [17]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in ``documents``.

    Args:
        documents: list of dicts, one per document, each mapping
            word -> occurrence count in that document.

    Returns:
        dict mapping each word found as a key in any document to
        ``log(N / df)``, where ``N`` is the number of documents and ``df`` is
        the number of documents in which the word's count is > 0.
        A word whose count is 0 in every document gets 0.0 (its term frequency
        is 0 everywhere, so its TF-IDF is 0 regardless). An empty ``documents``
        list yields an empty dict.
    """
    import math

    N = len(documents)

    # Document frequency per word. Keys are collected from every document, not
    # just the first, so documents with differing vocabularies do not KeyError.
    docFrequency = {}
    for document in documents:
        for word, val in document.items():
            docFrequency.setdefault(word, 0)
            if val > 0:
                docFrequency[word] += 1

    idfDict = {}
    for word, containing in docFrequency.items():
        # Guard the division: a word may be present as a key with count 0 in
        # every document.
        idfDict[word] = math.log(N / float(containing)) if containing else 0.0
    return idfDict

In [18]:
# IDF over the two-document corpus; words present in both documents score 0.
idfs = computeIDF(documents=[numOfWordsA, numOfWordsB])
print(idfs)

{'A': 0.6931471805599453, 'techniques': 0.6931471805599453, 'Simplification': 0.6931471805599453, 'within': 0.6931471805599453, 'is': 0.0, 'learning': 0.6931471805599453, 'scores': 0.6931471805599453, '(NLP)': 0.6931471805599453, 'sentence': 0.0, 'natural': 0.6931471805599453, 'writing': 0.6931471805599453, 'the': 0.0, 'topics,': 0.6931471805599453, 'original': 0.6931471805599453, 'and': 0.0, 'retaining': 0.6931471805599453, 'understanding.': 0.6931471805599453, 'while': 0.6931471805599453, 'Text': 0.6931471805599453, 'system': 0.6931471805599453, 'Analysis': 0.6931471805599453, 'process': 0.6931471805599453, 'sentiment': 0.6931471805599453, 'reducing': 0.6931471805599453, 'to': 0.6931471805599453, 'a': 0.6931471805599453, 'determining': 0.6931471805599453, 'task': 0.6931471805599453, 'complexity': 0.6931471805599453, 'meaning,': 0.6931471805599453, 'or': 0.6931471805599453, 'analysis': 0.6931471805599453, 'weighted': 0.6931471805599453, 'its': 0.6931471805599453, 'neutral.': 0.6931471

In [19]:
def computeTFIDF(tfBagOfWords, idfs):
    """Weight each term frequency by the word's inverse document frequency.

    Args:
        tfBagOfWords: mapping of word -> term frequency for one document.
        idfs: mapping of word -> inverse document frequency; must contain
            every word of ``tfBagOfWords`` (a missing word raises KeyError).

    Returns:
        dict mapping each word of ``tfBagOfWords`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}

In [20]:
# One TF-IDF row per document, one column per vocabulary word.
tfidfA, tfidfB = (computeTFIDF(tf, idfs) for tf in (tfA, tfB))
df = pd.DataFrame([tfidfA, tfidfB])
print(df)

          A  techniques  Simplification    within   is  learning    scores  \
0  0.000000    0.000000        0.023105  0.000000  0.0  0.000000  0.000000   
1  0.013863    0.013863        0.000000  0.013863  0.0  0.013863  0.013863   

      (NLP)  sentence   natural  ...     piece  structure      goal  \
0  0.000000       0.0  0.000000  ...  0.000000   0.023105  0.023105   
1  0.013863       0.0  0.013863  ...  0.013863   0.000000  0.000000   

   entities,  negative   of  Sentiment    themes   whether  text  
0   0.000000  0.000000  0.0   0.000000  0.000000  0.000000   0.0  
1   0.013863  0.013863  0.0   0.013863  0.013863  0.013863   0.0  

[2 rows x 58 columns]


In [21]:
# Same two documents run through scikit-learn's TF-IDF implementation.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. .tolist() keeps `feature_names`
# a plain list, as before.
feature_names = vectorizer.get_feature_names_out().tolist()
# toarray() yields a regular ndarray; todense() returns the legacy np.matrix.
dense = vectors.toarray()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
print(df)

   analysis       and    assign  categories  combines  complexity  \
0  0.000000  0.233118  0.000000    0.000000  0.000000    0.163819   
1  0.380656  0.180560  0.126885    0.126885  0.126885    0.000000   

   determining  entities       for      goal  ...       to    topics  \
0     0.000000  0.000000  0.000000  0.163819  ...  0.00000  0.000000   
1     0.126885  0.126885  0.126885  0.000000  ...  0.25377  0.126885   

   understanding  vocabulary  weighted   whether     while      with  \
0       0.163819    0.163819  0.000000  0.000000  0.163819  0.163819   
1       0.000000    0.000000  0.126885  0.126885  0.000000  0.000000   

     within   writing  
0  0.000000  0.000000  
1  0.126885  0.126885  

[2 rows x 53 columns]




In [22]:
# Name of the pretrained sentence-transformers checkpoint to load.
_model = "sentence-transformers/bert-base-nli-mean-tokens"
# On this run, instantiating the model downloaded the checkpoint files (see
# the progress bars in the cell output).
model = SentenceTransformer(_model)

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [24]:
def isSimilar(a, b):
    """Return the cosine similarity between the embeddings of ``a`` and ``b``.

    Both texts are encoded with the module-level ``model`` and compared with
    ``cosine_similarity``.

    Note: despite the name, no threshold is applied and no boolean is
    returned; callers receive the raw score and decide what counts as similar.

    Args:
        a: first text.
        b: second text.

    Returns:
        The first row of the ``cosine_similarity`` result: a one-element array
        holding the similarity score.
    """
    embeddings = model.encode([a, b])
    # Slice instead of indexing so each side stays a 2-D, single-row input,
    # which is what cosine_similarity expects.
    return cosine_similarity(embeddings[:1], embeddings[1:])[0]
print(isSimilar(documentA, documentB))

[0.6463746]
