In [96]:
import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import string
import os
from nltk.stem import WordNetLemmatizer
from collections import Counter
import re

In [None]:
os.chdir(os.path.join('..' , 'NLP', 'data'))

In [108]:
soup = BeautifulSoup(open('positive.review', encoding = 'utf-8'))
review_texts = soup.findAll('review_text')
soup = BeautifulSoup(open('negative.review', encoding = 'utf-8'))
review_texts += soup.findAll('review_text')
print('Length of documents :', len(review_texts))

Length of documents : 2000


In [109]:
stop_words = [line for line in open('stopwords.txt', encoding='utf-8')]
len(stop_words)

427

In [110]:
stop_words = set((stopwords.words('english') + stop_words))
len(stop_words)

599

In [111]:
word_net_lemmatizer = WordNetLemmatizer()

def my_tokenize(s) :
    s = s.lower()
    s = re.sub(r"\d", "", s)
    s = s.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [word_net_lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return tokens

word_index = {}
tokenized_texts = []
idx = 0

for review_text in review_texts :
    text_literal = review_text.text
    tokenized = my_tokenize(text_literal)
    tokenized_texts.append(tokenized)
    for word in tokenized :
        if word not in word_index :
            word_index[word] = idx
            idx += 1
            
word_index['<unknown>'] = idx
print('Length of word index mapping :', len(word_index))

Length of word index mapping : 10471


In [112]:
N = len(review_texts)
D = len(word_index)

In [113]:
#term-frequency matrix
tf = np.zeros((N, D))

for index, tokenized_text in enumerate(tokenized_texts) :
    for toknized_word in tokenized_text :
        tf[index, word_index[toknized_word]] += 1
        
tf

array([[1., 1., 2., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [114]:
#inverse document frequency
idf = np.zeros((D))
N_t = (tf > 0).sum(axis = 0) #number of documents which contain the term t
idf = np.log((N + 1) / (N_t + 1)) + 1  #smooth idf
idf

array([3.4029053 , 3.24011017, 4.63111042, ..., 7.90825515, 7.90825515,
       8.60140233])

In [115]:
tf_idf = tf * idf
tf_idf

array([[3.4029053 , 3.24011017, 9.26222084, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 6.48022034, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 7.90825515, 7.90825515,
        0.        ]])

In [116]:
#normalizing l2 norm
tf_idf = tf_idf / (np.linalg.norm(tf_idf, axis = 1, keepdims=True))
tf_idf

array([[0.08772845, 0.08353152, 0.23878427, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.24330697, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.16471291, 0.16471291,
        0.        ]])

In [117]:
#checking
np.linalg.norm(tf_idf, axis = 1, keepdims=True)[:5]

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.]])

In [133]:
#show the top 5 terms (in terms of tf-idf)
def show_top_words(sentence, count=5) :
    tokenized = my_tokenize(sentence)
    term_frequency = Counter(tokenized)
    # Calculate the TF-IDF scores for the terms in the sentence
    tf_idf_scores = np.zeros((D)) # Initialize an array of size len(word_index) for TF-IDF scores
    
    # Loop through the tokenized words and calculate TF-IDF scores
    for word in tokenized :
        if word not in word_index :
            word = '<unknown>'
        term_index = word_index[word]
        tf_value = term_frequency[word]
        idf_value = idf[term_index]

        # Calculate the TF-IDF score for the term
        tf_idf_scores[term_index] = tf_value * idf_value
        
    # Get the indices of the top 'count' TF-IDF scores
    top_indices = np.argsort(tf_idf_scores)[::-1][:count]
    
    # Get the corresponding terms for the top indices
    top_words = np.array(list(word_index.keys()))[top_indices]
    return top_words

text =  '''Unleashing Gaming Power and Immersive Experience
The Acer Nitro 5 17.3" Full HD IPS 144Hz Gaming Laptop is a powerhouse that has truly exceeded my gaming expectations, earning a well-deserved 5-star rating. This laptop delivers exceptional performance, stunning visuals, and an immersive gaming experience that is second to none.

The standout feature of the Acer Nitro 5 is its powerful hardware configuration. The Intel Core i5-12500H processor combined with the NVIDIA GeForce RTX 3050 graphics card ensures smooth gameplay and seamless multitasking. Whether I'm engaged in intense gaming sessions or running resource-demanding applications, this laptop handles everything with impressive speed and efficiency.

The 17.3" Full HD IPS display with a 144Hz refresh rate is a game-changer. The visuals are incredibly crisp, and the high refresh rate guarantees smooth and fluid motion, eliminating any lag or stuttering. I am completely immersed in my games, enjoying vibrant colors and impressive detail that brings every virtual world to life.

The 512GB PCIe Gen 4 SSD provides ample storage space for my games, applications, and media files. The lightning-fast read and write speeds ensure quick load times, allowing me to jump into my favorite games without delay. The responsiveness of the storage drive enhances my overall gaming experience, keeping me in the action without interruption.

The design of the Acer Nitro 5 is sleek and stylish, with its black chassis and red accents giving it a bold and gaming-centric look. The build quality feels sturdy, and the keyboard is comfortable to use, providing an excellent tactile experience. The laptop's cooling system efficiently dissipates heat, preventing any overheating issues even during intense gaming sessions.

Connectivity options on the Acer Nitro 5 are plentiful. The laptop features an array of ports, including USB, HDMI, and Ethernet, allowing me to connect peripherals and external displays effortlessly. The inclusion of Wi-Fi 6 ensures fast and stable wireless connections, enabling smooth online gaming and lag-free multiplayer experiences.

In conclusion, the Acer Nitro 5 17.3" Full HD IPS 144Hz Gaming Laptop is a true gaming powerhouse that offers exceptional performance, stunning visuals, and immersive gameplay. With its powerful hardware, vibrant display, ample storage, and robust connectivity options, it has elevated my gaming experience to new heights. If you're seeking a high-performance gaming laptop, I highly recommend the Acer Nitro 5 as an investment that will deliver endless gaming excitement'''
print('Original Text :', text)
print('Top Terms :', show_top_words(text, count=20))

Original Text : Unleashing Gaming Power and Immersive Experience
The Acer Nitro 5 17.3" Full HD IPS 144Hz Gaming Laptop is a powerhouse that has truly exceeded my gaming expectations, earning a well-deserved 5-star rating. This laptop delivers exceptional performance, stunning visuals, and an immersive gaming experience that is second to none.

The standout feature of the Acer Nitro 5 is its powerful hardware configuration. The Intel Core i5-12500H processor combined with the NVIDIA GeForce RTX 3050 graphics card ensures smooth gameplay and seamless multitasking. Whether I'm engaged in intense gaming sessions or running resource-demanding applications, this laptop handles everything with impressive speed and efficiency.

The 17.3" Full HD IPS display with a 144Hz refresh rate is a game-changer. The visuals are incredibly crisp, and the high refresh rate guarantees smooth and fluid motion, eliminating any lag or stuttering. I am completely immersed in my games, enjoying vibrant colors a