In [15]:
# Import required libraries
import pandas as pd
import numpy as np

In [16]:
filename = "amazon_cells_labelled.txt"

corpus = []  # corpus contains all the text data, in file order
records = []  # one (text, sentiment) pair per labelled line

# Each line is expected to end with a single-character label preceded by
# whitespace. Trailing whitespace (including the newline) is stripped before
# slicing, so a final line without a newline is parsed correctly and blank
# lines are skipped instead of raising IndexError.
# NOTE(review): assumes the file is UTF-8 (or plain ASCII) — confirm.
with open(filename, "r", encoding="utf-8") as file:
    for line in file:
        stripped = line.rstrip()
        if not stripped:
            continue
        # The label stays a one-character string, as before.
        text, sentiment = stripped[:-1].rstrip(), stripped[-1]
        corpus.append(text)
        records.append((text, sentiment))

# Build the frame once from the collected records rather than growing it
# row by row with df.loc[len(df)], which is slow for many rows.
df = pd.DataFrame(records, columns=["text", "sentiment"])
df.to_csv("amazon_cells_labelled.csv", index=False)

df.head()

Unnamed: 0,text,sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2") # loaded for its pre-tokenizer, which the cells below use to split text into words
# NOTE(review): no normalization (e.g. accent stripping or lowercasing) is performed anywhere
# in this notebook; only tokenizer.backend_tokenizer.pre_tokenizer is called.

ModuleNotFoundError: No module named 'transformers'

In [None]:
from collections import defaultdict

# Tally how many times each pre-tokenized word appears across the corpus.
word_freqs = defaultdict(int)

for text in corpus:
    pre_tokenized = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    # pre_tokenize_str yields (word, offset) pairs; only the word is counted.
    for word, _offset in pre_tokenized:
        word_freqs[word] += 1

print(word_freqs)



In [None]:
# Demonstrate pre_tokenize_str on a short example sentence.
sample_text = "Hello, world! This is a test."
words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(sample_text)

print("Original text:", sample_text)
print("\nPre-tokenization result:")
new_words = []
for word, offset in words_with_offsets:
    print(f"Word: '{word}' | Offset: {offset}")
    new_words.append(word)  # keep only the word, dropping its offset

print("\nJust the words:", new_words)

In [None]:
# Every distinct character that occurs in any pre-tokenized word, in sorted order.
alphabet = sorted({letter for word in word_freqs for letter in word})

print(alphabet)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ġ']


In [None]:
# Initial vocabulary: the special token <|endoftext|> followed by every character seen in the
# corpus (mostly ASCII, plus the 'Ġ' marker that appears in the alphabet).
# <|endoftext|> is the token the gpt2 tokenizer uses to indicate the end of a text sequence.
vocab = ["<|endoftext|>", *alphabet]

In [None]:
# Start every word as a list of its individual characters; the merge steps rewrite these lists.
splits = {word: list(word) for word in word_freqs}

In [None]:
def compute_pair_freqs(splits, freqs=None):
    """Count how often each adjacent token pair occurs, weighted by word frequency.

    Args:
        splits: dict mapping each word to its current list of tokens.
        freqs: optional dict mapping each word to its count in the corpus.
            When omitted, the notebook-level ``word_freqs`` is used, which is
            what the function always did before this parameter existed.

    Returns:
        A ``defaultdict(int)`` mapping ``(left, right)`` token pairs to the
        summed frequency of the words they occur in. Keys appear in
        first-seen order.

    Raises:
        KeyError: if a word in ``freqs`` has no entry in ``splits``.
    """
    if freqs is None:
        freqs = word_freqs
    pair_freqs = defaultdict(int)
    for word, freq in freqs.items():
        split = splits[word]
        # Pairing each token with its successor yields nothing for words
        # that are already a single token, so no length guard is needed.
        for pair in zip(split, split[1:]):
            pair_freqs[pair] += freq
    return pair_freqs

In [None]:
pair_freqs = compute_pair_freqs(splits)

# Preview only the first six pairs rather than the whole table.
for key in list(pair_freqs)[:6]:
    print(f"{key}: {pair_freqs[key]}")

('S', 'o'): 18
('Ġ', 't'): 1160
('t', 'h'): 1045
('h', 'e'): 807
('e', 'r'): 656
('r', 'e'): 551


In [None]:
# Pick the most frequent pair. On ties the first pair encountered wins, and an
# empty table leaves best_pair as "" and max_freq as None.
best_pair, max_freq = max(
    pair_freqs.items(), key=lambda item: item[1], default=("", None)
)

print(best_pair, max_freq)

('Ġ', 't') 1160


In [None]:
# a and b are the two tokens to be merged (single characters at first, longer tokens later)
def merge_pair(a, b, splits):
    """Replace every adjacent occurrence of ``a`` followed by ``b`` with ``a + b``.

    Args:
        a: left token of the pair.
        b: right token of the pair.
        splits: dict mapping each word to its current list of tokens. It is
            updated in place.

    Returns:
        The same ``splits`` dict, after merging.
    """
    merged = a + b
    # Iterate the mapping that was passed in instead of a notebook-level
    # global, so the function only depends on its arguments. Re-assigning
    # existing keys while iterating is safe because the key set is unchanged.
    for word, split in splits.items():
        if len(split) < 2:
            continue

        # Single left-to-right pass; a merged token is never re-examined as
        # the left half of another match, so occurrences do not overlap.
        new_split = []
        i = 0
        last = len(split) - 1
        while i <= last:
            if i < last and split[i] == a and split[i + 1] == b:
                new_split.append(merged)
                i += 2
            else:
                new_split.append(split[i])
                i += 1
        splits[word] = new_split
    return splits

In [None]:
# Current vocab size and number of distinct words, as a baseline before training
# (the training cell below targets vocab_size = 500).
print(len(vocab), len(splits))

84 2409


In [None]:
vocab_size = 500
merges = {}

# Repeatedly merge the most frequent adjacent pair until the vocabulary
# reaches the target size.
while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    if not pair_freqs:
        # Every word is already a single token, so nothing is left to merge.
        # Without this guard best_pair stayed "" and merge_pair(*best_pair, splits)
        # failed with a TypeError on small corpora.
        break
    # max() returns the first maximal item, so ties resolve to the pair seen first.
    best_pair, max_freq = max(pair_freqs.items(), key=lambda item: item[1])
    merged_token = best_pair[0] + best_pair[1]
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = merged_token
    vocab.append(merged_token)

In [None]:
# Show the learned merges as {(left, right): merged token}, in the order they were learned.
print(merges) # Ġ is used to denote a space for the gpt2 tokenizer

{('Ġ', 't'): 'Ġt', ('Ġ', 'a'): 'Ġa', ('h', 'e'): 'he', ('Ġ', 'i'): 'Ġi', ('o', 'n'): 'on', ('Ġ', 'w'): 'Ġw', ('e', 'r'): 'er', ('r', 'e'): 're', ('Ġ', 'p'): 'Ġp', ('n', 'd'): 'nd', ('Ġt', 'he'): 'Ġthe', ('Ġ', 's'): 'Ġs', ('Ġ', 'c'): 'Ġc', ('o', 'r'): 'or', ('i', 'n'): 'in', ('h', 'a'): 'ha', ('o', 'u'): 'ou', ('Ġ', 'b'): 'Ġb', ('Ġ', 'f'): 'Ġf', ('Ġ', 'm'): 'Ġm', ('Ġa', 'nd'): 'Ġand', ('l', 'e'): 'le', ('i', 't'): 'it', ('s', 'e'): 'se', ('on', 'e'): 'one', ('i', 's'): 'is', ('c', 'e'): 'ce', ('l', 'l'): 'll', ('in', 'g'): 'ing', ('Ġi', 't'): 'Ġit', ('a', 't'): 'at', ('Ġ', 'd'): 'Ġd', ('Ġi', 's'): 'Ġis', ('Ġ', 'e'): 'Ġe', ('a', 'r'): 'ar', ('o', 't'): 'ot', ('Ġt', 'o'): 'Ġto', ('Ġ', 'g'): 'Ġg', ('Ġ', 'n'): 'Ġn', ('h', 'is'): 'his', ('s', 't'): 'st', ('Ġ', 'ha'): 'Ġha', ('Ġ', 'I'): 'ĠI', ('v', 'e'): 've', ('Ġ', 'l'): 'Ġl', ('h', 'one'): 'hone', ('v', 'er'): 'ver', ('e', 'd'): 'ed', ('r', 'o'): 'ro', ('Ġ', 'o'): 'Ġo', ('Ġp', 'hone'): 'Ġphone', ('m', 'e'): 'me', ('n', 't'): 'nt', ('c', 't'

In [None]:
# Final vocabulary: the special token, the base characters, then one entry per learned merge.
print(vocab)

['<|endoftext|>', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ġ', 'Ġt', 'Ġa', 'he', 'Ġi', 'on', 'Ġw', 'er', 're', 'Ġp', 'nd', 'Ġthe', 'Ġs', 'Ġc', 'or', 'in', 'ha', 'ou', 'Ġb', 'Ġf', 'Ġm', 'Ġand', 'le', 'it', 'se', 'one', 'is', 'ce', 'll', 'ing', 'Ġit', 'at', 'Ġd', 'Ġis', 'Ġe', 'ar', 'ot', 'Ġto', 'Ġg', 'Ġn', 'his', 'st', 'Ġha', 'ĠI', 've', 'Ġl', 'hone', 'ver', 'ed', 'ro', 'Ġo', 'Ġphone', 'me', 'nt', 'ct', 'ly', 'Ġre', 'Ġthis', 'as', 'Ġu', 'Ġof', 'al', 'Ġbe', 'es', 'Ġmy', 'oo', 'Ġwit', 'Ġwith', 'Ġfor', 'The', 'ad', 'Ġca', 'Ġin', 'ork', 'Ġon', 'om', 'ri', 'ble', 'ter', 'Ġy', 'Ġtha', 'et', 'Ġnot', 'very', 'reat', 'gh', 'ld', '..'

In [None]:
# this function can now return tokens for a given sentence
def tokenize(text):
    """Tokenize ``text`` with the merges learned in this notebook.

    The text is pre-tokenized into words, each word is split into characters,
    and every learned merge is then applied in the order it was learned.

    Args:
        text: the sentence to tokenize.

    Returns:
        A flat list of token strings for the whole sentence.
    """
    # Use the public backend_tokenizer accessor, as the other cells do,
    # rather than the private _tokenizer attribute.
    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    splits = [list(word) for word, _offset in pre_tokenize_result]
    for (left, right), merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == left and split[i + 1] == right:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    # Flatten with a comprehension; sum(splits, []) copies the growing list
    # on every addition.
    return [token for split in splits for token in split]

In [None]:
# Try the learned merges on a sentence; words with no complete merge path stay split into several tokens.
tokenize("This product has wasted all the money that i had earned")

['This',
 'Ġproduct',
 'Ġhas',
 'Ġwast',
 'ed',
 'Ġall',
 'Ġthe',
 'Ġmoney',
 'Ġthat',
 'Ġi',
 'Ġhad',
 'Ġear',
 'n',
 'ed']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()

# Map each document index to {term: tf-idf weight} for the terms with a non-zero weight.
tfidf_values = {}
for doc_index in range(len(corpus)):
    nonzero_columns = tfidf_matrix[doc_index, :].nonzero()[1]
    tfidf_values[doc_index] = {
        feature_names[column]: tfidf_matrix[doc_index, column]
        for column in nonzero_columns
    }

# Print every document's weights, numbering documents from 1.
for doc_index, values in tfidf_values.items():
    print(f"Document {doc_index + 1}:")
    for word, tfidf_value in values.items():
        print(f"{word}: {tfidf_value}")
    print("\n")

ImportError: DLL load failed while importing lapack_lite: The specified module could not be found.