# Libraries

In [None]:
import re
from typing import List

import contractions
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\htc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\htc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data

In [2]:
# Location of the Yelp "tip" dataset on the author's local machine.
json_file_path = r"E:\DATA SCIENCE\NLP-Tea\Data\yelp_academic_dataset_tip.json\yelp_academic_dataset_tip.json"
# lines=True: read the file as one JSON record per line.
df = pd.read_json(json_file_path, lines=True)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0


In [3]:
# Work on a small sample: the first 1000 tip texts as a plain Python list.
text_data = list(df["text"][:1000]) # first 1000 rows only
# Preview the first five texts.
text_data[:5]

['Avengers time with the ladies.',
 'They have lots of good deserts and tasty cuban sandwiches',
 "It's open even when you think it isn't",
 'Very decent fried chicken',
 'Appetizers.. platter special for lunch']

In [None]:
# NOTE(review): bare reference to word_tokenize (no call). The output recorded
# below looks like it came from an earlier version of this cell -- confirm.
word_tokenize

['mohamad', 'fawzy', 'jfhbf', 'dvhbfehyv']


In [180]:
def preprocessing(text: str) -> list[str]:
    """
    Clean one raw text and return its lemmatized tokens.

    Steps, in order:
    - Lowercases the text
    - Expands contractions (``contractions.fix``)
    - Removes every character that is not a letter, whitespace or apostrophe
    - Tokenizes on whitespace
    - Removes English stopwords
    - Lemmatizes each remaining token with ``WordNetLemmatizer``

    Args:
        text (str): The raw input text.

    Returns:
        list[str]: The cleaned tokens of this single text. The list is empty
        when nothing survives the cleaning (e.g. blank or punctuation-only
        input).

    Example:
        >>> preprocessing("It's open even when you think it isn't")
        ['open', 'even', 'think']
    """

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # 1. Normalization: lowercase, then expand contractions (e.g. "isn't").
    text_lower = contractions.fix(text.lower())

    # 2. Drop digits and punctuation; apostrophes are kept on purpose so that
    #    possessives left after contraction expansion are not glued together.
    text_no_punct = re.sub(r'[^a-zA-Z\s\']', '', text_lower)

    # 3. Tokens. str.split() with no argument splits on runs of whitespace and
    #    never yields empty strings, unlike re.split(r"\s+", ...), which
    #    produces '' for leading/trailing whitespace or an empty text.
    tokens = text_no_punct.split()

    # 4. Stop word removal
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # 5. Lemmatization
    return [lemmatizer.lemmatize(token) for token in filtered_tokens]

# Re-sample a much smaller set for the demo: only the first 10 rows.
text_data = list(df["text"][:10]) # first 10 rows only
# Each text becomes its own list of cleaned tokens.
preprocessed_text = [preprocessing(text) for text in text_data]
print(preprocessed_text[:5])

[['avenger', 'time', 'lady'], ['lot', 'good', 'desert', 'tasty', 'cuban', 'sandwich'], ['open', 'even', 'think'], ['decent', 'fried', 'chicken'], ['appetizer', 'platter', 'special', 'lunch']]


In [181]:
# Flatten the per-sentence token lists into a single list of all tokens.
all_tokens = [token for sentence_tokens in preprocessed_text for token in sentence_tokens]

# Vocabulary: the distinct tokens, in sorted order.
vocab = sorted(set(all_tokens))
print(vocab)

['appetizer', 'area', 'avenger', 'best', 'boring', 'center', 'cheeseburger', 'chicken', 'chili', 'city', 'cocacolaso', 'cool', 'cuban', 'cup', 'dec', 'decent', 'decorated', 'desert', 'downtown', 'eat', 'elf', 'even', 'far', 'fried', 'game', 'good', 'great', 'kid', 'lady', 'leave', 'lindenwold', 'lot', 'lunch', 'make', 'never', 'onion', 'open', "patco's", 'pickle', 'place', 'platter', 'pm', 'probably', 'relish', 'ride', 'sandwich', 'santa', 'saturday', 'silver', 'single', 'sleigh', 'special', 'spring', 'starbucks', 'stop', 'substitute', 'taco', 'tampa', 'tasty', 'th', 'think', 'time', 'train', 'ugh', 'vanilla', 'w', 'watch']


# Label Encoder

## From Scratch

In [200]:
def LabelEncoder(vocab: list) -> dict:
    """
    Creates a label encoder that maps each unique word to a unique integer index.

    Indices are contiguous, starting at 0, and follow the order in which each
    word first appears in ``vocab``. Repeated words are ignored after their
    first occurrence, so duplicates cannot leave gaps in the index range.

    Args:
        vocab (list): The vocabulary; normally a sorted list of unique words.

    Returns:
        dict: A dictionary mapping words to their corresponding index.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_tokens = dict.fromkeys(vocab)
    return {token: idx for idx, token in enumerate(unique_tokens)}


def Transform (preprocessed_text: list[list[str]], word_to_idx: dict) -> list[list[int]] :
    """
    Integer-encodes tokenized sentences using a word-to-index mapping.

    Args:
        preprocessed_text (list[list[str]]): One list of tokens per sentence.
        word_to_idx (dict): Mapping from each word to its integer index.

    Returns:
        list: One list per input sentence, holding the index of each of its
        tokens in the original order.

    Raises:
        KeyError: If a token is missing from ``word_to_idx``.
    """
    return [
        [word_to_idx[token] for token in tokens]
        for tokens in preprocessed_text
    ]


# Build the word -> index mapping, then encode every preprocessed sentence with it.
word_to_idx = LabelEncoder(vocab)
transformed_txt = Transform(preprocessed_text, word_to_idx)
print(word_to_idx)
print(transformed_txt[:3])


{'appetizer': 0, 'area': 1, 'avenger': 2, 'best': 3, 'boring': 4, 'center': 5, 'cheeseburger': 6, 'chicken': 7, 'chili': 8, 'city': 9, 'cocacolaso': 10, 'cool': 11, 'cuban': 12, 'cup': 13, 'dec': 14, 'decent': 15, 'decorated': 16, 'desert': 17, 'downtown': 18, 'eat': 19, 'elf': 20, 'even': 21, 'far': 22, 'fried': 23, 'game': 24, 'good': 25, 'great': 26, 'kid': 27, 'lady': 28, 'leave': 29, 'lindenwold': 30, 'lot': 31, 'lunch': 32, 'make': 33, 'never': 34, 'onion': 35, 'open': 36, "patco's": 37, 'pickle': 38, 'place': 39, 'platter': 40, 'pm': 41, 'probably': 42, 'relish': 43, 'ride': 44, 'sandwich': 45, 'santa': 46, 'saturday': 47, 'silver': 48, 'single': 49, 'sleigh': 50, 'special': 51, 'spring': 52, 'starbucks': 53, 'stop': 54, 'substitute': 55, 'taco': 56, 'tampa': 57, 'tasty': 58, 'th': 59, 'think': 60, 'time': 61, 'train': 62, 'ugh': 63, 'vanilla': 64, 'w': 65, 'watch': 66}
[[2, 61, 28], [31, 25, 17, 58, 12, 45], [36, 21, 60]]


## Built-in

In [197]:
# NOTE: this import rebinds the name LabelEncoder, replacing the from-scratch
# function of the same name defined in an earlier cell.
from sklearn.preprocessing import LabelEncoder

# Flatten the per-sentence token lists into one list of every token occurrence.
all_tokens = [token for sentence_tokens in preprocessed_text for token in sentence_tokens]

# Fit the encoder on all token occurrences and encode them in the same call.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_tokens)

# Show the integer code of every token occurrence
print("Encoded labels:", encoded_labels)

Encoded labels: [ 2 61 28 31 25 17 58 12 45 36 21 60 15 23  7  0 40 51 32  8 13 49  6 35
 38 43 64 10 22 47 14 59 44 37 48 50 65 46 20 16 62  5  9 62 29 30 41 33
 54 26 27 42  3 39 11 52  1 66 24 19 56 53 55  4 18 57 63 34]


In [199]:
# Encode each sentence separately with the already-fitted encoder; each
# result is a NumPy array of integer codes (see the output below).
encoded_sentences = [label_encoder.transform(sentence) for sentence in preprocessed_text]
encoded_sentences

[array([ 2, 61, 28]),
 array([31, 25, 17, 58, 12, 45]),
 array([36, 21, 60]),
 array([15, 23,  7]),
 array([ 0, 40, 51, 32]),
 array([ 8, 13, 49,  6, 35, 38, 43, 64, 10, 22]),
 array([47, 14, 59, 44, 37, 48, 50, 65, 46, 20, 16, 62,  5,  9, 62, 29, 30,
        41, 33, 54, 26, 27]),
 array([42,  3, 39, 11, 52,  1, 66, 24, 19]),
 array([56]),
 array([53, 55,  4, 18, 57, 63, 34])]

# One-Hot Encoding

In [282]:
def OneHotEncoder(vocab: list) -> dict :
    """
    Creates one-hot encoded vectors for each unique word in the vocabulary.

    Every vector has one slot per unique word; the slot at the word's index
    is 1 and all others are 0. Indices follow the order in which each word
    first appears in ``vocab``.

    Args:
        vocab (list): The vocabulary; normally a sorted list of unique tokens.
            Repeated tokens are ignored after their first occurrence.

    Returns:
        dict: A mapping from word to its one-hot encoded numpy array
        (integer dtype, length equal to the number of unique words).
    """
    # De-duplicate first (keeping first-seen order). Otherwise a repeated
    # token keeps its *last* position as index while the vector length is the
    # unique count, and the assignment below runs out of bounds.
    unique_words = list(dict.fromkeys(vocab))
    vocab_size = len(unique_words)

    one_hot_dict = {}
    for idx, word in enumerate(unique_words):
        # A fresh array per word, so the vectors do not share memory.
        vec = np.zeros(shape=vocab_size, dtype=int)
        vec[idx] = 1
        one_hot_dict[word] = vec

    return one_hot_dict


def TransformOneHot(preprocessed_text: list[list[str]], word_to_vec: dict) -> list[list[np.ndarray]]:
    """
    Replaces every token of every sentence with its one-hot vector.

    Args:
        preprocessed_text (list of list of str): Tokenized sentences.
        word_to_vec (dict): A mapping from word to one-hot vector.

    Returns:
        list of list of np.ndarray: One list per sentence, holding the vector
        looked up for each token in the original order. The vectors are the
        same objects stored in ``word_to_vec`` (no copies are made).

    Raises:
        KeyError: If a token is missing from ``word_to_vec``.
    """
    return [
        [word_to_vec[token] for token in tokens]
        for tokens in preprocessed_text
    ]




# Build the word -> one-hot vector table, then encode every sentence with it.
# Note: transformed_txt is reassigned here and now holds one-hot vectors.
one_hot_dict = OneHotEncoder(vocab)
transformed_txt = TransformOneHot(preprocessed_text, one_hot_dict)
print(transformed_txt[0])

[array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0])]


## Built-in

In [398]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from numpy import array

In [418]:
# Every token occurrence, and the sorted set of distinct tokens.
all_tokens = [token for sentence_tokens in preprocessed_text for token in sentence_tokens]
vocab = sorted(set(all_tokens))

print(len(all_tokens))
print(len(vocab))

# Hand-built word -> position mapping, used below to cross-check scikit-learn.
w_idx = dict(zip(vocab, range(len(vocab))))
print(w_idx)

68
67
{'appetizer': 0, 'area': 1, 'avenger': 2, 'best': 3, 'boring': 4, 'center': 5, 'cheeseburger': 6, 'chicken': 7, 'chili': 8, 'city': 9, 'cocacolaso': 10, 'cool': 11, 'cuban': 12, 'cup': 13, 'dec': 14, 'decent': 15, 'decorated': 16, 'desert': 17, 'downtown': 18, 'eat': 19, 'elf': 20, 'even': 21, 'far': 22, 'fried': 23, 'game': 24, 'good': 25, 'great': 26, 'kid': 27, 'lady': 28, 'leave': 29, 'lindenwold': 30, 'lot': 31, 'lunch': 32, 'make': 33, 'never': 34, 'onion': 35, 'open': 36, "patco's": 37, 'pickle': 38, 'place': 39, 'platter': 40, 'pm': 41, 'probably': 42, 'relish': 43, 'ride': 44, 'sandwich': 45, 'santa': 46, 'saturday': 47, 'silver': 48, 'single': 49, 'sleigh': 50, 'special': 51, 'spring': 52, 'starbucks': 53, 'stop': 54, 'substitute': 55, 'taco': 56, 'tampa': 57, 'tasty': 58, 'th': 59, 'think': 60, 'time': 61, 'train': 62, 'ugh': 63, 'vanilla': 64, 'w': 65, 'watch': 66}


In [419]:
# Label Encode
# Label Encode
# Fit the encoder on the vocabulary itself: one integer code per word,
# returned in the same order as vocab.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(array(vocab)) 


In [420]:
# print(integer_encoded)
# print(vocab)
# print(preprocessed_text )
# Pair each vocabulary word with the code scikit-learn assigned to it.
word2id = dict(zip(vocab, integer_encoded))
print(word2id)
print(w_idx)

# Encode every sentence twice: with the scikit-learn codes (datamodel) and
# with the hand-built indices (data_me), so the two can be compared.
datamodel = [[word2id[w] for w in sentence] for sentence in preprocessed_text]
data_me = [[w_idx[w] for w in sentence] for sentence in preprocessed_text]

# Side-by-side view of the first sentence under both encodings.
data_me[0], datamodel[0]

{'appetizer': 0, 'area': 1, 'avenger': 2, 'best': 3, 'boring': 4, 'center': 5, 'cheeseburger': 6, 'chicken': 7, 'chili': 8, 'city': 9, 'cocacolaso': 10, 'cool': 11, 'cuban': 12, 'cup': 13, 'dec': 14, 'decent': 15, 'decorated': 16, 'desert': 17, 'downtown': 18, 'eat': 19, 'elf': 20, 'even': 21, 'far': 22, 'fried': 23, 'game': 24, 'good': 25, 'great': 26, 'kid': 27, 'lady': 28, 'leave': 29, 'lindenwold': 30, 'lot': 31, 'lunch': 32, 'make': 33, 'never': 34, 'onion': 35, 'open': 36, "patco's": 37, 'pickle': 38, 'place': 39, 'platter': 40, 'pm': 41, 'probably': 42, 'relish': 43, 'ride': 44, 'sandwich': 45, 'santa': 46, 'saturday': 47, 'silver': 48, 'single': 49, 'sleigh': 50, 'special': 51, 'spring': 52, 'starbucks': 53, 'stop': 54, 'substitute': 55, 'taco': 56, 'tampa': 57, 'tasty': 58, 'th': 59, 'think': 60, 'time': 61, 'train': 62, 'ugh': 63, 'vanilla': 64, 'w': 65, 'watch': 66}
{'appetizer': 0, 'area': 1, 'avenger': 2, 'best': 3, 'boring': 4, 'center': 5, 'cheeseburger': 6, 'chicken': 7

([2, 61, 28], [2, 61, 28])

In [421]:
from numpy import array, reshape

# Turn the 1-D codes into a single column (n_words, 1) before fitting.
integer_encoded = integer_encoded.reshape(-1, 1)

# sparse_output=False: get a dense array back instead of a sparse matrix.
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

In [423]:
# Pair each vocabulary word with the matching row of the one-hot matrix
# (rows are in the same order as vocab).
word2onehot = dict(zip(vocab, onehot_encoded))

# Replace every token of every sentence with its one-hot row.
data = [[word2onehot[w] for w in sentence] for sentence in preprocessed_text]
print(data[0])

[array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]
