# Text Mining

In this project we would explore the methods of preprocessing text which includes:
- BOW (Bag-of-Words)
- TF-IDF
- Word2Vec
- OneHotEncoding
- GloVe
- FastText

## Import Libraries

In [1]:
# Common Python Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from copy import deepcopy

# For cleaner outptu
from tqdm import tqdm

# Deep Learning Libraries
import torch
from torch.utils.data import TensorDataset, DataLoader
import keras

# Text preprocessing/cleaning Libraries
import nltk
import re
import string
import contractions
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # or LancasterStemmer, RegexpStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

# Bag of Words Libraries
from sklearn.feature_extraction.text  import CountVectorizer

# TF-IDF libraries
from sklearn.feature_extraction.text import TfidfVectorizer

# Word2Vec Libraries
import gensim
from gensim.models import Word2Vec

# GloVe Libraries
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data Preprocessing Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Model Evaluation
from sklearn.metrics import f1_score, classification_report, confusion_matrix

2025-11-05 23:05:22.551284: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-05 23:05:22.552114: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-05 23:05:22.554622: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-05 23:05:22.559616: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762358722.567743   96196 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762358722.57

## GPU

In [2]:
# Detect available device
if torch.cuda.is_available():
    # check if ROCm backend is active
    if torch.version.hip is not None:
        backend = "ROCm"
    else:
        backend = "CUDA"

    device = torch.device("cuda")
    print(f"PyTorch is using GPU: {torch.cuda.get_device_name(0)}")
    print(f"Backend: {backend}")
else:
    device = torch.device("cpu")
    print("PyTorch is not using GPU — running on CPU")


PyTorch is using GPU: AMD Radeon Graphics
Backend: ROCm


In [3]:
device = torch.device("cpu") #use cpu for now

## Import the data

In [4]:
text_data_path = "./train_data.csv"
text_data = pd.read_csv(text_data_path, sep = ",")

In [5]:
# View the data
text_data.head()

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0


In [6]:
# text_data["label"].value_counts()

## Text Cleaning

text cleaning pipiline [source](https://stackoverflow.com/questions/48865150/pipeline-for-text-cleaning-processing-in-python)

In [7]:
def clean_text(text: str, language: str, tokenize: bool = False, remove_stop_words: bool = False, stem_words: bool = False, remove_url: bool = False):
    """
    #### Description:
    This function is to clean the text from stopwords, punctuation and return a clean text for further analysis

    Args:
        text (str):
            The dataframe containing the text data
        
        language (str):
            This are the available languages:
            - "catalan": "ca"
            - "czech": "cs"
            - "german": "de"
            - "greek": "el"mlaskjdlj
            - "english": "en"
            - "spanish": "es"
            - "finnish": "fi"
            - "french": "fr"
            - "hungarian": "hu"
            - "icelandic": "is"
            - "italian": "it"
            - "latvian": "lv"
            - "dutch": "nl"
            - "polish": "pl"
            - "portuguese": "pt"
            - "romanian": "ro"
            - "russian": "ru"
            - "slovak": "sk"
            - "slovenian": "sl"
            - "swedish": "sv"
            - "tamil": "ta"
        
        tokenize (bool):
            True = return tokenized data
            False = return untokenized data
        
        remove_stop_words (bool):
            True = remove stop words
            False = do not remove stop words

        stem_words (bool):
            True = get the base words (i.e. spraying -> spray)
            False = leave the words as is

        remove_url (bool):
            True = Remove the url in the text
            False = leave the text as is
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words(language))

    def tokenize_text(text):
        return [w for s in sent_tokenize(text) for w in word_tokenize(s)]
    
    def remove_special_characters(text):
        text = re.sub('[^a-zA-Z0-9]', ' ', text)
        text = re.sub('\s+', ' ', text)
        return text

    def stem_text(tokens):
        return [stemmer.stem(t) for t in tokens]

    def remove_stopwords_func(tokens):
        return [w for w in tokens if w not in stop_words]

    def remove_url_func(text):
        return re.sub(r'https?://\S+|www\.\S+', '', text)



    # Clean process
    text = contractions.fix(text)                        # fixing contraction
    text = text.strip().lower()                          # lowercase + trim

    if remove_url:
        text = remove_url_func(text)                     # remove url
    
    text = remove_special_characters(text)               # remove punctuation
    
    tokens = tokenize_text(text)                         # tokenize words

    if remove_stop_words:
        tokens = remove_stopwords_func(tokens)           # remove stopwords
        
    if stem_words:
        tokens = stem_text(tokens)                       # stemming

    if tokenize:
        return tokens                                    # return as tokens
    else:
        return " ".join(tokens)                          # return as string

In [8]:
# Test
sample = "I love the smell of freshly brewed coffee in the morning!"
cleaned = clean_text(sample, language="english", remove_stop_words=True, tokenize=False, remove_url=True)
print(cleaned)

love smell freshly brewed coffee morning


## Train Test Split

In [9]:
test_size = 0.20
val_size = 0.10

# Splitting the data into train and temp (which will be further split into validation and test)
train_df, test_df = train_test_split(text_data, test_size=test_size, stratify=text_data['label'], random_state=42) #stratify is used to ensure that the same proportion of each class is present in both the training and test sets

# Splitting train into validation and test sets
train_df, val_df = train_test_split(train_df, test_size=val_size, stratify=train_df['label'], random_state=42)

In [10]:
train_df.columns

Index(['text', 'label'], dtype='object')

In [11]:

# Spit the data to x and y values
x_train, y_train = train_df["text"], train_df["label"]
x_test, y_test = test_df["text"], test_df["label"]
x_val, y_val = val_df["text"], val_df["label"]

## Text Preprocessing Methods

### Bag-of-Words

Briefly, the bag-of-words preprocessing method only counts the occurence of each word and does not care about the ordering of the words.

**Example**

Both of these sentences become almost the same for BoW:

- “Coffee is life”
- “Life is coffee”

BoW doesn’t care that the word order is swapped — it just notes that both have “coffee”, “is”, and “life”.

In [12]:
bow_vectorizer = CountVectorizer(max_features=10000)  # you can change this limit
X_train_bow = bow_vectorizer.fit_transform(x_train)
X_test_bow = bow_vectorizer.transform(x_test)
X_val_bow = bow_vectorizer.transform(x_val)

In [13]:
print("Vocabulary size:", len(bow_vectorizer.get_feature_names_out()))
print("BoW shape (train):", X_train_bow.shape)
print("BoW shape (test):", X_test_bow.shape)
print("BoW shape (val):", X_val_bow.shape)

Vocabulary size: 10000
BoW shape (train): (12232, 10000)
BoW shape (test): (3398, 10000)
BoW shape (val): (1360, 10000)


### TF-IDF

TF-IDF or Term Frequency – Inverse Document Frequency.

It’s a way to represent how **important** a word is within a document, compared to the whole dataset (all documents).

1. Term Frequency (TF)
Formula:
$$
TF(t, d) = \frac{\text{Number of times term } t \text{ appears in document } d}{\text{Total number of terms in document } d}
$$

<br>

2. Inverse Document Frequency
$$
IDF(t) = \log{\frac{N}{n_t}}
$$
- $N$ = total number of documents
- $n_t$ = number of documents containing the term t

3. Combine TF-IDF
$$
\text{TF-IDF} = TF(t, d) × IDF(t)
$$

$Example:$

Let’s say you have 3 documents:
1. "coffee coffee bean taste nice"
2. "i love coffee"
3. "tea taste nice"

The word coffee appears often in doc1 and doc2, but not in doc3 → quite important!

The word taste appears in doc1 and doc3 → less special.

The word nice appears in doc1 and doc3 → also common.

In [14]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # you can change this limit
X_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
X_test_tfidf = tfidf_vectorizer.transform(x_test)
X_val_tfidf = tfidf_vectorizer.transform(x_val)

In [15]:
print("Vocabulary size:", len(tfidf_vectorizer.get_feature_names_out()))
print("tfidf shape (train):", X_train_tfidf.shape)
print("tfidf shape (test):", X_test_tfidf.shape)
print("tfidf shape (val):", X_val_tfidf.shape)

Vocabulary size: 10000
tfidf shape (train): (12232, 10000)
tfidf shape (test): (3398, 10000)
tfidf shape (val): (1360, 10000)


### Word2Vec

In [16]:
w2v_x_train = deepcopy(pd.DataFrame(x_train))
w2v_x_test = deepcopy(pd.DataFrame(x_test))
w2v_x_val = deepcopy(pd.DataFrame(x_val))
w2v_x_train

Unnamed: 0,text
10369,Carmakers registered the fewest new vehicles i...
5022,Credit Spread Improvement And A Quick Update O...
1542,MOGU Files Annual Report on Form 20-F for Fisc...
6774,$GMAB: Genmab announces net sales of DARZALEX ...
11076,Swiss Producer &amp; Import Prices (M/M) Jun: ...
...,...
4964,$FOMC - Fomo reports Q1 results https://t.co/...
15379,$ULH - Universal Logistics Holdings: Recent St...
5977,China’s banking regulator has asked lenders to...
6323,Quarles Says Fed Should Have Hiked Rates Befor...


In [17]:
w2v_x_train["tokenized"] = w2v_x_train["text"].apply(
    lambda x: clean_text(
        text=x, 
        language="english",
        tokenize=True,
        remove_stop_words=True,
        remove_url=True
    )
)
w2v_x_test["tokenized"] = w2v_x_test["text"].apply(
    lambda x: clean_text(
        text=x, 
        language="english",
        tokenize=True,
        remove_stop_words=True,
        remove_url=True
    )
)
w2v_x_val["tokenized"] = w2v_x_val["text"].apply(
    lambda x: clean_text(
        text=x, 
        language="english",
        tokenize=True,
        remove_stop_words=True,
        remove_url=True
    )
)



In [18]:
w2v_x_train

Unnamed: 0,text,tokenized
10369,Carmakers registered the fewest new vehicles i...,"[carmakers, registered, fewest, new, vehicles,..."
5022,Credit Spread Improvement And A Quick Update O...,"[credit, spread, improvement, quick, update, a..."
1542,MOGU Files Annual Report on Form 20-F for Fisc...,"[mogu, files, annual, report, form, 20, f, fis..."
6774,$GMAB: Genmab announces net sales of DARZALEX ...,"[gmab, genmab, announces, net, sales, darzalex..."
11076,Swiss Producer &amp; Import Prices (M/M) Jun: ...,"[swiss, producer, amp, import, prices, jun, 0,..."
...,...,...
4964,$FOMC - Fomo reports Q1 results https://t.co/...,"[fomc, fomo, reports, q1, results]"
15379,$ULH - Universal Logistics Holdings: Recent St...,"[ulh, universal, logistics, holdings, recent, ..."
5977,China’s banking regulator has asked lenders to...,"[china, banking, regulator, asked, lenders, pr..."
6323,Quarles Says Fed Should Have Hiked Rates Befor...,"[quarles, says, fed, hiked, rates, taper, fini..."


In [19]:
# make the tokenized text into one list of tokens for word2vec to learn
train_tokenized_word_list = w2v_x_train["tokenized"].to_list()
test_tokenized_word_list = w2v_x_test["tokenized"].to_list()
val_tokenized_word_list = w2v_x_val["tokenized"].to_list()

In [20]:
# Train the word2vec odel
w2v_model = Word2Vec(
    sentences=train_tokenized_word_list,
    vector_size=100,
    window=5, # Max distance from target word
    min_count=2,
    sg=1, # use skip gram (0 for CBOW)
    workers=-1 #all cores to train
)

In [21]:
# Vectorize the text data after tokenizing the text
def sentence_vector(tokens, model):
    valid_words = [w for w in tokens if w in model.wv]
    if not valid_words:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[valid_words], axis=0)

In [22]:
w2v_x_train["vectorized"] = w2v_x_train["tokenized"].apply(
    lambda x: sentence_vector(x, w2v_model)
)

w2v_x_test["vectorized"] = w2v_x_test["tokenized"].apply(
    lambda x: sentence_vector(x, w2v_model)
)

w2v_x_val["vectorized"] = w2v_x_val["tokenized"].apply(
    lambda x: sentence_vector(x, w2v_model)
)

In [23]:
w2v_x_train

Unnamed: 0,text,tokenized,vectorized
10369,Carmakers registered the fewest new vehicles i...,"[carmakers, registered, fewest, new, vehicles,...","[0.0027053908, -0.00017777388, 0.0019652518, -..."
5022,Credit Spread Improvement And A Quick Update O...,"[credit, spread, improvement, quick, update, a...","[-0.00094107055, 0.0015170475, -0.0007388296, ..."
1542,MOGU Files Annual Report on Form 20-F for Fisc...,"[mogu, files, annual, report, form, 20, f, fis...","[0.0026703032, 0.0012895016, 0.0003065228, 0.0..."
6774,$GMAB: Genmab announces net sales of DARZALEX ...,"[gmab, genmab, announces, net, sales, darzalex...","[0.003535737, 0.000555583, 0.00039419509, -0.0..."
11076,Swiss Producer &amp; Import Prices (M/M) Jun: ...,"[swiss, producer, amp, import, prices, jun, 0,...","[-0.0003388215, -0.00025418773, -3.8586302e-05..."
...,...,...,...
4964,$FOMC - Fomo reports Q1 results https://t.co/...,"[fomc, fomo, reports, q1, results]","[0.0011461508, 0.0030346797, -0.0030374527, -0..."
15379,$ULH - Universal Logistics Holdings: Recent St...,"[ulh, universal, logistics, holdings, recent, ...","[0.004229268, 0.00030185367, -0.0015749421, 0...."
5977,China’s banking regulator has asked lenders to...,"[china, banking, regulator, asked, lenders, pr...","[0.00028794058, 0.00040490009, -0.0008601295, ..."
6323,Quarles Says Fed Should Have Hiked Rates Befor...,"[quarles, says, fed, hiked, rates, taper, fini...","[-0.002324093, -0.0057031424, -0.00070241006, ..."


In [24]:
# Get the vectors and make it into a 2D array
# [[a, b, c], # label 1
#  [d, e, f], # label 2
#  [g, h, i], # label 3
#  ...      ,
#  [., ., .]] # label n

X_train_vec = np.vstack(w2v_x_train["vectorized"].values)
X_test_vec  = np.vstack(w2v_x_test["vectorized"].values)
X_val_vec   = np.vstack(w2v_x_val["vectorized"].values)

In [25]:
X_train_vec

array([[ 2.70539080e-03, -1.77773880e-04,  1.96525175e-03, ...,
         7.10904482e-04,  7.72099011e-04, -1.32175651e-03],
       [-9.41070553e-04,  1.51704752e-03, -7.38829607e-04, ...,
         5.96252130e-03, -3.06868623e-03, -9.02068045e-04],
       [ 2.67030322e-03,  1.28950155e-03,  3.06522794e-04, ...,
        -2.43260889e-04, -1.66026328e-03, -2.32353155e-03],
       ...,
       [ 2.87940580e-04,  4.04900085e-04, -8.60129483e-04, ...,
        -4.57154354e-04,  3.08528263e-03,  1.28500035e-03],
       [-2.32409290e-03, -5.70314238e-03, -7.02410063e-04, ...,
         2.60295928e-03, -1.62083865e-03,  1.60730939e-04],
       [-2.91074999e-03,  1.64213008e-03,  4.05142596e-03, ...,
        -1.28726102e-03,  9.38661979e-05, -1.52059016e-03]])

### One-Hot-Encoding

In [26]:
# Create the vectorizer
vectorizer = CountVectorizer(binary=True)  # binary=True = one-hot style

# Fit and transform the sentences
x_train_ohe = vectorizer.fit_transform(x_train)
x_test_ohe = vectorizer.fit_transform(x_test)
x_val_ohe = vectorizer.fit_transform(x_val)

In [27]:
# Show result
print(vectorizer.get_feature_names_out())
print(x_train_ohe.toarray())

['00' '000' '000x' ... 'zwiyhkfxxj' 'zynx' 'zz9ugvmizx']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Pretrained Models

We will implement GloVe and FastText

### GloVe

Steps in using GloVe:
1. Preprocess and clean the text (already did it above)
    - (lowercase, remove punctuation, stopwords, etc.)

2. Tokenize the text
    - turn sentences into lists of words.

3. Build the vocabulary
    - list of all unique words in your dataset.

4. Download GloVe pretrained model
    - e.g. glove.6B.100d.txt from Stanford.
    - How to Download:
        - Linux/mac:
            
            ```curl https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip -o glove_models.zip```
        - Windows:
            
            ```wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip```

5. Load GloVe embeddings
    - read the file into a dictionary:
<br>    { "coffee": [0.1, 0.3, ...], "dog": [-0.2, 0.4, ...] }

6. Create the embedding matrix
    - match each word in your vocabulary with its GloVe vector.
Words not found in GloVe get a zero vector or random small values.

#### Clean the Text

In [28]:
# Tokenizing the text
gloVe_x_train = deepcopy(pd.DataFrame(x_train))
gloVe_x_test = deepcopy(pd.DataFrame(x_test))
gloVe_x_val = deepcopy(pd.DataFrame(x_val))

gloVe_x_train["clean_text"] = gloVe_x_train["text"].apply(
    lambda x: clean_text(
        text=x, 
        language="english",
        remove_stop_words=True,
        remove_url=True
    )
)
gloVe_x_test["clean_text"] = gloVe_x_test["text"].apply(
    lambda x: clean_text(
        text=x, 
        language="english",
        remove_stop_words=True,
        remove_url=True
    )
)
gloVe_x_val["clean_text"] = gloVe_x_val["text"].apply(
    lambda x: clean_text(
        text=x, 
        language="english",
        remove_stop_words=True,
        remove_url=True
    )
)

#### Build Vocabulary (Tokenizer)

In [29]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# get the tokenized text and make it into a list
gloVe_x_train_whole_text = gloVe_x_train["clean_text"].to_list()

gloVe_tokenizer = Tokenizer(num_words=20000) #this keeps only the most frequent 20000 words, you can change the number of words limit depending on how much RAM you have.
gloVe_tokenizer.fit_on_texts(gloVe_x_train_whole_text)

print("Number of unique words in dictionary =", len(gloVe_tokenizer.word_index))
print("Dictionary is =", gloVe_tokenizer.word_index)

Number of unique words in dictionary = 19790


In [30]:
gloVe_x_train["clean_text"]

10369    carmakers registered fewest new vehicles eu si...
5022     credit spread improvement quick update amp p 5...
1542     mogu files annual report form 20 f fiscal year...
6774           gmab genmab announces net sales darzalex q2
11076    swiss producer amp import prices jun 0 3 prev ...
                               ...                        
4964                          fomc fomo reports q1 results
15379    ulh universal logistics holdings recent streng...
5977     china banking regulator asked lenders provide ...
6323       quarles says fed hiked rates taper finished bbg
6747                bmi badger meter beats 0 05 beats revs
Name: clean_text, Length: 12232, dtype: object

#### Make a Embedding Matrix

we will make an embedding matrix that would look like this:
```
[
  [0, 0, 0, ...],  # padding (index 0)
  [?, ?, ?, ...],  # word 1: "i"
  [?, ?, ?, ...],  # word 2: "love"
  [?, ?, ?, ...],  # word 3: "coffee"
]

```

Each word is represented by its own vector, taken from the pre-trained **GloVe** file  
(e.g. `glove.6B.50d.txt`, `glove.6B.100d.txt`, `glove.6B.200d.txt`, `glove.6B.300d.txt`).


note: do this after downloading glove

In [31]:
def glove_embedding_for_vocab(filepath: str, word_index: dict, embedding_dim: int):
    """
    #### Description
    embedding_for_vocab that loads pre-trained GloVe word vectors and creates an embedding matrix for the vocabulary.

    Args:
        filepath (str):
            Path to GloVe File (e.g. glove.6B.50d.txt, glove.6B.100d.txt, glove.6B.200d.txt, glove.6B.300d.txt)
        
        word_index (dict):
            The dictionary created by the tokenizer, mapping words to indices.

        embedding_dim (int):
            The dimensionality of the word vectors (e.g., 50-dimensional vectors)
    """

    vocab_size = len(word_index) + 1  # +1 for padding token (index 0)
    embedding_matrix_vocab = np.zeros((vocab_size, embedding_dim)) # prepare empty embedding matrix

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split() # take the word and vectors since in the file its like this: word [0.100, 0.200, ...]
            if word in word_index:
                idx = word_index[word] # get the index number of the word from dictionary we made :{"word": idx}
                embedding_matrix_vocab[idx] = np.array(vector, dtype=np.float32)[:embedding_dim] # pass the vectors to the index

    return embedding_matrix_vocab

In [32]:
# The glove model paths
# gloVe_path = "./glove_models/glove.6B.50d.txt"
gloVe_path = "./glove_models/glove.6B.100d.txt"
# gloVe_path = "./glove_models/glove.6B.200d.txt"
# gloVe_path = "./glove_models/glove.6B.300d.txt"

# embedding_dim
embedding_size = 100 # please change this according to the model you are using

gloVe_embedding_matrix = glove_embedding_for_vocab(
    filepath=gloVe_path,
    word_index=gloVe_tokenizer.word_index,
    embedding_dim=embedding_size
)

In [33]:
print(f"embedding matrix shape: {gloVe_embedding_matrix.shape}")
print(f"length of our vocabulary: {len(gloVe_tokenizer.word_index)}")

embedding matrix shape: (19791, 100)
length of our vocabulary: 19790


since the output shape is (36133, 100), it means:
- there are 36,132 words in the vocabulary
- plus 1 extra row at index 0, which is a zero vector used for padding
- each word vector has 100 dimensions (because we used glove.6B.100d.txt)

Overview of the matrix (example):
```
Index 0 → [0, 0, 0, ..., 0]           ← padding token
Index 1 → [0.12, -0.05, 0.30, ..., ]  ← vector for "i"
Index 1 → [0.32, -0.09, 0.10, ..., ]  ← vector for "like"
Index 2 → [0.45, -0.10, 0.19, ..., ]  ← vector for "coffee"
...
Index 36132 → [0.03, 0.11, -0.08, ...] ← vector for last word
```

#### Get the Tokenized Numeric Text

1. Create/get embedding matrix
index 0 -> [0, 0, 0, ...]        # padding
index 1 -> [0.25, -0.10, 0.45...] # "i"
index 2 -> [-0.12, 0.38, 0.07...] # "love"
index 3 -> [0.33, 0.21, -0.08...] # "coffee"

2. Take the sample text Sample Text:
    "I love coffee"

3. Tokenize the text
    
    [1, 2, 3]

4. Add padding to the text so all sentences have equal lengths **(OPTIONAL)**
    
    [1, 2, 3, 0, 0]

5. When training, the model ould lookup the embedding matrix

    Tokenized text = [1, 2, 3, 0, 0]

    embedding matrix lookup
    ```
    [[0.25, -0.10, 0.45, ...],
    [-0.12, 0.38, 0.07, ...],
    [0.33, 0.21, -0.08, ...],
    [0, 0, 0, ...],
    [0, 0, 0, ...]]
    ```

In [34]:
gloVe_x_train

Unnamed: 0,text,clean_text
10369,Carmakers registered the fewest new vehicles i...,carmakers registered fewest new vehicles eu si...
5022,Credit Spread Improvement And A Quick Update O...,credit spread improvement quick update amp p 5...
1542,MOGU Files Annual Report on Form 20-F for Fisc...,mogu files annual report form 20 f fiscal year...
6774,$GMAB: Genmab announces net sales of DARZALEX ...,gmab genmab announces net sales darzalex q2
11076,Swiss Producer &amp; Import Prices (M/M) Jun: ...,swiss producer amp import prices jun 0 3 prev ...
...,...,...
4964,$FOMC - Fomo reports Q1 results https://t.co/...,fomc fomo reports q1 results
15379,$ULH - Universal Logistics Holdings: Recent St...,ulh universal logistics holdings recent streng...
5977,China’s banking regulator has asked lenders to...,china banking regulator asked lenders provide ...
6323,Quarles Says Fed Should Have Hiked Rates Befor...,quarles says fed hiked rates taper finished bbg


In [35]:
# Tokenize the text
gloVe_x_train["tokenized_text"] = gloVe_tokenizer.texts_to_sequences(gloVe_x_train["clean_text"])
gloVe_x_test["tokenized_text"] = gloVe_tokenizer.texts_to_sequences(gloVe_x_test["clean_text"])
gloVe_x_val["tokenized_text"] = gloVe_tokenizer.texts_to_sequences(gloVe_x_val["clean_text"])

# add padding to uniformize the length of the tokenized_text
max_num_words = gloVe_x_train["clean_text"].str.len().max()

gloVe_x_train_padded = pad_sequences(
    gloVe_x_train["tokenized_text"], 
    maxlen=max_num_words, 
    padding='post' # pad zeros at the end (or "pre")
)
gloVe_x_test_padded = pad_sequences(
    gloVe_x_test["tokenized_text"], 
    maxlen=max_num_words, 
    padding='post' # pad zeros at the end (or "pre")
)
gloVe_x_val_padded = pad_sequences(
    gloVe_x_val["tokenized_text"], 
    maxlen=max_num_words, 
    padding='post' # pad zeros at the end (or "pre")
)

# Truncating the words
# use "pre" if the beginning of sentence is not important
# use "post" if the end of the sentence is not important

# gloVe_x_train_padded = pad_sequences(
#     gloVe_x_train["tokenized_text"], 
#     maxlen=100, # Cut or padd the sentences to length 100 
#     padding='post' # pad zeros at the end (or "pre")
#     truncating='post' # Cut off tokens after the length 100
# )
# gloVe_x_test_padded = pad_sequences(
#     gloVe_x_test["tokenized_text"], 
#     maxlen=100,
#     padding='post'
#     truncating='post'
# )
# gloVe_x_val_padded = pad_sequences(
#     gloVe_x_val["tokenized_text"], 
#     maxlen=100,
#     padding='post'
#     truncating='post'
# )

after padding, we would pass the padded data into a model

#### Passing the Data to a Model

we will use a simple LSTM for demonstration

##### Tensorflow LSTM

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

vocab_size = len(gloVe_tokenizer.word_index) + 1  # same vocab size as embedding matrix
embedding_dim = 100  # same as GloVe vector size (100d)
max_len = 100  # same as you used for pad_sequences

TF_GloVe_model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[gloVe_embedding_matrix],  # load your pretrained GloVe weights here
        input_length=max_len,
        trainable=False  # freeze the GloVe weights, or set True to fine-tune
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # or softmax if you have multiple classes
])

TF_GloVe_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
TF_GloVe_model.summary()

2025-11-05 23:05:36.413065: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [37]:
TF_GloVe_model.fit(gloVe_x_train_padded, y_train, epochs=2)

Epoch 1/2
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 66ms/step - accuracy: 0.0476 - loss: -1462.9939
Epoch 2/2
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 66ms/step - accuracy: 0.0481 - loss: -18537.6016


<keras.src.callbacks.history.History at 0x71bf2cb67cd0>

##### Pytorch Model

In [38]:
import torch
import torch.nn as nn

# Create a model function
class define_gloVe_model(nn.Module):
    def __init__(self, embedding_matrix : np.ndarray):
        super().__init__()
        vocab_size, embedding_dim = embedding_matrix.shape

        # make a tensor for the embedding matrix
        torch_GloVe_embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

        self.embedding = nn.Embedding.from_pretrained(torch_GloVe_embedding_matrix, 
                                                        freeze=True #to preserve GloVe
                                                        )

        self.hidden_size = 128 # lets have 128 lstm blocks
        self.num_layers = 3 # 3 layers for now since training is on cpu
        self.lstm = nn.LSTM(input_size=embedding_dim, # since each word would be inputted one by one and embedded into a vector from the GloVe embedding, the input size would be the the same as the length of the embedding (e.g. glove_100d would have the size of 100)
                            hidden_size=self.hidden_size,
                            num_layers=self.num_layers, 
                            #dropout = 0.5 #OPTIONAL and since this is a small model this wont be use
                            batch_first=True # ensures the input is [batch, sequence legngth, embed_dim]
                            )
        
        # Add a dense layer for output (e.g., binary classification)
        self.dense = nn.Linear(self.hidden_size, 21)
        self.softmax = nn.Softmax(dim=1) # optional, if doing binary classification

    def forward(self, input_tensor): # Forward pass
            # 1️⃣ Convert tokenized_sentence [1,2,3,...,0] → embeddings
            embeds = self.embedding(input_tensor)
            batch_size = input_tensor.size(0)

            # Initialize fresh clear memory every batch
            h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
            c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)

            # LSTM Forward pass
            out, _ = self.lstm(embeds, (h0, c0)) # out: [batch_size, seq_len, hidden_size]
            out = self.dense(out[:, -1, :]) # take the last time step and pass it to the dense layer
            self.softmax = nn.Softmax(dim=1) # optional, if doing binary classification
            return out


In [39]:
# Convert to tensor
X_train_tensor = torch.tensor(gloVe_x_train_padded, dtype=torch.long)  # tokenized sequences
y_train_tensor = torch.tensor(y_train.to_list(), dtype=torch.long)  # labels

X_test_tensor = torch.tensor(gloVe_x_test_padded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.to_list(), dtype=torch.long)

X_val_tensor = torch.tensor(gloVe_x_val_padded, dtype=torch.long)
y_val_tensor = torch.tensor(y_val.to_list(), dtype=torch.long)

# Make Data Loader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)


In [40]:
gloVe_model = define_gloVe_model(embedding_matrix=gloVe_embedding_matrix).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gloVe_model.parameters(), lr=1e-3)

# Training Loop
num_epochs = 2

for epoch in tqdm(range(num_epochs), desc="Training pytorch GloVe model"):
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = gloVe_model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")



Training pytorch GloVe model:  50%|█████     | 1/2 [00:27<00:27, 27.72s/it]

Epoch 1/2, Loss: 2.6352


Training pytorch GloVe model: 100%|██████████| 2/2 [00:55<00:00, 27.84s/it]

Epoch 2/2, Loss: 2.6177





In [41]:
with torch.no_grad():
    correct, total = 0, 0
    for batch_x, batch_y in train_loader:
        outputs = gloVe_model(batch_x.to(device))
        _, preds = torch.max(outputs, 1)
        correct += (preds.cpu() == batch_y).sum().item()
        total += batch_y.size(0)
    print("Train Accuracy:", correct / total)

Train Accuracy: 0.20863309352517986


### FastText

This pretrained embedding model represents words as combinations of subword units (like prefixes and suffixes), allowing it to understand and generate vectors even for words with typos or unseen forms.

Making a FastText model process:
1. Preprocess and clean the text (already did it above)
    - (lowercase, remove punctuation, stopwords, etc.)

2. Tokenize the text
    - turn sentences into lists of words.

3. Build the vocabulary
    - list of all unique words in your dataset.

4. Download FastText pretrained model or make your own embedding matrix

5. Load FastText embeddings per word then average it for the whole sentence
    - example:
    ```
        sentence: ["i like coffee"]
        tokenized: ["i", "like", "coffee"]
        vectors:  [[1,2,1], [3,1,2], [5,4,3]]
        average:  [3, 2.33, 2]
    ```
6. pass it to the model

In [42]:
# Tokenizing the text
fastText_x_train = deepcopy(pd.DataFrame(x_train))
fastText_x_test = deepcopy(pd.DataFrame(x_test))
fastText_x_val = deepcopy(pd.DataFrame(x_val))

fastText_x_train["clean_text"] = fastText_x_train["text"].apply(
    lambda x: clean_text(
        text=x, 
        language="english",
        remove_stop_words=True,
        remove_url=True,
        tokenize=True
    )
)
fastText_x_test["clean_text"] = fastText_x_test["text"].apply(
    lambda x: clean_text(
        text=x, 
        language="english",
        remove_stop_words=True,
        remove_url=True,
        tokenize=True
    )
)
fastText_x_val["clean_text"] = fastText_x_val["text"].apply(
    lambda x: clean_text(
        text=x, 
        language="english",
        remove_stop_words=True,
        remove_url=True,
        tokenize=True
    )
)

#### using your own FastText embedding

In [43]:
from gensim.models import FastText

# sentences = tokenized list of token lists (make vocab)
fastText_x_train_sentences = fastText_x_train["clean_text"].to_list()

# Train FastText model (this will create a embedding matrix)
fastText_embedding_vectors_custom = FastText(sentences=fastText_x_train_sentences,
                                vector_size=100, # each word gets a 100-dimensional embedding
                                window=5, # considers 5 words before/after each word
                                min_count=2, # ignores rare words (appearing < 2 times)
                                sg=1) # uses skip-gram (better for rare words in comparison to cbow)

# Convert each sentence to a vector
def sentence_vec_custom(tokens: list, embedding_matrix: gensim.models.fasttext.FastText)  -> np.ndarray:
    vecs = [embedding_matrix.wv[w] for w in tokens if w in embedding_matrix.wv.key_to_index] # since each word has a vector, we would average the vectors (as a summary to feed the model)
    # further explanation
        # - This checks each word - if it exists in FastText’s vocabulary, take its vector.
        # - Then it averages all word vectors in that sentence → 1 vector per sentence.
        # - If the sentence has no known words, returns a zero vector instead.
    
    return np.mean(vecs, axis=0) if vecs else np.zeros(100, dtype=np.float32) # turns the vectors into a (number_of_sentence, 100) vector

fastText_custom_x_train_vectors = np.array([sentence_vec_custom(tokens, fastText_embedding_vectors_custom) for tokens in fastText_x_train_sentences])

#### Using Pretrained FastText embedding
There are two ways to download FastText from fastText themselves or from gensim

##### Features of each model
| Model Name                        | Source              | Dim | Subword Info | Size  | Load Method  | Coverage | RAM Usage |
| --------------------------------- | ------------------- | --- | ------------- | ----- | ------------- | -------- | ---------- |
| `fasttext-wiki-news-subwords-300` (gensim.downloader.api.load) | Wiki + News         | 300 | ✅ Yes        | ~1 GB | `api.load()`  | Medium| Light|
| `cc.en.300.vec.gz` (via [FastText](https://fasttext.cc/docs/en/crawl-vectors.html))  | Common Crawl + Wiki | 300 | ✅ Yes | ~7 GB | Manual Load   | High| Heavy |


##### Option 1: FastText
To download directly without the terminal go to [FastText](https://fasttext.cc/docs/en/crawl-vectors.html) webapge.

Or Download the pretrained embedding matrix by running the following command in the terminal
```bash
wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz

gunzip cc.en.300.vec.gz
```
<br>
or if you are using python

```
import fasttext
import fasttext.util

fasttext.util.download_model('en', if_exists='ignore')
```
<br>
you can replace `en` in `cc.en.300.vec.gz` to your prefered language.
<br>Example for German:

```
wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz

gunzip cc.en.300.vec.gz
```

Below are the complete 157 language codes:
| Language             | Code | Language             | Code | Language           | Code |
| -------------------- | ---- | -------------------- | ---- | ------------------ | ---- |
| Afrikaans            | af   | Albanian             | sq   | Alemannic          | als  |
| Amharic              | am   | Arabic               | ar   | Aragonese          | an   |
| Armenian             | hy   | Assamese             | as   | Asturian           | ast  |
| Azerbaijani          | az   | Bashkir              | ba   | Basque             | eu   |
| Bavarian             | bar  | Belarusian           | be   | Bengali            | bn   |
| Bihari               | bh   | Bishnupriya Manipuri | bpy  | Bosnian            | bs   |
| Breton               | br   | Bulgarian            | bg   | Burmese            | my   |
| Catalan              | ca   | Cebuano              | ceb  | Central Bicolano   | bcl  |
| Chechen              | ce   | Chinese              | zh   | Chuvash            | cv   |
| Corsican             | co   | Croatian             | hr   | Czech              | cs   |
| Danish               | da   | Divehi               | dv   | Dutch              | nl   |
| Eastern Punjabi      | pa   | Egyptian Arabic      | arz  | Emilian-Romagnol   | eml  |
| English              | en   | Erzya                | myv  | Esperanto          | eo   |
| Estonian             | et   | Fiji Hindi           | hif  | Finnish            | fi   |
| French               | fr   | Galician             | gl   | Georgian           | ka   |
| German               | de   | Goan Konkani         | gom  | Greek              | el   |
| Gujarati             | gu   | Haitian              | ht   | Hebrew             | he   |
| Hill Mari            | mrj  | Hindi                | hi   | Hungarian          | hu   |
| Icelandic            | is   | Ido                  | io   | Ilokano            | ilo  |
| Indonesian           | id   | Interlingua          | ia   | Irish              | ga   |
| Italian              | it   | Japanese             | ja   | Javanese           | jv   |
| Kannada              | kn   | Kapampangan          | pam  | Kazakh             | kk   |
| Khmer                | km   | Kirghiz              | ky   | Korean             | ko   |
| Kurdish (Kurmanji)   | ku   | Kurdish (Sorani)     | ckb  | Latin              | la   |
| Latvian              | lv   | Limburgish           | li   | Lithuanian         | lt   |
| Lombard              | lmo  | Low Saxon            | nds  | Luxembourgish      | lb   |
| Macedonian           | mk   | Maithili             | mai  | Malagasy           | mg   |
| Malay                | ms   | Malayalam            | ml   | Maltese            | mt   |
| Manx                 | gv   | Marathi              | mr   | Mazandarani        | mzn  |
| Meadow Mari          | mhr  | Minangkabau          | min  | Mingrelian         | xmf  |
| Mirandese            | mwl  | Mongolian            | mn   | Nahuatl            | nah  |
| Neapolitan           | nap  | Nepali               | ne   | Newar              | new  |
| North Frisian        | frr  | Northern Sotho       | nso  | Norwegian (Bokmål) | no   |
| Norwegian (Nynorsk)  | nn   | Occitan              | oc   | Oriya              | or   |
| Ossetian             | os   | Palatinate German    | pfl  | Pashto             | ps   |
| Persian              | fa   | Piedmontese          | pms  | Polish             | pl   |
| Portuguese           | pt   | Quechua              | qu   | Romanian           | ro   |
| Romansh              | rm   | Russian              | ru   | Sakha              | sah  |
| Sanskrit             | sa   | Sardinian            | sc   | Scots              | sco  |
| Scottish Gaelic      | gd   | Serbian              | sr   | Serbo-Croatian     | sh   |
| Sicilian             | scn  | Sindhi               | sd   | Sinhalese          | si   |
| Slovak               | sk   | Slovenian            | sl   | Somali             | so   |
| Southern Azerbaijani | azb  | Spanish              | es   | Sundanese          | su   |
| Swahili              | sw   | Swedish              | sv   | Tagalog            | tl   |
| Tajik                | tg   | Tamil                | ta   | Tatar              | tt   |
| Telugu               | te   | Thai                 | th   | Tibetan            | bo   |
| Turkish              | tr   | Turkmen              | tk   | Ukrainian          | uk   |
| Upper Sorbian        | hsb  | Urdu                 | ur   | Uyghur             | ug   |
| Uzbek                | uz   | Venetian             | vec  | Vietnamese         | vi   |
| Volapük              | vo   | Walloon              | wa   | Waray              | war  |
| Welsh                | cy   | West Flemish         | vls  | West Frisian       | fy   |
| Western Punjabi      | pnb  | Yiddish              | yi   | Yoruba             | yo   |
| Zazaki               | diq  | Zeelandic            | zea  |                    |      |


NOTE: Uncomment the code if you have a huge amount of free ram maybe around > 10GB to convert the 300 long vector to 100 for fasttext

In [44]:
# # download fasttext 
# import fasttext
# import fasttext.util
# import gensim.download

# fasttext.util.download_model('en', if_exists='ignore') 
# fastText_model = fasttext.load_model('cc.en.300.bin')
# fastText_model_reduced = fasttext.util.reduce_model(fastText_model, 100)
# ft.save_model('cc.en.100.bin')


# fastText_model_path = "./fastText_models/cc.en.300.bin"
# fastText_embedding_matrix_pretrained = fasttext.load_model(fastText_model_path)

In [45]:
# Test FastText
# fastText_embedding_matrix_pretrained.get_label_id("coffee")

In [46]:
# def sentence_vec_pretrained(tokens: list, model: fasttext.FastText._FastText)  -> np.ndarray:
#     vecs = [model.get_word_vector(w) for w in tokens if isinstance(w, str) and w.strip()] # since each word has a vector, we would average the vectors (as a summary to feed the model)
#     # further explanation
#         # - This checks each word - if it exists in FastText’s vocabulary, take its vector.
#         # - Then it averages all word vectors in that sentence → 1 vector per sentence.
#         # - If the sentence has no known words, returns a zero vector instead.
    
#     return np.mean(vecs, axis=0) if vecs else np.zeros(model.get_dimension(), dtype=np.float32) # turns the vectors into a (number_of_sentence, 100) vector

# fastText_pretrained_x_train_vectors = np.array([sentence_vec_pretrained(tokens, fastText_embedding_matrix_pretrained) for tokens in fastText_x_train_sentences])

##### Option 2: Gensim

a lighter faster version

In [47]:
import os
import gensim.downloader as api
from gensim.models import KeyedVectors

fastText_pretrained_path = "./fastText_models/fasttext-wiki-news-subwords-300.vec"

if not os.path.exists(fastText_pretrained_path):
    print(f"Downloading FastText model (wiki-news-subwords-300)...")
    fastText_embedding_vectors_pretrained_gensim = api.load("fasttext-wiki-news-subwords-300")
    
    # Save as text (.vec) for easy reuse
    os.makedirs(os.path.dirname(fastText_pretrained_path), exist_ok=True)
    fastText_embedding_vectors_pretrained_gensim.save_word2vec_format(fastText_pretrained_path)
    print(f"Model saved to: {fastText_pretrained_path}")
else:
    print("Loading FastText model from local file...")
    fastText_embedding_vectors_pretrained_gensim = KeyedVectors.load_word2vec_format(fastText_pretrained_path)
    print("Model loaded successfully!")


Loading FastText model from local file...
Model loaded successfully!


In [48]:
def sentence_vec_pretrained(tokens: list, model) -> np.ndarray:
    # collect only tokens that exist in model's vocab
    vecs = [model[w] for w in tokens if w in model]

    # average their vectors (mean) or return a zero-vector if no known word
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size, dtype=np.float32)

fastText_gensim_pretrained_x_train_vectors = np.array([sentence_vec_pretrained(tokens, fastText_embedding_vectors_pretrained_gensim) for tokens in fastText_x_train_sentences])

#### Tokenize The Text

In [49]:
# Reuse the GloVe tokenizer since the training data is the same
fastText_tokenizer = gloVe_tokenizer

# Tokenize the text
fastText_x_train["tokenized_text"] = fastText_tokenizer.texts_to_sequences(fastText_x_train["clean_text"])
fastText_x_test["tokenized_text"] = fastText_tokenizer.texts_to_sequences(fastText_x_test["clean_text"])
fastText_x_val["tokenized_text"] = fastText_tokenizer.texts_to_sequences(fastText_x_val["clean_text"])

# add padding to uniformize the length of the tokenized_text
max_num_words = fastText_x_train["clean_text"].str.len().max()

fastText_x_train_padded = pad_sequences(
    fastText_x_train["tokenized_text"], 
    maxlen=max_num_words, 
    padding='post' # pad zeros at the end (or "pre")
)
fastText_x_test_padded = pad_sequences(
    fastText_x_test["tokenized_text"], 
    maxlen=max_num_words, 
    padding='post' # pad zeros at the end (or "pre")
)
fastText_x_val_padded = pad_sequences(
    fastText_x_val["tokenized_text"], 
    maxlen=max_num_words, 
    padding='post' # pad zeros at the end (or "pre")
)

#### FastText Embedding Matrix

In [50]:
fastText_embedding_vectors_pretrained_gensim.vector_size

300

In [51]:
vocab_size = len(fastText_tokenizer.word_index) + 1  # same vocab size as embedding matrix
embedding_dim = fastText_embedding_vectors_pretrained_gensim.vector_size

def fastText_embedding_for_vocab(tokenizer: keras.src.legacy.preprocessing.text.Tokenizer, fastText_embedding_vectors):
    embedding_matrix = np.zeros((vocab_size, fastText_embedding_vectors.vector_size)) # initialize empty matrix
    for word, i in tokenizer.word_index.items(): #loop trough the vocabulary built by the tokenizer
        if word in fastText_embedding_vectors.key_to_index: #check if the words are in the embedding vectors
            embedding_matrix[i] = fastText_embedding_vectors[word] # replace the zeros with values from the vectors
    
    return embedding_matrix

fastText_embedding_matrix_pretrained = fastText_embedding_for_vocab(fastText_tokenizer, fastText_embedding_vectors_pretrained_gensim)

#### Passing the Data to a Model

we will use a simple LSTM for demonstration

##### Tensorflow LSTM

In [52]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout 

vocab_size = len(gloVe_tokenizer.word_index) + 1  # same vocab size as embedding matrix
embedding_dim = fastText_embedding_vectors_pretrained_gensim.vector_size
max_len = 300 # same as the length of GloVe model previously 

TF_GloVe_model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[fastText_embedding_matrix_pretrained],  # load your pretrained FastText weights here
        input_length=max_len,
        trainable=False  # freeze the GloVe weights, or set True to fine-tune
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # or softmax if you have multiple classes
])

TF_GloVe_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
TF_GloVe_model.summary()



In [53]:
TF_GloVe_model.fit(fastText_x_train_padded, y_train, epochs=2)

Epoch 1/2
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - accuracy: 0.0509 - loss: -1381.4365
Epoch 2/2
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - accuracy: 0.0502 - loss: -17187.0664


<keras.src.callbacks.history.History at 0x71bce1370d90>

##### Pytorch Model

In [54]:
import torch
import torch.nn as nn

# Create a model function
class define_fastText_model(nn.Module):
    def __init__(self, embedding_matrix : np.ndarray):
        super().__init__()
        vocab_size, embedding_dim = embedding_matrix.shape

        # make a tensor for the embedding matrix
        torch_fastText_embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

        self.embedding = nn.Embedding.from_pretrained(torch_fastText_embedding_matrix, 
                                                        freeze=True #to preserve the fastText
                                                        )

        self.hidden_size = 128 # lets have 128 lstm blocks
        self.num_layers = 3 # 3 layers for now since training is on cpu
        self.lstm = nn.LSTM(input_size=embedding_dim, # since each word would be inputted one by one and embedded into a vector from the fastText embedding, the input size would be the the same as the length of the embedding (e.g. glove_100d would have the size of 100)
                            hidden_size=self.hidden_size,
                            num_layers=self.num_layers, 
                            #dropout = 0.5 #OPTIONAL and since this is a small model this wont be use
                            batch_first=True # ensures the input is [batch, sequence legngth, embed_dim]
                            )
        
        # Add a dense layer for output (e.g., binary classification)
        self.dense = nn.Linear(self.hidden_size, 21)
        self.softmax = nn.Softmax(dim=1) # optional, if doing binary classification

    def forward(self, input_tensor): # Forward pass
            # 1️⃣ Convert tokenized_sentence [1,2,3,...,0] → embeddings
            embeds = self.embedding(input_tensor)
            batch_size = input_tensor.size(0)

            # Initialize fresh clear memory every batch
            h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
            c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)

            # LSTM Forward pass
            out, _ = self.lstm(embeds, (h0, c0)) # out: [batch_size, seq_len, hidden_size]
            out = self.dense(out[:, -1, :]) # take the last time step and pass it to the dense layer
            self.softmax = nn.Softmax(dim=1) # optional, if doing binary classification
            return out


In [55]:
# Convert to tensor
X_train_tensor = torch.tensor(fastText_x_train_padded, dtype=torch.long)  # tokenized sequences
y_train_tensor = torch.tensor(y_train.to_list(), dtype=torch.long)  # labels

X_test_tensor = torch.tensor(fastText_x_test_padded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.to_list(), dtype=torch.long)

X_val_tensor = torch.tensor(fastText_x_val_padded, dtype=torch.long)
y_val_tensor = torch.tensor(y_val.to_list(), dtype=torch.long)

# Make Data Loader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)


In [56]:
fastText_model = define_fastText_model(embedding_matrix=fastText_embedding_matrix_pretrained).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(fastText_model.parameters(), lr=1e-3)

# Training Loop
num_epochs = 2

for epoch in tqdm(range(num_epochs), desc="Training pytorch GloVe model"):
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = fastText_model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")



Training pytorch GloVe model:  50%|█████     | 1/2 [00:09<00:09,  9.31s/it]

Epoch 1/2, Loss: 2.6360


Training pytorch GloVe model: 100%|██████████| 2/2 [00:19<00:00,  9.54s/it]

Epoch 2/2, Loss: 2.6185





In [59]:
with torch.no_grad():
    correct, total = 0, 0
    for batch_x, batch_y in train_loader:
        outputs = fastText_model(batch_x.to(device))
        _, preds = torch.max(outputs, 1)
        correct += (preds.cpu() == batch_y).sum().item()
        total += batch_y.size(0)
    print("Train Accuracy:", correct / total)

Train Accuracy: 0.20863309352517986
