<a href="https://colab.research.google.com/github/13rinda/crimsoneducation_nlp/blob/main/NLP_Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [109]:
# ---------------------------------------------------------------
# NOTE: Since we are manually uploading the dataset to Google Colab,
# we do NOT need to download it from Kaggle using kagglehub.
#
# Instead, we will:
# 1. Upload the dataset file from our computer.
# 2. Read it directly in Colab using pandas.
#
# This is simpler for beginners and avoids installing extra libraries.
# ---------------------------------------------------------------

from google.colab import files
import pandas as pd

# Upload the dataset CSV file from your computer
uploaded = files.upload()  # This will show an "Upload" button

# Read the uploaded CSV file into a pandas DataFrame
# Replace 'emotions.csv' with the name of the file you uploaded
data = pd.read_csv('emotions.csv')

# Preview the first few rows of the dataset
print(data.head())


KeyboardInterrupt: 

In [110]:
# ---------------------------------------------------------------
# NOTE: This cell shows some setup and useful tips when working
# in a Kaggle Notebook environment.
#
# 1. Importing important libraries:
#    - numpy: used for math operations and working with numbers in arrays
#    - pandas: used to read, write, and process data, like CSV files
# 2. Listing all files in the input directory, so we know what data is available
# 3. Notes about where we can save our files in the Kaggle environment
# ---------------------------------------------------------------

import numpy as np  # Library for math operations and arrays
import pandas as pd # Library for working with datasets (CSV, Excel, etc.)

import os  # Library to interact with the file system (folders and files)

# Walk through all folders and files in the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Print the full path of each file so we know where the data is
        print(os.path.join(dirname, filename))

# ---------------- Beginner Notes ----------------
# - In Kaggle, input files (datasets) are read-only and usually stored under /kaggle/input/
# - You can write temporary files under /kaggle/temp/ (will be deleted after session ends)
# - You can write permanent output files under /kaggle/working/ (up to 20GB)
# - On Google Colab, you don’t have /kaggle/input/, so you’ll need to upload files manually


/kaggle/input/emotions-dataset-for-nlp/val.txt
/kaggle/input/emotions-dataset-for-nlp/test.txt
/kaggle/input/emotions-dataset-for-nlp/train.txt


In [5]:
# ---------------------------------------------------------------
# This cell helps make the notebook easier to use and less messy.
# ---------------------------------------------------------------

import warnings  # Library to control warning messages

# Ignore warning messages so they don't clutter the output
warnings.filterwarnings('ignore')

# This line improves the auto-completion feature in notebooks
# %config is a special "magic command" in Jupyter/Kaggle/Colab notebooks
# Completer.use_jedi = False can help fix the tab-completion if it stops working
# Tab-completion: type part of a function or variable name and press Tab to see suggestions
%config Completer.use_jedi = False


In [6]:
# ---------------------------------------------------------------
# STEP: Importing the dataset files into our notebook
# ---------------------------------------------------------------

import pandas as pd  # pandas library is used for working with tables of data

# ----------------- Training Data -----------------
# Read the training dataset from a CSV (text) file
# - '/content/sample_data/train.txt' is the path to the uploaded file
# - header=None means the file has no header row, so pandas won’t treat the first row as column names
# - sep=';' means the values in the file are separated by a semicolon
# - names=['Input','Sentiment'] assigns column names to the data
# - encoding='utf-8' ensures characters like emojis or special letters are read correctly
df_train = pd.read_csv('/content/sample_data/train.txt',
                       header=None,
                       sep=';',
                       names=['Input','Sentiment'],
                       encoding='utf-8')

# ----------------- Test Data -----------------
df_test = pd.read_csv('/content/sample_data/test.txt',
                      header=None,
                      sep=';',
                      names=['Input','Sentiment'],
                      encoding='utf-8')

# ----------------- Validation Data -----------------
df_val = pd.read_csv('/content/sample_data/val.txt',
                     header=None,
                     sep=';',
                     names=['Input','Sentiment'],
                     encoding='utf-8')

# ----------------- Quick Check -----------------
# This is optional but useful for beginners to see the first few rows of the training data
print(df_train.head())


In [7]:
# ---------------------------------------------------------------
# STEP: Combine all datasets into one big dataset
# ---------------------------------------------------------------

# pd.concat() merges multiple DataFrames (tables) into one
# - [df_train, df_test, df_val] is the list of DataFrames we want to combine
# - axis=0 means we are stacking them **vertically** (rows are added one after another)
df_full = pd.concat([df_train, df_test, df_val], axis=0)

# Display the combined dataset
df_full


Unnamed: 0,Input,Sentiment
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
1995,im having ssa examination tomorrow in the morn...,sadness
1996,i constantly worry about their fight against n...,joy
1997,i feel its important to share this info for th...,joy
1998,i truly feel that if you are passionate enough...,joy


In [8]:
# ---------------------------------------------------------------
# STEP: Install the "text_hammer" library
# ---------------------------------------------------------------

# In Python, some tools or libraries are not included by default.
# We use pip to install them. The "!" at the start tells the notebook
# to run a shell command instead of Python code.

!pip install text_hammer

# ----------------- Key Points for Students -----------------
# 1. text_hammer is a library that helps clean and preprocess text data.
# 2. We only need to install it **once per notebook session**.
# 3. After installing, we can import and use it in Python like any other library:
#    import text_hammer as th




In [9]:
# ---------------------------------------------------------------
# STEP: Import the text_hammer library for text preprocessing
# ---------------------------------------------------------------

import text_hammer as th  # "th" is a short nickname so we can type less

# ----------------- Key Points for Students -----------------
# 1. text_hammer is a library that helps clean and process text for NLP.
#    For example, it can remove emojis, punctuation, extra spaces, etc.
# 2. By importing it as "th", we can use its functions easily:
#       cleaned_text = th.normalize_text("Some text here")
# 3. Make sure you have already run the previous cell:
#       !pip install text_hammer


In [10]:
# ---------------------------------------------------------------
# STEP: Preprocess text data to make it ready for NLP models
# ---------------------------------------------------------------

# %%time is a Jupyter/Colab magic command that measures how long this cell takes to run
%%time

# tqdm_notebook allows us to see a progress bar while applying functions to our DataFrame
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()  # Integrates tqdm with pandas so we get progress bars

# Define a function to clean and preprocess text
def text_preprocessing(df, col_name):
    """
    df       : pandas DataFrame containing the data
    col_name : name of the column that contains text to preprocess
    """
    column = col_name

    # ---------------- Step 1: Convert text to lowercase ----------------
    df[column] = df[column].progress_apply(lambda x: str(x).lower())
    # Makes all text lowercase so 'Happy' and 'happy' are treated the same

    # ---------------- Step 2: Expand contractions ----------------
    df[column] = df[column].progress_apply(lambda x: th.cont_exp(x))
    # Example: "you're" -> "you are", "i'm" -> "i am"

    # ---------------- Step 3: Remove emails ----------------
    df[column] = df[column].progress_apply(lambda x: th.remove_emails(x))
    # Removes any email addresses from the text

    # ---------------- Step 4: Remove HTML tags ----------------
    df[column] = df[column].progress_apply(lambda x: th.remove_html_tags(x))
    # Removes text like <p> or <br> that comes from web scraping

    # ---------------- Step 5: Remove special characters ----------------
    df[column] = df[column].progress_apply(lambda x: th.remove_special_chars(x))
    # Removes punctuation, symbols, emojis, etc.

    # ---------------- Step 6: Remove accented characters ----------------
    df[column] = df[column].progress_apply(lambda x: th.remove_accented_chars(x))
    # Converts letters like "é" to "e"

    # ---------------- Optional steps (commented out) ----------------
    # Remove stopwords (common words like 'the', 'and', 'is')
    # df[column] = df[column].progress_apply(lambda x: ps.remove_stopwords(x))

    # Convert words to their root form (lemmatization)
    # df[column] = df[column].progress_apply(lambda x: th.make_base(x)) # ran -> run

    return df  # Returns the cleaned DataFrame


CPU times: user 465 µs, sys: 0 ns, total: 465 µs
Wall time: 440 µs


In [11]:
# ---------------------------------------------------------------
# STEP: Apply the text preprocessing function to our full dataset
# ---------------------------------------------------------------

# df_full is the combined dataset of train, test, and validation data
# 'Input' is the column that contains the text we want to clean

# text_preprocessing() will:
# - convert text to lowercase
# - expand contractions ("you're" → "you are")
# - remove emails and HTML tags
# - remove special characters and accented letters
# - (optionally, remove stopwords or lemmatize)

df_cleaned = text_preprocessing(df_full, 'Input')

# ---------------- Quick Check ----------------
# Let's look at the first few rows to see the cleaned text
print(df_cleaned.head())


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

In [111]:
# ---------------------------------------------------------------
# STEP: Make a separate copy of the cleaned dataset
# ---------------------------------------------------------------

# df_cleaned = df_cleaned.copy() creates a **new independent copy** of the DataFrame
# Why we do this:
# 1. Prevents accidental changes to the original df_cleaned
# 2. Ensures that operations on this copy won't affect other variables
# 3. Good practice when working with large datasets or multiple versions

df_cleaned = df_cleaned.copy()


In [13]:
# ---------------------------------------------------------------
# STEP: Count how many words are in each text input
# ---------------------------------------------------------------

# We want a new column called 'num_words' that tells us how many words are in each text.

# df_cleaned.Input.apply(...) means: "for each text in the 'Input' column, do something"

# lambda x: ... is a **shortcut function**:
# - "x" represents one text from the 'Input' column
# - x.split() breaks the text into a list of words (splits at each space)
# - len(...) counts how many items are in the list (i.e., number of words)

df_cleaned['num_words'] = df_cleaned.Input.apply(lambda x: len(x.split()))

# Example:
# If x = "I love Python"
# x.split() → ["I", "love", "Python"]
# len(["I", "love", "Python"]) → 3
# So num_words = 3

# ---------------- Quick Check ----------------
print(df_cleaned[['Input', 'num_words']].head())


In [14]:
# ---------------------------------------------------------------
# STEP: Convert the 'Sentiment' column into a category (so we can make numbers)
# ---------------------------------------------------------------

# Why do we need numbers for modeling?
# - Computers do not understand words like "happy", "sad", or "angry".
# - Machine learning models can only work with **numbers**.
# - So we need to convert each label into a number:
#       "happy" -> 0
#       "sad"   -> 1
#       "angry" -> 2
# - This process is called **encoding**.

# How we do it in pandas:
# - By changing the data type to 'category', pandas knows these are categories.
# - Then we can easily turn them into numbers using .cat.codes later.

df_cleaned['Sentiment'] = df_cleaned.Sentiment.astype('category')

# ---------------- Quick Check ----------------
print(df_cleaned[['Sentiment']].head())  # Shows the first few sentiment labels
print(df_cleaned.dtypes)                # Confirms that Sentiment is now a 'category'


In [15]:
# ---------------------------------------------------------------
# STEP: Look at the 'Sentiment' column
# ---------------------------------------------------------------

# df_cleaned.Sentiment shows all the values in the 'Sentiment' column
# Right now, after converting to 'category', each value is still a label like "happy", "sad", "angry"
# But pandas knows these are **categories**, which makes it easy to convert them to numbers later

print(df_cleaned.Sentiment)

# ---------------- Key Points for Students ----------------
# - You can see all the sentiment labels in the dataset.
# - Even though they look like text, pandas treats them as categories internally.
# - Next, we can convert these categories into numeric codes using:
#       df_cleaned.Sentiment.cat.codes


Unnamed: 0,Sentiment
0,sadness
1,sadness
2,anger
3,love
4,anger
...,...
1995,sadness
1996,joy
1997,joy
1998,joy


In [16]:
# ---------------------------------------------------------------
# STEP: Convert sentiment labels into numbers for modeling
# ---------------------------------------------------------------

# df_cleaned.Sentiment.cat.codes converts the categorical labels into numeric codes
# Why we do this:
# - Machine learning models cannot understand text like "happy" or "sad"
# - They only understand numbers
# - So we assign each category a number:
#       "happy" -> 0
#       "sad"   -> 1
#       "angry" -> 2

# Example:
# If your Sentiment column had: ["happy", "sad", "angry", "happy"]
# Then cat.codes would give: [0, 1, 2, 0]

df_cleaned['Sentiment_codes'] = df_cleaned.Sentiment.cat.codes

# ---------------- Quick Check ----------------
print(df_cleaned[['Sentiment', 'Sentiment_codes']].head())


Unnamed: 0,0
0,4
1,4
2,0
3,3
4,0
...,...
1995,4
1996,2
1997,2
1998,2


In [17]:
# ---------------------------------------------------------------
# STEP: Create a dictionary to assign numbers to each sentiment
# ---------------------------------------------------------------

# Why do we use a dictionary here?
# - Computers cannot understand words like "anger" or "joy".
# - Machine learning models need numbers as labels.
# - A dictionary is an easy way to **map each word to a number**.

# How it works:
# - The left side (key) is the word: 'anger', 'fear', 'joy', etc.
# - The right side (value) is the number we assign to it: 0, 1, 2, etc.
# - Later we can replace each text label in the dataset with its number using this dictionary

encoded_dict = {
    'anger': 0,
    'fear': 1,
    'joy': 2,
    'love': 3,
    'sadness': 4,
    'surprise': 5
}

# Example:
# If a row in our dataset has Sentiment = 'joy', we can convert it to:
# encoded_dict['joy'] -> 2

# ---------------- Key Points for Students ----------------
# - A dictionary is like a **look-up table**: you give it a word, it returns a number.
# - This ensures that the same word always gets the same number.
# - It's especially useful if we want to manually control the order or number of labels.


In [18]:
# ---------------------------------------------------------------
# STEP: Turn the Sentiment words into numbers
# ---------------------------------------------------------------

# Right now, the Sentiment column has words like:
# "anger", "joy", "sadness", etc.
# Computers cannot understand these words directly.

# .cat.codes changes each word into a number automatically:
# For example:
# "anger" -> 0
# "fear"  -> 1
# "joy"   -> 2
# "love"  -> 3
# "sadness" -> 4
# "surprise" -> 5

df_cleaned['Sentiment'] = df_cleaned.Sentiment.cat.codes

# Check the result
# Now you should see numbers instead of words in the Sentiment column
df_cleaned.Sentiment


Unnamed: 0,Sentiment
0,4
1,4
2,0
3,3
4,0
...,...
1995,4
1996,2
1997,2
1998,2


In [19]:
# ---------------------------------------------------------------
# STEP: Find the longest text in terms of number of words
# ---------------------------------------------------------------

# df_cleaned.num_words.max() looks at the 'num_words' column
# and finds the **largest number**. This tells us the text that has
# the most words in the dataset.

# Example:
# If 'num_words' column has: [3, 5, 2, 7]
# df_cleaned.num_words.max() → 7

max_words = df_cleaned.num_words.max()
print("The longest text has", max_words, "words.")

# ---------------- Why do we need this? ----------------
# When training NLP models, we often need to decide how long the input can be.
# - Some models require a maximum number of words or tokens per text.
# - Knowing the longest text helps us **set this limit** so that:
#     1. We don’t cut off important information from longer texts
#     2. We don’t waste memory on extremely long texts
# - This is called setting the "max sequence length" in NLP models.


66

In [20]:
# ---------------------------------------------------------------
# STEP: Split the dataset into two parts: training and testing
# ---------------------------------------------------------------

# Why do we split the data?
# - We need some data for the model to **learn** (training set)
# - We need some data to **check if the model learned well** (testing set)
# - This way, we can see if the model works on data it hasn’t seen before

from sklearn.model_selection import train_test_split

# Split the data
# - test_size=0.3 → 30% of data goes to testing, 70% to training
# - random_state=42 → ensures the split is the same every time we run
# - stratify=df_cleaned.Sentiment → keeps all emotions in both sets
data_train, data_test = train_test_split(
    df_cleaned,
    test_size=0.3,
    random_state=42,
    stratify=df_cleaned.Sentiment
)

# Quick check
print("Training set size:", len(data_train))
print("Testing set size:", len(data_test))


In [112]:
# ---------------------------------------------------------------
# STEP: Why we check the shape of a dataset
# ---------------------------------------------------------------

# .shape tells us how many rows and columns we have in a dataset
# Rows = how many examples (text samples) we have
# Columns = how many pieces of information about each example

# Why check it?
# - To make sure we have the right amount of data
# - To see if the split worked correctly (training vs testing)
# - To catch mistakes early, like missing data or extra columns

print(data_train.shape)


(14000, 3)


In [113]:
data_test.shape

(6000, 3)

In [None]:
# ---------------------------------------------------------------
# STEP: Check if training and testing sizes are okay
# ---------------------------------------------------------------

# Training data = 14,000 examples
# Testing data  = 6,000 examples

# Is this good?
# - Yes, it’s a good split
# - Usually we give most of the data to training (so the model can learn well)
# - We keep some for testing to check if the model really learned (not too little, not too much)

# In this case:
# - Training = 70% (14,000) → model has enough to learn patterns
# - Testing  = 30% (6,000) → enough to check if it learned correctly


In [23]:
# We use to_categorical to change the numbers into a format the computer can understand
# Example: 0 becomes [1,0,0], 1 becomes [0,1,0], 2 becomes [0,0,1]

from tensorflow.keras.utils import to_categorical

# this is called “one-hot vectors”
# Each number gets its own box with 1 in the correct spot and 0s everywhere else
# Example for 3 classes:
# 0 → [1,0,0]
# 1 → [0,1,0]
# 2 → [0,0,1]

# We use one-hot vectors because computers like numbers in separate boxes
# Example: instead of just giving 0, 1, or 2, we give [1,0,0], [0,1,0], [0,0,1]
# This makes it clear which class each example belongs to

In [24]:
to_categorical(data_train.Sentiment)  # change the feeling labels into boxes of 0s and 1s so the computer understands them


array([[0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [27]:
from transformers import AutoTokenizer, TFBertModel

# Load a tool (tokenizer) that changes words into numbers BERT can read
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-mini")

# Load the small BERT model (bert-mini) so we can use it for our text
bert = TFBertModel.from_pretrained("prajjwal1/bert-mini", from_pt=True)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'bert.embeddings.position_ids', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [28]:
tokenizer.save_pretrained('bert-tokenizer')
bert.save_pretrained('bert-model')

# This saves the tokenizer and model into folders on your computer
# Why? So we don’t have to download and train them again later
# Next time, we can just "load" them quickly from these saved files


In [29]:
import shutil

shutil.make_archive('bert-tokenizer', 'zip', 'bert-tokenizer')

# This makes a ZIP file of the "bert-tokenizer" folder
# Why? To keep everything in one file so it’s easy to move, share, or store



'/content/bert-tokenizer.zip'

In [30]:
shutil.make_archive('bert-model','zip','bert-model')

# This makes a ZIP file of the "bert-model" folder
# So instead of many files, we now have one .zip file to save or share
# Think of it as packing the trained BERT model into a suitcase so you can take it anywhere.

'/content/bert-model.zip'

In [31]:
from transformers import AutoTokenizer, TFBertModel

# Load a tool (tokenizer) that changes words into numbers the BERT model can understand
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-mini")

# Load the small BERT model (bert-mini) so we can use it for our text
# from_pt=True means it was trained in PyTorch but we want to use it in TensorFlow
bert_model = TFBertModel.from_pretrained("prajjwal1/bert-mini", from_pt=True)



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'bert.embeddings.position_ids', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [32]:
tokenizer('hello this me brinda mailvaganam')

{'input_ids': [101, 7592, 2023, 2033, 11113, 24158, 5369, 2243, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [33]:
# ---------------------------------------------------------------
# STEP: Change sentences into numbers so BERT can understand
# ---------------------------------------------------------------

# Training sentences → numbers
x_train = tokenizer(
    text=data_train.Input.tolist(),  # all training sentences
    add_special_tokens=True,         # special markers BERT needs
    max_length=70,                   # make all sentences 70 words long
    truncation=True,                 # cut sentences longer than 70
    padding=True,                    # fill shorter sentences with 0s
    return_tensors='tf',             # make it usable for TensorFlow
    return_token_type_ids=False,     # not needed here
    return_attention_mask=True,      # tells BERT which numbers are real words
    verbose=True                     # show info while converting
)

# Testing sentences → numbers (same way)
x_test = tokenizer(
    text=data_test.Input.tolist(),
    add_special_tokens=True,
    max_length=70,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


In [34]:
x_test['input_ids']  # These are the numbers that represent each word in your test sentences


<tf.Tensor: shape=(6000, 70), dtype=int32, numpy=
array([[ 101, 1045, 2360, ...,    0,    0,    0],
       [ 101, 1045, 2514, ...,    0,    0,    0],
       [ 101, 1045, 2939, ...,    0,    0,    0],
       ...,
       [ 101, 1045, 2031, ...,    0,    0,    0],
       [ 101, 1045, 2071, ...,    0,    0,    0],
       [ 101, 5665, 5610, ...,    0,    0,    0]], dtype=int32)>

In [35]:
from tensorflow.keras.optimizers import Adam            # Helps the model learn by adjusting numbers slowly
from tensorflow.keras.callbacks import EarlyStopping    # Stops training if the model stops improving
from tensorflow.keras.initializers import TruncatedNormal  # Helps start the model with good random numbers
from tensorflow.keras.losses import CategoricalCrossentropy  # Measures how wrong the model is for multi-class problems
from tensorflow.keras.metrics import CategoricalAccuracy     # Tells us how often the model predicts correctly
from tensorflow.keras.utils import to_categorical        # Turns labels into 0/1 boxes (one-hot vectors)


In [36]:
import tensorflow as tf

# Check if your computer can use a GPU (fast processor for training models)
tf.config.experimental.list_physical_devices('GPU')

# Output will show a GPU if your computer has one, otherwise it will be empty


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [40]:
from tensorflow.keras.layers import Lambda

# ---------------------------------------------------------------
# STEP: Turn the token numbers into BERT “embeddings” (understanding)
# ---------------------------------------------------------------

def bert_encoder(inputs):
    input_ids, attention_mask = inputs              # input_ids = numbers for words, attention_mask = tells BERT which are real words
    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
    return outputs.last_hidden_state               # Get BERT’s understanding of each word

# Lambda layer applies our BERT encoder to the inputs
# embeddings will have shape: (batch_size, max_len, 256)
# 256 = how many numbers BERT uses to represent each word
embeddings = Lambda(bert_encoder, output_shape=(max_len, 256))([input_ids, attention_mask])


In [41]:
from tensorflow.keras.layers import GlobalMaxPool1D, Dense, Dropout

# Take the BERT embeddings (word meanings) and simplify them into one vector per sentence
out = GlobalMaxPool1D()(embeddings)

# Make a small “brain” to learn patterns
out = Dense(128, activation='relu')(out)   # 128 neurons, relu = simple on/off switch
out = Dropout(0.1)(out)                    # Randomly ignore 10% neurons to avoid overfitting
out = Dense(32, activation='relu')(out)    # Another smaller layer

# Final layer predicts 6 emotions, sigmoid gives numbers between 0 and 1
y = Dense(6, activation='sigmoid', dtype='float32')(out)

# Combine everything into a model
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=y)



In [42]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy

# Optimizer = helps the model learn step by step
optimizer = Adam(
    learning_rate=5e-5,  # how big each step is
    epsilon=1e-08,       # tiny number to avoid dividing by zero
    decay=0.01,          # slowly reduce step size over time
    clipnorm=1.0         # limit size of steps to prevent big jumps
)

# Loss = tells the model how wrong it is
loss = CategoricalCrossentropy(from_logits=False)

# Metric = tells us how often the model guesses correctly
metric = CategoricalAccuracy(name='balanced_accuracy')

# Combine everything into the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric]
)


In [43]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy

# Optimizer = helps the model learn by adjusting its guesses
optimizer = Adam(
    learning_rate=5e-5,  # how big each learning step is
    epsilon=1e-08,       # tiny number to avoid errors in calculation
    decay=0.01,          # slowly reduce learning step over time
    clipnorm=1.0         # stop steps from being too big
)

# Loss = tells the model how wrong it is for training
loss = CategoricalCrossentropy(from_logits=False)

# Metric = tells us how often the model guesses correctly
metric = CategoricalAccuracy(name='balanced_accuracy')

# Put everything together to prepare the model for training
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric]
)


In [44]:
model.summary()

In [None]:
# Model summary explained simply:
# Layers = steps in the model:
#   input_ids & attention_mask → numbers from sentences
#   lambda_1 → BERT turns numbers into meaning vectors
#   GlobalMaxPooling1D → compress word info into one vector per sentence
#   Dense layers → small "brains" learning patterns
#   Dropout → prevents overfitting
#   dense_2 → predicts 6 emotions
# Output Shape = size of data at each step, e.g., (None, 70, 256) means 70 words each with 256 numbers, None = any number of sentences
# Param # = number of numbers the model will learn, e.g., 32,896 numbers in a Dense layer
# Trainable params = model can change these to learn
# Non-trainable params = fixed numbers
# Analogy = think of the model as a smart brain blueprint: layers = brain parts, params = connections that get smarter, output shapes = info size passing through


In [45]:
tf.config.experimental_run_functions_eagerly(True)
tf.config.run_functions_eagerly(True)

# Make TensorFlow run one step at a time so it’s easier to see what’s happening
# This is helpful when you are learning or debugging your code


Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


In [46]:
import tensorflow as tf

# Make TensorFlow run each step one by one (eager execution)
# This is helpful when learning or debugging your code
tf.config.run_functions_eagerly(True)


In [69]:
from tensorflow.keras.utils import to_categorical

# Convert labels (0,1,2,...) into one-hot format (e.g., 0 → [1,0,0], 1 → [0,1,0])
y_train = to_categorical(data_train.Sentiment)
y_test = to_categorical(data_test.Sentiment)

# Train the model
train_history = model.fit(
    x={
        'input_ids': x_train['input_ids'],         # Numbers for words in training sentences
        'attention_mask': x_train['attention_mask']  # Tells BERT which numbers are real words
    },
    y=y_train,                                     # The correct answers in one-hot format
    validation_data=(
        {
            'input_ids': x_test['input_ids'],      # Numbers for words in test sentences
            'attention_mask': x_test['attention_mask']
        },
        y_test                                     # Test labels in one-hot format
    ),
    batch_size=36,                                 # How many sentences to look at before updating the model
    epochs=1                                       # Number of times to go through all the training sentences
)



Epoch 1/10
[1m389/389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 324ms/step - balanced_accuracy: 0.3761 - loss: 1.5774 - val_balanced_accuracy: 0.3963 - val_loss: 1.5349
Epoch 2/10
[1m389/389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 322ms/step - balanced_accuracy: 0.4050 - loss: 1.5443 - val_balanced_accuracy: 0.4293 - val_loss: 1.5094
Epoch 3/10
[1m389/389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 319ms/step - balanced_accuracy: 0.4260 - loss: 1.5105 - val_balanced_accuracy: 0.4447 - val_loss: 1.4841
Epoch 4/10
[1m389/389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 322ms/step - balanced_accuracy: 0.4322 - loss: 1.4867 - val_balanced_accuracy: 0.4503 - val_loss: 1.4613
Epoch 5/10
[1m389/389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 321ms/step - balanced_accuracy: 0.4456 - loss: 1.4723 - val_balanced_accuracy: 0.4618 - val_loss: 1.4404
Epoch 6/10
[1m389/389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 3

In [70]:
# Save the model’s learned numbers (weights) to a file
# You can use these later to load the model without training again
model.save_weights('sentiment_weights.weights.h5')


In [71]:
# Use the model to make predictions on the test sentences
predicted_raw = model.predict({
    'input_ids': x_test['input_ids'],           # Numbers for words in test sentences
    'attention_mask': x_test['attention_mask']  # Tells BERT which numbers are real words
})

# predicted_raw will be numbers between 0 and 1 for each of the 6 emotions


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 104ms/step


In [72]:
predicted_raw[0]  # Look at the model’s guesses for the FIRST test sentence


array([0.5939938 , 0.41179305, 0.53244567, 0.33479246, 0.498105  ,
       0.2211624 ], dtype=float32)

In [73]:
# Pick the emotion with the highest score for each sentence
y_predicted = np.argmax(predicted_raw, axis=1)

# np.argmax finds the position of the biggest number in each list
# axis=1 means we look across each sentence’s 6 emotion scores


In [74]:
data_test.Sentiment  # Shows the actual emotions (labels) of the test sentences


Unnamed: 0,Sentiment
10486,0
1320,0
10848,2
195,4
2093,4
...,...
1241,4
8642,1
1775,2
5733,0


In [75]:
from sklearn.metrics import classification_report  # Import a tool to check how well the model did

# This will give detailed results like:
# - precision → how many predicted positives were correct
# - recall → how many actual positives we correctly predicted
# - f1-score → balance of precision and recall
# - support → number of sentences for each emotion



In [76]:
# Compare the model’s guesses with the correct answers and print a report
print(classification_report(data_test.Sentiment, y_predicted))

# This will show a “report card” for the model:
# Each row = one emotion (0 to 5)
# precision → how many times the model’s guess for this emotion was correct
# recall → how many of the actual sentences of this emotion the model found
# f1-score → a mix of precision and recall (overall “quality” for that emotion)
# support → how many sentences really belong to that emotion



              precision    recall  f1-score   support

           0       0.38      0.05      0.09       813
           1       0.46      0.10      0.17       712
           2       0.55      0.70      0.61      2028
           3       0.25      0.00      0.01       492
           4       0.43      0.77      0.55      1739
           5       0.00      0.00      0.00       216

    accuracy                           0.48      6000
   macro avg       0.34      0.27      0.24      6000
weighted avg       0.44      0.48      0.40      6000



In [92]:
# Ask the user to type a sentence
texts = input("Type your sentence: ")

# Turn the sentence into numbers BERT can understand
x_val = tokenizer(
    text=texts,
    add_special_tokens=True,   # Add special markers BERT needs
    max_length=70,             # Limit to 70 words
    truncation=True,           # Cut off if too long
    padding='max_length',      # Add extra spaces if too short
    return_tensors='tf',       # Give output in TensorFlow format
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

# Ask the model to guess the emotion of the input sentence
validation = model.predict({
    'input_ids': x_val['input_ids'],
    'attention_mask': x_val['attention_mask']
}) * 100  # Multiply by 100 to show percentages

validation  # Shows how confident the model is for each of the 6 emotions


input the texti am scared of the dark
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step


array([[47.90278 , 59.39865 , 39.243797, 34.456135, 69.73165 , 17.301186]],
      dtype=float32)

In [93]:
# Show the model’s confidence for each emotion
for key, value in zip(encoded_dict.keys(), validation[0]):
    print(key, value)  # key = emotion name, value = model’s confidence percentage


anger 47.90278
fear 59.39865
joy 39.243797
love 34.456135
sadness 69.73165
surprise 17.301186


In [96]:
# Find the emotion with the highest confidence
predicted_idx = np.argmax(validation[0])  # index of the biggest number

# Get the emotion name from the dictionary using that index
predicted_emotion = list(encoded_dict.keys())[predicted_idx]

# Show the final predicted emotion
print("Predicted emotion:", predicted_emotion)


Predicted emotion: sadness


In [97]:
################# EXERCISE

In [None]:
#Step 1: Prepare multiple inputs

#Instead of a single sentence, test multiple sentences at once:

In [98]:
# Sample sentences to test
texts_list = [
    "I love pizza and ice cream!",
    "I am really scared of the dark.",
    "That movie made me angry.",
    "I feel so sad about the news.",
    "What a wonderful surprise party!",
    "I hate getting stuck in traffic."
]

In [99]:
# Tokenize the input texts BEFORE running the model
# We need to convert raw text into numbers that BERT understands (tokens)
x_val = tokenizer(
    text=texts_list,                    # The text(s) we want to classify; can be a single sentence or a list
    add_special_tokens=True,             # Add [CLS] at start and [SEP] at end, required by BERT for classification tasks
    max_length=70,                       # Ensure all sequences have a fixed length for batch processing
    truncation=True,                     # Cut off tokens beyond max_length; prevents errors from too-long sentences
    padding='max_length',                # Pad shorter sequences to max_length; keeps input shapes consistent
    return_tensors='tf',                 # Convert token IDs to TensorFlow tensors; needed for model input
    return_token_type_ids=False,         # Not needed for single-sentence tasks; BERT can differentiate sentences if True
    return_attention_mask=True,          # Create mask to tell BERT which tokens are real and which are padding
    verbose=True                         # Optional: print tokenization progress for debugging
)


In [103]:
#Get Model Predictions

In [105]:
validation = model.predict({
    'input_ids': x_val['input_ids'],
    'attention_mask': x_val['attention_mask']
})

# Apply sigmoid to convert logits to probabilities
probs = 1 / (1 + np.exp(-validation))
probs_percent = probs * 100  # optional: convert to percentage


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step


In [106]:
emotions = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

for i, sentence in enumerate(texts_list):
    print(f"\nInput: {sentence}")
    for e, p in zip(emotions, probs_percent[i]):
        print(f"{e}: {p:.2f}%")
    predicted_idx = np.argmax(probs[i])
    print("Predicted emotion:", emotions[predicted_idx])



Input: I love pizza and ice cream!
anger: 56.63%
fear: 57.04%
joy: 66.18%
love: 59.41%
sadness: 64.27%
surprise: 55.14%
Predicted emotion: joy

Input: I am really scared of the dark.
anger: 60.11%
fear: 64.74%
joy: 61.33%
love: 56.34%
sadness: 66.28%
surprise: 54.44%
Predicted emotion: sadness

Input: That movie made me angry.
anger: 66.11%
fear: 61.58%
joy: 59.20%
love: 56.97%
sadness: 66.01%
surprise: 56.89%
Predicted emotion: anger

Input: I feel so sad about the news.
anger: 61.17%
fear: 60.24%
joy: 61.88%
love: 56.19%
sadness: 67.23%
surprise: 55.27%
Predicted emotion: sadness

Input: What a wonderful surprise party!
anger: 62.24%
fear: 59.58%
joy: 67.69%
love: 61.97%
sadness: 63.52%
surprise: 60.25%
Predicted emotion: joy

Input: I hate getting stuck in traffic.
anger: 65.47%
fear: 55.48%
joy: 58.09%
love: 53.17%
sadness: 69.26%
surprise: 53.85%
Predicted emotion: sadness
