In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import TFAutoModel,AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the dataset and keep only the two columns used by the rest of the notebook.
# Rows with a missing text or label are dropped up front: the text cleaning and
# the class count computed from df['label'] both assume real values, and a NaN
# label would otherwise be counted as an extra class.
# dropna/reset_index return a fresh frame, so later column assignments on `df`
# operate on an independent object rather than on a column subset of the raw frame.
df = (
    pd.read_csv("merged_data.csv")[["text", "label"]]
    .dropna(subset=["text", "label"])
    .reset_index(drop=True)
)

In [None]:
# Display the loaded frame (the notebook renders the last expression of a cell).
df

In [None]:
# Number of distinct label values; this sizes the one-hot targets and the
# softmax output. dropna=False keeps a NaN label (if any) counted as a value.
emotions_len = df["label"].nunique(dropna=False)
emotions_len


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Class balance: absolute count per label and its share of the whole dataset.
label_counts = df["label"].value_counts()
percentages = label_counts / label_counts.sum() * 100

# Summary table shown beside the chart (percentages rounded to one decimal).
table_data = {
    "Label": label_counts.index,
    "Count": label_counts.values,
    "Percentage": percentages.round(1),
}
table_df = pd.DataFrame(table_data)

# Pie chart of the label distribution.
fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(
    label_counts,
    labels=label_counts.index,
    autopct="%1.1f%%",
    colors=["skyblue", "lightcoral", "lightgreen", "gold"],
)
ax.set_title("Distribution of Labels")

# Attach the table to the right of the same axes and make it readable.
table = ax.table(
    cellText=table_df.values,
    colLabels=table_df.columns,
    cellLoc="center",
    loc="right",
)
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 1.5)

plt.show()


In [None]:
import re
import string
#Preprocess text
def preprocess_text(text):
    """Normalise one raw text sample before tokenisation.

    The value is lower-cased, every character that is not an ASCII letter or a
    space is deleted (punctuation and digits vanish without leaving a gap), and
    leading/trailing whitespace is trimmed.

    Args:
        text: The raw sample. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as an empty document; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string, possibly empty.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    lowered = str(text).lower()
    # A single whitelist pass replaces the former three steps: after lower-casing,
    # dropping non-alphanumerics, then punctuation, then digit runs is the same
    # as keeping only a-z and spaces (the punctuation pass was already a no-op).
    return re.sub(r"[^a-z ]", "", lowered).strip()

In [None]:
# Clean every sample in place; the raw text column is overwritten.
df["text"] = df["text"].apply(preprocess_text)


In [None]:
# Inspect the cleaned text column.
df["text"]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the labels as integer ids; the 'label' column is replaced by the ids.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label"])

# Original class value -> integer id, in the encoder's own class order.
label_mapping = dict(
    zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
)

# Print the mapping so the ids can be read back as the original labels.
print("Label Encoding Mapping:")
for label, index in label_mapping.items():
    print(f"{label}: {index}")


In [None]:
# Inspect the integer-encoded label column.
df["label"]

In [None]:
# Load the tokenizer that belongs to the 'roberta-base' checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
def tokenize_text(texts, max_length=128):
    """Tokenize a batch of texts into fixed-length model inputs.

    Uses the notebook-level ``tokenizer``. Every sample is padded or truncated
    to exactly ``max_length`` tokens.

    Args:
        texts: The samples to encode: a pandas Series / NumPy array (anything
            exposing ``.tolist()``), a list, tuple or other iterable of strings,
            or a single string (treated as a batch of one).
        max_length: Target sequence length passed to the tokenizer.

    Returns:
        The tokenizer's output for the batch; callers in this notebook read its
        ``"input_ids"`` and ``"attention_mask"`` entries.
    """
    if isinstance(texts, str):
        # list("abc") would split the string into characters, so wrap it instead.
        batch = [texts]
    elif hasattr(texts, "tolist"):
        batch = texts.tolist()
    else:
        batch = list(texts)
    return tokenizer(batch, padding="max_length", truncation=True, max_length=max_length)

In [None]:
# Tokenize the whole cleaned corpus once and pull out the two model inputs.
tokens = tokenize_text(df["text"])
input_ids = tokens["input_ids"]
attention_mask = tokens["attention_mask"]

In [None]:
# Sanity check: length of the first encoded sample (tokenize_text pads/truncates
# to its max_length argument).
len(input_ids[0])

In [None]:
# Train-test split (80/20, fixed seed for reproducibility).
# The token ids and attention masks are converted to int32 arrays before
# splitting so that each split is a 2-D NumPy array matching the int32 model
# inputs, instead of a nested Python list. The rows selected for each split are
# the same as before; only the container type changes.
X_train, X_test, attn_train, attn_test, y_train, y_test = train_test_split(
    np.asarray(input_ids, dtype=np.int32),
    np.asarray(attention_mask, dtype=np.int32),
    df["label"].values,
    test_size=0.2,
    random_state=42,
)

In [None]:
# One-hot encode the integer labels of both splits (emotions_len columns each),
# as required by the categorical cross-entropy loss the model is compiled with.
y_train, y_test = (
    tf.keras.utils.to_categorical(labels, num_classes=emotions_len)
    for labels in (y_train, y_test)
)

In [None]:
# Sanity check: shape of the one-hot encoded training labels.
y_train.shape

In [None]:
# Load the pretrained 'roberta-base' encoder via TFAutoModel. It is loaded once
# here and later wrapped by RoBERTaEmbedding inside build_model.
roberta_model = TFAutoModel.from_pretrained("roberta-base")

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer
from transformers import TFRobertaModel, RobertaTokenizer
import torch

class RoBERTaEmbedding(tf.keras.layers.Layer):
    """Keras layer that wraps a pre-loaded transformer and returns its token states.

    The layer is called with a dict holding ``"input_ids"`` and
    ``"attention_mask"`` and returns the wrapped model's ``last_hidden_state``.

    NOTE(review): whether the wrapped model's weights are tracked (and therefore
    fine-tuned) through this wrapper depends on the Keras version in use --
    confirm before relying on it for fine-tuning.

    Args:
        model: The already-loaded transformer model to wrap.
        **kwargs: Standard ``tf.keras.layers.Layer`` options (``name``,
            ``dtype``, ``trainable``, ...), forwarded unchanged.
    """

    def __init__(self, model, **kwargs):
        super().__init__(**kwargs)
        self.model = model
        # Width of the encoder output. Read from the model's config when it is
        # available so that backbones with another hidden size report the right
        # shape; 768 (the value previously hard-coded) is the fallback.
        self.hidden_size = getattr(getattr(model, "config", None), "hidden_size", 768)

    def call(self, inputs):
        """Run the wrapped model and return its per-token hidden states."""
        input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]
        output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return output.last_hidden_state

    def compute_output_shape(self, input_shape):
        """Return (batch_size, seq_length, hidden_size) for the given input shapes.

        ``input_shape`` is a dict keyed like the inputs; only the shape of
        ``"input_ids"`` is consulted.
        """
        ids_shape = input_shape["input_ids"]
        return (ids_shape[0], ids_shape[1], self.hidden_size)



In [None]:
from tensorflow.keras.layers import Dense,Input,LSTM

def build_model(emotions_len, max_length=128, roberta=None):
    """Build and compile the RoBERTa + LSTM emotion classifier.

    Architecture: token ids and attention mask -> RoBERTaEmbedding -> LSTM(128)
    -> Dense(64, relu) -> Dense(emotions_len, softmax). The model is compiled
    with categorical cross-entropy, Adam (learning rate 2e-5) and accuracy.

    Args:
        emotions_len: Number of output classes.
        max_length: Fixed sequence length of both inputs; must match the length
            the texts were tokenized to.
        roberta: Transformer backbone to wrap. Defaults to the notebook-level
            ``roberta_model`` when omitted.

    Returns:
        The compiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]``.
    """
    if roberta is None:
        roberta = roberta_model

    input_layer = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask_layer = Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    roberta_layer = RoBERTaEmbedding(roberta)
    roberta_output = roberta_layer({"input_ids": input_layer, "attention_mask": attention_mask_layer})

    # Inputs are padded to max_length, so without a mask the LSTM keeps stepping
    # over padding and its final state reflects pad positions rather than the
    # last real token. Feed the attention mask (1 = real token) as the LSTM mask
    # so padded timesteps are skipped.
    lstm_mask = tf.keras.layers.Lambda(
        lambda mask: tf.cast(mask, tf.bool), name="lstm_mask"
    )(attention_mask_layer)
    lstm_output = LSTM(128, return_sequences=False)(roberta_output, mask=lstm_mask)

    dense_output = Dense(64, activation='relu')(lstm_output)
    final_output = Dense(emotions_len, activation='softmax')(dense_output)

    model = tf.keras.Model(inputs=[input_layer, attention_mask_layer], outputs=final_output)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        metrics=['accuracy'],
    )
    return model

In [None]:
# Instantiate the classifier; max_length matches the tokenizer's default of 128.
model = build_model(emotions_len, max_length=128)


In [None]:
# Print the layer-by-layer overview of the compiled model.
model.summary()

In [None]:
# NOTE(review): training is currently disabled -- the fit call below is kept
# commented out, so this notebook builds the model but never trains it.
# # Train model
# model.fit(
#     [X_train, attn_train], y_train,
#     validation_data=([X_test, attn_test], y_test),
#     epochs=3,
#     verbose=1
# )
# NOTE(review): the bare `2` below looks like a leftover; its only effect is
# that the cell displays 2. Confirm and remove.
2

In [None]:
# NOTE(review): leftover smoke-test cell (only checks that the kernel responds);
# not part of the analysis.
print("hello")