In [None]:
# Silence every warning for the whole session to keep cell output compact.
# NOTE(review): this also hides deprecation and pandas copy warnings raised
# by later cells; consider narrowing the filter while developing.
import warnings 
warnings.filterwarnings('ignore')

In [None]:
import os
import numpy as np                 
import pandas as pd                
from scipy.signal import resample
from tensorflow.keras.utils import to_categorical 
from sklearn.model_selection import train_test_split

# Model
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.utils import to_categorical

In [None]:
# Collect the text and sentiment columns from every CSV in the dataset folder.
input_text = []
labels = []

folder_path = '/kaggle/input/text-sentiment-analysis/Text_Dataset' 

# os.listdir() returns entries in arbitrary order; sorting the names makes the
# row order (and therefore the seeded train/test split) reproducible.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)

        df = pd.read_csv(file_path)
        input_text.extend(df['content'].tolist())
        labels.extend(df['sentiment'].tolist())

# One combined frame: raw text in 'input', raw label string in 'sentiment'.
data = pd.DataFrame({'input': input_text, 'sentiment': labels})
data.head()

In [None]:
# Drop rows whose text is missing *before* casting to str: str(NaN) is the
# literal 'nan', which no later dropna() would recognise as missing, so the
# row would be kept and treated as a real sentence.
data = data.dropna(subset=['input']).copy()
# Remaining non-string values (e.g. numbers) are stringified so they can be split.
data['input'] = data['input'].map(lambda x: x if isinstance(x, str) else str(x))
# Whitespace-delimited word count per sample.
data['num_words'] = data['input'].str.split().str.len()

In [None]:
# Keep only the rows that have no missing value in any column.
data = data.dropna()

In [None]:
# Encode the sentiment strings as integer class ids.
# The label -> id mapping is read straight from the categorical's own category
# index, so it cannot drift out of step with the codes written to the column
# (no reliance on two separate unique() calls happening to line up).
sentiment_categorical = data['sentiment'].astype('category')
labels = list(sentiment_categorical.cat.categories)
data['sentiment'] = sentiment_categorical.cat.codes.astype('int')
# Code i belongs to categories[i] by construction.
labels_encoded = list(range(len(labels)))
sentiment_mapping = {key: int(value) for key, value in zip(labels, labels_encoded)}

In [None]:
# Display the label -> integer id mapping.
sentiment_mapping

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Reverse lookup (integer id -> label string) built from the forward mapping.
inverted_sentiment_mapping = dict(
    (code, name) for name, code in sentiment_mapping.items()
)

# Attach a human-readable label column next to the integer codes.
data['label_name'] = data['sentiment'].map(inverted_sentiment_mapping)

data.head()

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the number of samples per class, smallest first.
class_counts = data["label_name"].value_counts(ascending=True)
class_counts.plot.barh()
plt.title("Frequency of Classes")
plt.show()

In [None]:
# Word-count distribution per class, drawn as a box plot without outliers.
# .assign() builds the extra column on a fresh frame instead of writing into
# the result of dropna(), which pandas may flag with SettingWithCopyWarning.
data_cleaned = (
    data
    .dropna(subset=['input'])
    .assign(**{"Words Per Tweet": lambda frame: frame["input"].str.split().str.len()})
)
data_cleaned.boxplot("Words Per Tweet", by="label_name", grid=False, showfliers=False, color="black")
# Blank out the automatic super-title and x-axis label that boxplot(by=...) adds.
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
# 70/30 split, stratified on the encoded sentiment so both parts keep the same
# class proportions; the fixed seed makes the split repeatable.
data_train, data_test = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    stratify=data["sentiment"],
)

In [None]:
# (rows, columns) of the training split.
data_train.shape

In [None]:
# (rows, columns) of the test split.
data_test.shape

In [None]:
# Preview the first rows of the training split.
data_train.head()

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint the tokenizer is loaded from.
model_ckpt = "bert-base-cased"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
def tokenize(batch):
    """Tokenize the ``"input"`` field of ``batch`` with the notebook's tokenizer.

    Padding and truncation are both enabled; the tokenizer's output is
    returned unchanged.
    """
    texts = batch["input"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [None]:
# Quick sanity check: encode one sample sentence and inspect the output.
tokenizer('hello this me Abozaid')

In [None]:
#hide_input
# Table of the tokenizer's special tokens, ordered by token id.
# Dedicated variable names are used so the `data` DataFrame and the `df` frame
# built earlier are not overwritten by this purely illustrative cell.
tokens2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))
special_tokens_sorted = sorted(tokens2ids, key=lambda x : x[-1])
special_tokens_df = pd.DataFrame(special_tokens_sorted, columns=["Special Token", "Special Token ID"])
# Transposed so the tokens read left to right.
special_tokens_df.T

In [None]:
# Encode the training texts in one call; padding and truncation are both enabled.
x_train = tokenizer(
    text=data_train["input"].tolist(),
    padding=True,
    truncation=True,
)

In [None]:
# Encode the test texts in one call; padding and truncation are both enabled.
x_test = tokenizer(
    text=data_test["input"].tolist(),
    padding=True,
    truncation=True,
)

In [None]:
# Assemble the training table: raw text, integer label and tokenizer outputs.
text_train = data_train["input"].tolist()
label_train = data_train["sentiment"].tolist()
input_ids_train = x_train["input_ids"]
attn_mask_train = x_train["attention_mask"]

train_dataset = pd.DataFrame(
    {
        'text': text_train,
        'label': label_train,
        'input_ids': input_ids_train,
        'attention_mask': attn_mask_train,
    }
)
train_dataset.head()

In [None]:
# Assemble the test table: raw text, integer label and tokenizer outputs.
text_test = data_test["input"].tolist()
label_test = data_test["sentiment"].tolist()
input_ids_test = x_test["input_ids"]
attn_mask_test = x_test["attention_mask"]

test_dataset = pd.DataFrame(
    {
        'text': text_test,
        'label': label_test,
        'input_ids': input_ids_test,
        'attention_mask': attn_mask_test,
    }
)
test_dataset.head()

In [None]:
# Column names of the test table.
test_dataset.keys()

In [None]:
# Column names of the training table.
train_dataset.keys()

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class EmotionDataset(Dataset):
    """Map-style dataset backed by a DataFrame of pre-tokenized samples.

    Each row is expected to provide the columns ``text``, ``label``,
    ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, dataframe):
        # The frame is stored as-is; rows are looked up lazily in __getitem__.
        self.dataframe = dataframe

    def __len__(self):
        """Number of samples, i.e. number of rows in the backing frame."""
        return self.dataframe.shape[0]

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict.

        ``text`` and ``label`` are passed through unchanged, while
        ``input_ids`` and ``attention_mask`` are converted to tensors.
        """
        record = self.dataframe.iloc[idx]
        sample = {'text': record['text'], 'label': record['label']}
        for column in ('input_ids', 'attention_mask'):
            sample[column] = torch.tensor(record[column])
        return sample
# Wrap the tokenized train and test tables as torch datasets.
train_ds = EmotionDataset(train_dataset)
test_ds = EmotionDataset(test_dataset)

In [None]:
from transformers import AutoModelForSequenceClassification
import torch
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size the classification head from the label mapping instead of a literal 6,
# so it always matches the number of classes found in the dataset.
num_labels = len(sentiment_mapping)
# id2label / label2id are stored in the model config so predictions can be
# reported with readable class names.
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels, id2label=inverted_sentiment_mapping, label2id=sentiment_mapping)
         .to(device))

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    ``pred.label_ids`` supplies the reference class ids and
    ``pred.predictions`` the per-class scores; the predicted class is the
    argmax over the last axis.

    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
# Number of training samples.
len(data_train.input)

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 64
# Log roughly once per epoch. Clamped to at least 1 because a training set
# smaller than one batch would otherwise give logging_steps == 0, which is not
# a usable step interval.
logging_steps = max(1, len(data_train.input) // batch_size)
# Output directory name derived from the checkpoint name.
model_name = f"{model_ckpt}-finetuned-emotion"
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  log_level="error",
                                  report_to = "none")

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login.
# NOTE(review): nothing in the visible cells pushes to the Hub, so this step
# may be unnecessary — confirm before keeping it in a run-all workflow.
notebook_login()

In [None]:
from transformers import Trainer

# Wire the model, arguments, datasets and metric function into a Trainer,
# then run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train() 

In [None]:
# Run the trained model over the test dataset and keep the full prediction output.
pred_output = trainer.predict(test_ds)

In [None]:
# Metrics reported for the test-set prediction run.
pred_output.metrics

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
from scipy.special import softmax

# Reference class ids and the class with the highest probability per sample.
true_labels = pred_output.label_ids
predicted_labels = np.argmax(softmax(pred_output.predictions, axis=1), axis=1)

# Pin the class order explicitly: without `labels=`, a class that appears in
# neither array is dropped, and the matrix would no longer line up with a
# fixed list of tick labels.
class_ids = sorted(inverted_sentiment_mapping)
cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Tick labels in class-id order, taken from the id -> name mapping rather than
# a hand-typed list, so they stay correct if the label set or encoding changes.
# (For the recorded run this yields: angry, disgust, fear, happy, neutral, sad.)
labels = [inverted_sentiment_mapping[class_id] for class_id in sorted(inverted_sentiment_mapping)]

plt.figure(figsize=(10, 7))
# Rows are true classes, columns are predicted classes; cells show raw counts.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

In [None]:
# Tokenize one sample sentence as PyTorch tensors for a manual prediction.
# NOTE(review): max_length=70 is not used anywhere else in the visible cells —
# confirm it is the intended limit.
new_text = "im happy today"  
inputs = tokenizer(new_text, return_tensors="pt", padding=True, truncation=True, max_length=70)

In [None]:
# Inference only: switch to eval mode so dropout is disabled no matter which
# cell touched the model last.
model.eval()
# Move every input tensor to the device the model was placed on.
inputs = {key: value.to(device) for key, value in inputs.items()}
# No gradients are needed for a forward pass used purely for prediction.
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
# Turn the first (and only) row of logits into class probabilities.
probabilities = softmax(logits.cpu().numpy()[0])

# Integer id -> label string lookup.
inverted_sentiment_mapping = {code: name for name, code in sentiment_mapping.items()}

# One "<label> <percentage>" line per class id.
output_lines = []
for class_id in range(len(probabilities)):
    output_lines.append(f"{inverted_sentiment_mapping[class_id]} {probabilities[class_id]*100:.6f}")
formatted_output = "\n".join(output_lines)

print(formatted_output)