<a href="https://colab.research.google.com/github/1310-Akagami/1310-Akagami/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Path of the raw emotion dataset CSV (located under /content).
DATASET_PATH = "/content/emotion_dataset.csv"
df = pd.read_csv(DATASET_PATH)

In [None]:
# Fetch the NLTK resources by name: 'stopwords' (read via stopwords.words below),
# 'punkt' and 'wordnet' (presumably backing word_tokenize and WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
# Lower-case the raw text into a new working column, leaving "Text" untouched.
# Missing values are replaced with "" first: astype(str) on its own would turn
# NaN into the literal string "nan", which would then survive the later cleaning
# steps as if it were a real word.
df["cleaned_text"] = df["Text"].fillna("").astype(str).str.lower()
df.head()

In [None]:
# Remove every character that is not a lowercase ASCII letter or whitespace
# (digits, punctuation and any other symbols are dropped).
non_letter_pattern = re.compile(r'[^a-z\s]')
letters_only = df["cleaned_text"].map(lambda text: non_letter_pattern.sub('', text))
df["cleaned_text"] = letters_only
df.head()

In [None]:
# Fetch the 'punkt_tab' resource before tokenizing.
# NOTE(review): presumably required by word_tokenize in newer NLTK releases — confirm.
nltk.download('punkt_tab')

In [None]:
# Turn each cleaned string into a list of word tokens.
tokenized_text = df["cleaned_text"].map(word_tokenize)
df["cleaned_text"] = tokenized_text
df.head()

In [None]:
# English stop words held in a set for constant-time membership checks.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    return [token for token in tokens if token not in stop_words]


df["cleaned_text"] = df["cleaned_text"].apply(_drop_stop_words)
df.head()

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize every token of every row; each row stays a list of strings.
lemmatized_text = df["cleaned_text"].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)
df["cleaned_text"] = lemmatized_text

df.head()

In [None]:
def _join_tokens(tokens):
    """Join a token list into one space-separated string.

    Values that are already strings are returned unchanged, so running this
    cell a second time does not split the text into single characters
    (``" ".join("abc")`` would yield ``"a b c"``).
    """
    return tokens if isinstance(tokens, str) else " ".join(tokens)


# Collapse each token list back into a single whitespace-separated string.
df["cleaned_text"] = df["cleaned_text"].apply(_join_tokens)
df.head()

In [None]:
# Show the distinct labels in the "Emotion" column.
df['Emotion'].unique()

In [None]:
# Fine-grained emotion labels grouped under the coarse sentiment they belong to.
sentiment_groups = {
    'positive': ('joy',),
    'neutral': ('neutral', 'surprise'),
    'negative': ('sadness', 'anger', 'fear', 'disgust', 'shame'),
}

# Invert the grouping into the flat emotion -> sentiment lookup.
emotion_mapping = {
    emotion: sentiment
    for sentiment, emotions in sentiment_groups.items()
    for emotion in emotions
}

# Labels that are missing from the lookup become NaN in the new column.
df['Sentiment'] = df['Emotion'].map(emotion_mapping)
df.head()

In [None]:
# Show the distinct values of the derived "Sentiment" column
# (a NaN here would indicate an emotion label missing from emotion_mapping).
df['Sentiment'].unique()

In [None]:
from sklearn.utils import resample

# Partition the rows by sentiment class.
df_negative = df[df['Sentiment'] == 'negative']
df_neutral = df[df['Sentiment'] == 'neutral']
df_positive = df[df['Sentiment'] == 'positive']

# Draw the positive and neutral classes with replacement until each has as many
# rows as the negative class.
# NOTE(review): resampling happens before the train/test split, so copies of the
# same row can land in both partitions and inflate test metrics — consider
# splitting first and resampling only the training partition.
df_positive_upsampled = resample(df_positive, replace=True, n_samples=len(df_negative), random_state=42)
df_neutral_upsampled = resample(df_neutral, replace=True, n_samples=len(df_negative), random_state=42)

df_balanced = pd.concat([df_negative, df_positive_upsampled, df_neutral_upsampled])

# Shuffle with a fixed seed so the row order (and everything derived from it)
# is identical on every run, then rebuild a clean 0..n-1 index.
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
df_balanced.head()

Unnamed: 0,Emotion,Text,cleaned_text,Sentiment
0,joy,I was doing an experiment and was not getting ...,experiment getting appropriate result despite ...,positive
1,joy,You are afraid and full of excitement at thoug...,afraid full excitement thought might find,positive
2,joy,She was already feeling a bit better and she f...,already feeling bit better felt quite amused c...,positive
3,surprise,Dit weekend ondersteuning bieden aan kind bij ...,dit weekend ondersteuning bieden aan kind bij ...,neutral
4,neutral,Positive 。,positive,neutral


In [None]:
# Count the rows per sentiment class in the resampled frame.
df_balanced['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
positive,17431
neutral,17431
negative,17431


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-03-04 05:08:29--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-03-04 05:08:29--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-03-04 05:08:29--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
import numpy as np
import pandas as pd

# Load pre-trained GloVe embeddings
def load_glove_embeddings(glove_path, embedding_dim=300):
    """Load word vectors from a space-separated GloVe text file.

    Every non-blank line must hold a token followed by ``embedding_dim`` numbers.
    Lines are split from the right, so a token that itself contains spaces is
    kept intact instead of being mistaken for vector components.

    Args:
        glove_path: Path of the UTF-8 encoded GloVe ``.txt`` file.
        embedding_dim: Number of vector components expected on each line.

    Returns:
        dict mapping each token (str) to a float32 ``np.ndarray`` of shape
        ``(embedding_dim,)``.

    Raises:
        ValueError: If a non-blank line has fewer than ``embedding_dim``
            components, or a component is not a valid number.
    """
    embeddings = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # Only strip from the right: the token sits on the left and may
            # legitimately start with (or consist of) unusual characters.
            line = line.rstrip('\r\n ')
            if not line:
                continue  # tolerate blank lines such as a trailing newline
            parts = line.rsplit(' ', embedding_dim)
            if len(parts) != embedding_dim + 1:
                raise ValueError(
                    f"{glove_path}, line {line_number}: expected a token followed by "
                    f"{embedding_dim} values, found {len(parts) - 1} values"
                )
            embeddings[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return embeddings

# Convert text data to GloVe embeddings
def text_to_glove(texts, embeddings, embedding_dim=300):
    """Embed each text as the mean of the vectors of its known words.

    Args:
        texts: Iterable of texts. Entries that are not strings, are empty, or
            contain no word present in ``embeddings`` map to a zero vector.
        embeddings: Mapping from word to a 1-D vector of length ``embedding_dim``.
        embedding_dim: Length of every vector; used for the zero fallback rows
            and for the shape of an empty result.

    Returns:
        2-D ``np.ndarray`` of shape ``(number of texts, embedding_dim)``. An
        empty ``texts`` yields shape ``(0, embedding_dim)``.
    """
    vectors = []
    for text in texts:
        # str.split() on an empty or whitespace-only string gives [], so those
        # inputs fall through to the zero-vector branch like non-string values.
        words = text.split() if isinstance(text, str) else []
        word_vectors = [embeddings[word] for word in words if word in embeddings]
        if word_vectors:
            vectors.append(np.mean(word_vectors, axis=0))
        else:
            # float32 zeros keep the stacked matrix from being upcast to float64
            # when the word vectors are float32.
            vectors.append(np.zeros(embedding_dim, dtype=np.float32))
    if not vectors:
        # np.array([]) would have shape (0,), which breaks 2-D consumers.
        return np.zeros((0, embedding_dim), dtype=np.float32)
    return np.array(vectors)

# Load the GloVe vectors (adjust the path to wherever the file lives).
# NOTE(review): presumably one of the files extracted from glove.6B.zip — confirm.
glove_path = "glove.6B.300d.txt"  # 300-dimensional vectors; must agree with embedding_dim below
glove_embeddings = load_glove_embeddings(glove_path, embedding_dim=300)

# Turn every cleaned text of the balanced frame into one averaged GloVe vector;
# missing texts are replaced with "" before conversion.
glove_matrix = text_to_glove(df_balanced["cleaned_text"].fillna(""), glove_embeddings, embedding_dim=300)

# Wrap the matrix in a DataFrame and attach the Sentiment labels as a column.
# The assignment aligns on the index: glove_df gets a default 0..n-1 index and
# df_balanced was reset to the same range after shuffling.
glove_df = pd.DataFrame(glove_matrix)
glove_df["Sentiment"] = df_balanced["Sentiment"]  # add the sentiment labels back

glove_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,Emotion
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neutral
1,-0.170938,-0.209044,0.113614,0.17194,0.117301,0.033579,0.120444,0.236349,0.145778,-0.734794,...,-0.271766,-0.041954,-0.078246,-0.123567,-0.130706,-0.301457,-0.191768,0.088089,-0.099215,joy
2,-0.166604,-0.09639,-0.084806,0.076559,-0.162257,0.158951,0.086146,0.008101,-0.151103,-0.737579,...,-0.070999,0.128084,0.051868,-0.125505,-0.316712,0.195995,0.057517,-0.049985,0.153528,sadness
3,-0.126991,0.011423,-0.018515,-0.073564,0.010722,0.140782,-0.032093,0.000814,0.014952,-1.104113,...,0.019277,-0.018301,-0.095924,0.101733,-0.116273,0.039069,-0.00059,0.054296,0.021246,joy
4,-0.288506,0.034852,-0.259154,0.199023,-0.389844,-0.180626,-0.226715,-0.026402,-0.443477,-0.236634,...,0.037723,-0.02169,-0.218963,-0.2843,-0.314878,0.254091,-0.273489,0.098884,0.022543,joy


In [None]:
# Separate the embedding columns (features) from the label column (target).
# The label column of glove_df is named "Sentiment"; there is no "Emotion"
# column, so selecting it fails with a KeyError on a fresh kernel.
X = glove_df.drop('Sentiment', axis=1)
y = glove_df['Sentiment']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Shapes of the four partitions, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((27833, 300), (6959, 300), (27833,), (6959,))

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier

# Fit the tree on the training partition (fit returns the estimator itself)
# and predict the held-out partition.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Precision / recall / F1 use weighted averaging.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred_dt, average='weighted'
)
accuracy = accuracy_score(y_test, y_pred_dt)

# Report all metrics, one per line.
print(
    "🔹 Decision Tree Performance Metrics:",
    f"Accuracy  : {accuracy:.4f}",
    f"Precision : {precision:.4f}",
    f"Recall    : {recall:.4f}",
    f"F1 Score  : {f1:.4f}",
    sep="\n",
)

🔹 Decision Tree Performance Metrics:
Accuracy  : 0.4167
Precision : 0.4169
Recall    : 0.4167
F1 Score  : 0.4165


In [None]:
from sklearn.naive_bayes import GaussianNB

# The averaged GloVe features are real-valued and include negative components.
# MultinomialNB only accepts non-negative (count-like) features and rejects such
# input with a ValueError, so the Gaussian variant for continuous features is
# used instead.
nb_model = GaussianNB()

# Train the model
nb_model.fit(X_train, y_train)

# Predict on test data
y_pred_nb = nb_model.predict(X_test)

# Compute performance metrics; zero_division=1 scores a class with no predicted
# samples as 1 instead of emitting a warning.
accuracy = accuracy_score(y_test, y_pred_nb)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_nb, average='weighted', zero_division=1)

# Print results
print(" Naïve Bayes Performance Metrics:")
print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1 Score  : {f1:.4f}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, Input
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.callbacks import EarlyStopping

# --- Data Preprocessing ---

# Convert the string class labels into integer ids. The encoder is fitted on
# the training labels only and then reused for the test labels.
label_encoder = LabelEncoder()
y_train_numeric = label_encoder.fit_transform(y_train)
y_test_numeric = label_encoder.transform(y_test)

# X_train / X_test are ordinary dense feature tables, so the pandas `.sparse`
# accessor (valid only for sparse-dtype data) raises AttributeError on them.
# Convert straight to float32 NumPy arrays instead.
X_train_dense = np.asarray(X_train, dtype=np.float32)
X_test_dense = np.asarray(X_test, dtype=np.float32)

# One-hot encode the numeric labels
num_classes = len(np.unique(y_train_numeric))  # Dynamically detect number of classes
y_train_encoded = to_categorical(y_train_numeric, num_classes=num_classes)
y_test_encoded = to_categorical(y_test_numeric, num_classes=num_classes)

# --- Model Definition ---
model = Sequential()

# Explicit input layer (replaces the deprecated `input_shape=` argument on Dense)
model.add(Input(shape=(X_train_dense.shape[1],)))

# First hidden layer with L2 regularization, batch norm and dropout
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dropout(0.4))

# Second hidden layer
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dropout(0.4))

# Third hidden layer (no dropout before the output)
model.add(Dense(16, activation='relu', kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())

# Output layer: one softmax unit per class
model.add(Dense(num_classes, activation='softmax'))

# Compile with Adam at a learning rate of 1e-4; categorical cross-entropy
# matches the one-hot encoded targets.
optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=['accuracy'])

# --- Define Early Stopping Callback ---
# Stop once val_loss has not improved for 5 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# --- Train the Model with Validation Split and Early Stopping ---
history = model.fit(
    X_train_dense,
    y_train_encoded,
    batch_size=64,
    epochs=8,
    validation_split=0.2,  # Using 20% of the training data for validation
    callbacks=[early_stopping]
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_dense, y_test_encoded, verbose=0)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
