<a href="https://colab.research.google.com/github/Bibhu0203/SemEval-2024/blob/main/Model2_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install and Import Dependencies
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau

print("Dependencies imported.")

# Step 2: Load Dataset
train_file_path = '/content/eng.csv'  # Update this path if necessary
data = pd.read_csv(train_file_path)
data.head()

# Step 3: Preprocess Data
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

data['text'] = data['text'].apply(clean_text)

X = data['text']
y = data[['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Tokenize and Pad Sequences
max_words = 10000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')

X_val_sequences = tokenizer.texts_to_sequences(X_val)
X_val_padded = pad_sequences(X_val_sequences, maxlen=max_sequence_length, padding='post')

# Step 5: Define and Train the Model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(5, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

# Learning rate reduction callback
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.001)

# Train the model
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_data=(X_val_padded, y_val), callbacks=[reduce_lr])

# Step 6: Load and Preprocess the Test Dataset
test_file_path = '/content/eng_a.csv'  # Update this path
test_data = pd.read_csv(test_file_path)

# Preprocess the test data
test_data['text'] = test_data['text'].apply(clean_text)
X_test = test_data['text']
y_test = test_data[['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']]

# Tokenize and pad the test sequences
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

# Step 7: Evaluate the Model with F1 Score
def evaluate_model_f1(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_binary = (y_pred > 0.5).astype(int)

    f1_scores = f1_score(y_test, y_pred_binary, average=None)
    average_f1 = f1_score(y_test, y_pred_binary, average='macro')

    print("F1 Score for each class:", f1_scores)
    print("Average F1 Score:", average_f1)

    return f1_scores, average_f1

# Evaluate the model
f1_scores, average_f1 = evaluate_model_f1(model, X_test_padded, y_test)

# Step 8: Predict Emotions with the Model
def predict_emotions(model, tokenizer, max_sequence_length):
    user_input = input("Enter a text to analyze the emotions: ")
    cleaned_text = clean_text(user_input)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='post')

    prediction = model.predict(padded_sequence)[0]
    emotions = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
    predicted_labels = (prediction > 0.5).astype(int)

    print("\nEmotion Predictions:")
    for i, emotion in enumerate(emotions):
        if predicted_labels[i] == 1:
            print(f"{emotion}: Present")
        else:
            print(f"{emotion}: Not Present")

    return predicted_labels

# Predict emotions using the model
predict_emotions(model, tokenizer, max_sequence_length)


Dependencies imported.


None
Epoch 1/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 252ms/step - accuracy: 0.4198 - loss: 0.6256 - val_accuracy: 0.4874 - val_loss: 0.5652 - learning_rate: 0.0010
Epoch 2/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 264ms/step - accuracy: 0.4983 - loss: 0.5740 - val_accuracy: 0.4874 - val_loss: 0.5586 - learning_rate: 0.0010
Epoch 3/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 242ms/step - accuracy: 0.4790 - loss: 0.5533 - val_accuracy: 0.4801 - val_loss: 0.5435 - learning_rate: 0.0010
Epoch 4/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 266ms/step - accuracy: 0.5098 - loss: 0.4929 - val_accuracy: 0.4278 - val_loss: 0.5538 - learning_rate: 0.0010
Epoch 5/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 240ms/step - accuracy: 0.5682 - loss: 0.4424 - val_accuracy: 0.4368 - val_loss: 0.5866 - learning_rate: 0.0010
Epoch 6/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

  return x.astype(dtype, copy=copy, casting=casting)
  return x.astype(dtype, copy=copy, casting=casting)


ValueError: Input y_true contains NaN.

In [None]:
# Step 1: Install and Import Dependencies
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau

print("Dependencies imported.")

# Step 2: Load Dataset
file_path = '/content/eng.csv'  # Update this path if necessary
data = pd.read_csv(file_path)
data.head()

# Step 3: Preprocess Data
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

data['text'] = data['text'].apply(clean_text)

X = data['text']
y = data[['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']]

# Remove rows with NaN values from y
data = data.dropna(subset=['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise'])
X = data['text']
y = data[['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Tokenize and Pad Sequences
max_words = 10000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

# Step 5: Define and Train the Improved Model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(5, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

# Learning rate reduction callback
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.001)

# Train the model
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[reduce_lr])

# Step 6: Evaluate the Improved Model with F1 Score
def evaluate_model_f1(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_binary = (y_pred > 0.5).astype(int)

    f1_scores = f1_score(y_test, y_pred_binary, average=None)
    average_f1 = f1_score(y_test, y_pred_binary, average='macro')

    print("F1 Score for each class:", f1_scores)
    print("Average F1 Score:", average_f1)

    return f1_scores, average_f1

# Remove rows with NaN values from y_test
valid_indices = ~y_test.isna().any(axis=1)
X_test_padded = X_test_padded[valid_indices]
y_test = y_test[valid_indices]

# Evaluate the model
f1_scores, average_f1 = evaluate_model_f1(model, X_test_padded, y_test)

# Step 7: Predict Emotions with Improved Model
def predict_emotions(model, tokenizer, max_sequence_length):
    user_input = input("Enter a text to analyze the emotions: ")
    cleaned_text = clean_text(user_input)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='post')

    prediction = model.predict(padded_sequence)[0]
    emotions = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
    predicted_labels = (prediction > 0.5).astype(int)

    print("\nEmotion Predictions:")
    for i, emotion in enumerate(emotions):
        if predicted_labels[i] == 1:
            print(f"{emotion}: Present")
        else:
            print(f"{emotion}: Not Present")

    return predicted_labels

# Predict emotions using the improved model
predict_emotions(model, tokenizer, max_sequence_length)


Dependencies imported.


None
Epoch 1/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 268ms/step - accuracy: 0.3596 - loss: 0.6335 - val_accuracy: 0.4989 - val_loss: 0.5809 - learning_rate: 0.0010
Epoch 2/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 246ms/step - accuracy: 0.4327 - loss: 0.5859 - val_accuracy: 0.4989 - val_loss: 0.5794 - learning_rate: 0.0010
Epoch 3/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 251ms/step - accuracy: 0.4644 - loss: 0.5817 - val_accuracy: 0.4989 - val_loss: 0.5729 - learning_rate: 0.0010
Epoch 4/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 244ms/step - accuracy: 0.4939 - loss: 0.5411 - val_accuracy: 0.4718 - val_loss: 0.5671 - learning_rate: 0.0010
Epoch 5/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 252ms/step - accuracy: 0.4273 - loss: 0.4795 - val_accuracy: 0.3521 - val_loss: 0.5813 - learning_rate: 0.0010
Epoch 6/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

array([0, 1, 0, 0, 1])