In [None]:
from google.colab import drive

# Directory inside the Colab VM where Google Drive is attached; the dataset
# path used by the loading cell lives underneath it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
import pandas as pd

# Absolute location of the labelled-tweet CSV on the mounted Drive.
file_path = '/content/drive/MyDrive/Data/tweet_emotions.csv'

# Parse the whole file into a DataFrame; later cells read its 'Tweet' and
# 'Label' columns.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,Id,Tweet,Label
0,145353048817012000,Thinks that @melbahughes had a great 50th birt...,surprise
1,144279638024257000,"Como una expresiÃ³n tan simple, una sola oraci...",sadness
2,140499585285111000,the moment when you get another follower and y...,joy
3,145207578270507000,Be the greatest dancer of your life! practice ...,joy
4,139502146390470000,eww.. my moms starting to make her annual rum ...,disgust


In [None]:

# Tally how many tweets carry each emotion label (most frequent first) to
# expose the class balance before modelling.
label_column = df['Label']
label_column.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
joy,8240
surprise,3849
sadness,3830
fear,2816
anger,1555
disgust,761


In [None]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical


# Vocabulary cap and fixed sequence length used by the tokenizer and padding.
VOCAB_SIZE = 5000
MAX_LEN = 100

# A missing tweet would reach the tokenizer as a float NaN and make it fail;
# treat missing text as an empty string instead.
tweets = df['Tweet'].fillna('').astype(str)

# Word-level tokenizer limited to the VOCAB_SIZE most frequent words; any
# other word is mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)

# Pad at the end of each sequence so every row has exactly MAX_LEN ids.
padded_sequences = pad_sequences(sequences, padding='post', maxlen=MAX_LEN)

# Fail early with a clear message rather than deep inside the label tokenizer.
if df['Label'].isna().any():
    raise ValueError("Column 'Label' contains missing values; drop or fill them before encoding.")

unique_labels = df['Label'].unique()
print("Unique Labels:", unique_labels)

# A second tokenizer is (re)used as a label encoder: each distinct label gets
# a 1-based integer id in `label_tokenizer.word_index`.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(unique_labels)
label_sequences = label_tokenizer.texts_to_sequences(df['Label'])

# Every label must map to exactly one id. A label that tokenizes to nothing
# (e.g. punctuation only) or to several words would otherwise raise a bare
# IndexError or silently keep only its first word.
bad_rows = [row for row, ids in enumerate(label_sequences) if len(ids) != 1]
if bad_rows:
    raise ValueError(
        f"{len(bad_rows)} label(s) did not map to exactly one class id "
        f"(first offending row positions: {bad_rows[:5]})"
    )

# Shift the 1-based ids to 0-based and one-hot encode with a fixed width so the
# column count always equals the number of known classes.
num_classes = len(label_tokenizer.word_index)
labels = to_categorical([ids[0] - 1 for ids in label_sequences], num_classes=num_classes)

print(f'Padded Sequences Shape: {padded_sequences.shape}')
print(f'Labels Shape: {labels.shape}')


Unique Labels: ['surprise' 'sadness' 'joy' 'disgust' 'fear' 'anger']
Padded Sequences Shape: (21051, 100)
Labels Shape: (21051, 6)


In [None]:
# Hold out 20% of the samples for testing. Stratifying on the one-hot label
# rows keeps the class proportions comparable between the two partitions, and
# the fixed seed makes the split repeatable.
split_arrays = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_arrays

print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')


X_train shape: (16840, 100), y_train shape: (16840, 6)
X_test shape: (4211, 100), y_test shape: (4211, 6)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Input

# Derive the model dimensions from the preprocessing objects instead of
# repeating their literal values, so the network cannot drift out of sync
# with the tokenizer, the padding length or the label encoding.
vocab_size = tokenizer.num_words            # token ids produced are < num_words
sequence_length = padded_sequences.shape[1]
n_classes = labels.shape[1]

# Bag-of-embeddings classifier: embed each token id, average over the
# sequence axis, then classify with a small dense head.
# An explicit Input replaces Embedding's `input_length` argument (deprecated
# in Keras 3); it also means the model is built before `summary()` is called.
# NOTE(review): sequences are post-padded with 0 and the pooling layer averages
# over every position, padding included. `mask_zero=True` on the Embedding
# would exclude padding, but first confirm no tweet tokenizes to an empty
# sequence (an all-padding row would then average over zero positions).
model = Sequential([
    Input(shape=(sequence_length,)),
    Embedding(input_dim=vocab_size, output_dim=64),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(n_classes, activation='softmax'),
])

# One-hot targets + softmax output -> categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


model.summary()


In [None]:

# Monitor training on a validation subset carved out of the training data,
# not on the test set: the test set should stay untouched until the final
# evaluation so its score remains an unbiased estimate.
# Stratify on the class index so rare classes are represented in both parts.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train.argmax(axis=1),
    random_state=42,
)

history = model.fit(X_fit, y_fit, epochs=10, validation_data=(X_val, y_val), batch_size=32)

# Final scores: accuracy on the samples actually used for fitting, and on the
# held-out test set that played no part in training or monitoring.
train_loss, train_acc = model.evaluate(X_fit, y_fit)
test_loss, test_acc = model.evaluate(X_test, y_test)

print(f'Training Accuracy: {train_acc}')
print(f'Testing Accuracy: {test_acc}')


Epoch 1/10
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.3781 - loss: 1.5931 - val_accuracy: 0.3916 - val_loss: 1.5501
Epoch 2/10
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.3975 - loss: 1.5246 - val_accuracy: 0.4654 - val_loss: 1.3842
Epoch 3/10
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.4856 - loss: 1.3369 - val_accuracy: 0.4951 - val_loss: 1.2799
Epoch 4/10
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.5232 - loss: 1.2292 - val_accuracy: 0.4804 - val_loss: 1.3258
Epoch 5/10
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.5513 - loss: 1.1793 - val_accuracy: 0.5049 - val_loss: 1.3072
Epoch 6/10
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.5826 - loss: 1.1062 - val_accuracy: 0.5555 - val_loss: 1.1807
Epoch 7/10
[1m527/527[0m 

In [None]:
from sklearn.metrics import classification_report

# Predicted class = index of the largest softmax probability; true class =
# index of the 1 in the one-hot target row.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# `label_tokenizer.word_index` maps label name -> 1-based id, and the one-hot
# column for a label is that id minus one. Order the names by id explicitly
# rather than relying on dict iteration order, and pass the matching integer
# ids as `labels` so each name stays attached to its class even if some class
# is absent from both the targets and the predictions.
class_names = [
    name for name, _ in sorted(label_tokenizer.word_index.items(), key=lambda pair: pair[1])
]
class_ids = list(range(len(class_names)))

report = classification_report(
    y_test_classes,
    y_pred_classes,
    labels=class_ids,
    target_names=class_names,
)
print(report)


[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
              precision    recall  f1-score   support

    surprise       0.62      0.43      0.51       770
     sadness       0.41      0.56      0.47       766
         joy       0.63      0.84      0.72      1649
     disgust       0.36      0.03      0.06       152
        fear       0.75      0.45      0.56       563
       anger       0.63      0.21      0.31       311

    accuracy                           0.59      4211
   macro avg       0.57      0.42      0.44      4211
weighted avg       0.60      0.59      0.56      4211

