**TASK-01** Tweet Emotions Dataset

In [None]:
from google.colab import drive

# Colab only: attach Google Drive at this path so files stored in Drive can be read
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
import pandas as pd

# Read the tweet-emotion CSV from the mounted Drive and preview the first five rows
TWEETS_CSV = '/content/drive/My Drive/datasets/tweet_emotions.csv'
df = pd.read_csv(TWEETS_CSV)

print(df.head(5))


                   Id                                              Tweet  \
0  145353048817012000  Thinks that @melbahughes had a great 50th birt...   
1  144279638024257000  Como una expresiÃ³n tan simple, una sola oraci...   
2  140499585285111000  the moment when you get another follower and y...   
3  145207578270507000  Be the greatest dancer of your life! practice ...   
4  139502146390470000  eww.. my moms starting to make her annual rum ...   

      Label  
0  surprise  
1   sadness  
2       joy  
3       joy  
4   disgust  


In [None]:
# Count how many tweets carry each emotion label
class_counts = df['Label'].value_counts()
print("Number of instances for each class label:", class_counts, sep="\n")


Number of instances for each class label:
Label
joy         8240
surprise    3849
sadness     3830
fear        2816
anger       1555
disgust      761
Name: count, dtype: int64


In [None]:
# Step 1: Prepare the text data
# Pull the raw tweet text out of the frame as an array for the tokenizer
tweet_column = df['Tweet']
tweets = tweet_column.values


In [None]:
# Peek at the first five raw tweets to sanity-check the extracted text
sample_tweets = tweets[:5]
print(sample_tweets)

['Thinks that @melbahughes had a great 50th birthday party :) '
 'Como una expresiÃ³n tan simple, una sola oraciÃ³n, puede llegara daÃ±arte tanto. '
 'the moment when you get another follower and you cheer. '
 'Be the greatest dancer of your life! practice daily positive habits.  #fun #freedom #habits'
 'eww.. my moms starting to make her annual rum cake for the whole ramdyal/ally family. fml fml fml the smelll....... ']


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping



# Step 2: Tokenize the text data and convert it into binary format (0s and 1s)



In [None]:
# Cap the vocabulary at 5000 indices; words outside it are represented by "<OOV>"
VOCAB_SIZE = 5000
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")

# NOTE(review): the vocabulary is learned from every tweet, including those that
# later land in the test split — consider fitting on the training texts only.
tokenizer.fit_on_texts(tweets)

# One row per tweet, one column per vocabulary index; mode='binary' marks word presence
binary_matrix = tokenizer.texts_to_matrix(tweets, mode='binary')

# Input to the model
print("Binary Matrix shape:", binary_matrix.shape)


Binary Matrix shape: (21051, 5000)


# Step 3: Prepare the label data
Encode the string labels as integers, then convert the encoded labels to categorical (one-hot) vectors.


In [None]:
# Emotion names as they appear in the dataset
labels = df['Label'].values

# Map each distinct emotion name to an integer id
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit(labels).transform(labels)

# Expand the integer ids into one-hot rows (one column per class)
categorical_labels = to_categorical(encoded_labels)

# Output for the model: confirm the shape of the one-hot targets
print("Categorical Labels shape:", categorical_labels.shape)


Categorical Labels shape: (21051, 6)


# Step 5: Split the data into training and testing sets (80% train, 20% test)


In [None]:
# Hold out 20% of the tweets for testing. Stratifying on the integer labels keeps
# the class proportions comparable in both partitions; the fixed random_state
# makes the split repeatable.
split_options = dict(test_size=0.2, stratify=encoded_labels, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    binary_matrix, categorical_labels, **split_options
)

# Report the shape of every partition
for caption, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)


X_train shape: (16840, 5000)
X_test shape: (4211, 5000)
y_train shape: (16840, 6)
y_test shape: (4211, 6)




# Step 6: Build the Dense Neural Network



In [None]:


from tensorflow.keras.layers import Input
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable, in line with the fixed random_state used for the data split.
set_random_seed(42)

# Feed-forward classifier over the binary bag-of-words vectors.
# An explicit Input layer replaces `input_shape=` on the first Dense layer, which
# newer Keras versions warn about, and matches the Task-02 model definition.
model = Sequential([
    Input(shape=(X_train.shape[1],)),                # Input: one feature per matrix column
    Dense(64, activation='relu'),                    # Hidden layer 1
    Dense(32, activation='relu'),                    # Hidden layer 2
    Dense(y_train.shape[1], activation='softmax')    # Output layer: one probability per class
])

# Display the model summary
model.summary()


# Step 7: Compile the model



In [None]:
# Multi-class setup: cross-entropy against the one-hot targets, Adam optimizer,
# and accuracy reported during training and evaluation
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:

# Stop once validation accuracy has gone 3 epochs without improving, and roll the
# model back to the weights from its best validation epoch.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

# Train the model with the EarlyStopping callback.
# validation_split=0.2 reserves 20% of the training data for validation.
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, callbacks=[early_stopping])


Epoch 1/20
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.4297 - loss: 1.4701 - val_accuracy: 0.5603 - val_loss: 1.1866
Epoch 2/20
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.6895 - loss: 0.8860 - val_accuracy: 0.5635 - val_loss: 1.1843
Epoch 3/20
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8029 - loss: 0.6032 - val_accuracy: 0.5594 - val_loss: 1.3061
Epoch 4/20
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.8763 - loss: 0.3983 - val_accuracy: 0.5600 - val_loss: 1.4952
Epoch 5/20
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9297 - loss: 0.2468 - val_accuracy: 0.5445 - val_loss: 1.7661


# Step 10: Evaluate the model on the test set


In [None]:
# Score the trained model on the held-out test set and on the training data;
# the gap between the two accuracies indicates how much the model overfits.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=0)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Training Accuracy: {train_accuracy:.4f}")


Test Accuracy: 0.5847
Training Accuracy: 0.7700


Analyze per-class accuracy (precision, recall and F1) on the test set.



In [None]:
from sklearn.metrics import classification_report

# Collapse the one-hot targets and the predicted probability rows to class indices
true_classes = y_test.argmax(axis=-1)
predictions = model.predict(X_test)
predicted_classes = predictions.argmax(axis=-1)

# Per-class precision / recall / F1, with rows named after the original emotion labels
report = classification_report(
    true_classes,
    predicted_classes,
    target_names=label_encoder.classes_,
)
print(report)


[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
              precision    recall  f1-score   support

       anger       0.50      0.33      0.40       311
     disgust       0.34      0.11      0.16       152
        fear       0.55      0.60      0.57       563
         joy       0.67      0.76      0.71      1649
     sadness       0.47      0.47      0.47       766
    surprise       0.56      0.51      0.53       770

    accuracy                           0.58      4211
   macro avg       0.52      0.46      0.47      4211
weighted avg       0.57      0.58      0.57      4211



Yes, the accuracy varies considerably from class to class in the Tweet Emotions classification task: the F1-score ranges from 0.16 for disgust to 0.71 for joy.

### Reasons :

- **Class Imbalance:** Some emotions, like joy, have many more examples than others, like disgust. This makes the model better at predicting common emotions while it struggles with the less frequent ones.

- **Less Data for Certain Classes:** The model has fewer examples to learn from for emotions that don’t appear as often, which makes it harder for it to predict them correctly.

- **Context Sensitivity:** Tweets can show emotions in different ways depending on the situation. This variability makes it harder for the model to perform well across all emotions.



**TASK-02** BBC Sports Dataset

In [None]:
import pandas as pd

# Location of the BBC Sports CSV on the mounted Drive (adjust the path accordingly)
BBC_SPORTS_CSV = '/content/drive/My Drive/datasets/bbcsports.csv'
df = pd.read_csv(BBC_SPORTS_CSV)

# Preview the first five rows to confirm the file loaded
print(df.head(5))



   Unnamed: 0                                               text   label
0           0  Sharapova overcomes tough Molik\n\nWimbledon c...  tennis
1           1  GB players warned over security\n\nBritain's D...  tennis
2           2  Federer wins title in Rotterdam\n\nWorld numbe...  tennis
3           3  Mauresmo fights back to win title\n\nWorld num...  tennis
4           4  Agassi into second round in Dubai\n\nFourth se...  tennis


In [None]:
# List the DataFrame's column names before selecting the text and label columns
column_index = df.columns
print(column_index)



Index(['Unnamed: 0', 'text', 'label'], dtype='object')


In [None]:
# Tally the articles per sport in the 'label' column
label_column = df['label']
class_counts = label_column.value_counts()
print("Number of instances for each class label:", class_counts, sep="\n")


Number of instances for each class label:
label
football     265
rugby        147
cricket      124
athletics    101
tennis       100
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Step 3: Vectorize the article text
# Raw article bodies taken from the 'text' column
texts = df['text'].values

# Cap the vocabulary at 5000 indices; words outside it are represented by "<OOV>"
VOCAB_SIZE = 5000
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

# One row per article; mode='binary' marks word presence per vocabulary index
binary_matrix = tokenizer.texts_to_matrix(texts, mode='binary')

# Encode the targets: label string -> integer id -> one-hot row
labels = df['label'].values
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit(labels).transform(labels)
categorical_labels = to_categorical(encoded_labels)

# Sanity-check the shapes of the model input and the model output
for caption, array in (
    ("Binary Matrix shape:", binary_matrix),
    ("Categorical Labels shape:", categorical_labels),
):
    print(caption, array.shape)


Binary Matrix shape: (737, 5000)
Categorical Labels shape: (737, 5)


In [None]:
# Step 4: 80/20 train/test split.
# Stratifying on the integer labels keeps the class proportions comparable in both
# partitions; random_state=42 makes the split repeatable.
split_options = dict(test_size=0.2, stratify=encoded_labels, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    binary_matrix, categorical_labels, **split_options
)

# Confirm feature and target shapes for each partition
print("Training set shape:", *(part.shape for part in (X_train, y_train)))
print("Testing set shape:", *(part.shape for part in (X_test, y_test)))


Training set shape: (589, 5000) (589, 5)
Testing set shape: (148, 5000) (148, 5)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable, in line with the fixed random_state used for the data split.
set_random_seed(42)

# Step 5: Build the Dense Neural Network
# Input width follows the feature matrix; output width follows the one-hot targets.
model = Sequential([
    Input(shape=(X_train.shape[1],)),                             # Input layer
    Dense(64, activation='relu'),                                   # Hidden layer 1
    Dense(32, activation='relu'),                                   # Hidden layer 2
    Dense(y_train.shape[1], activation='softmax')                 # Output layer: one probability per class
])

# Step 6: Compile the model
# Cross-entropy against the one-hot targets, Adam optimizer, accuracy as the metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display the model summary
model.summary()


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Step 7: Early stopping — stop after 3 epochs without a better validation
# accuracy and restore the best weights seen so far
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

# Step 8: Fit for at most 20 epochs, validating on 20% of the training data
fit_options = dict(
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)
history = model.fit(X_train, y_train, **fit_options)

# Show the per-epoch metrics recorded during training
print("Training history:", history.history)


Epoch 1/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.4398 - loss: 1.4430 - val_accuracy: 0.6441 - val_loss: 0.9174
Epoch 2/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8917 - loss: 0.5674 - val_accuracy: 0.9576 - val_loss: 0.3950
Epoch 3/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9934 - loss: 0.1559 - val_accuracy: 0.9831 - val_loss: 0.1901
Epoch 4/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.0380 - val_accuracy: 0.9746 - val_loss: 0.1400
Epoch 5/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.0153 - val_accuracy: 0.9746 - val_loss: 0.1146
Epoch 6/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0087 - val_accuracy: 0.9746 - val_loss: 0.1001
Training history: {'accuracy': [0.520

In [None]:
# Steps 9-10: Score the trained model on the held-out test set and on the
# training data so the two accuracies can be compared side by side.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=0)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Training Accuracy: {train_accuracy:.4f}")


Test Accuracy: 0.9865
Training Accuracy: 0.9966


In [None]:
from sklearn.metrics import classification_report

# Step 11: Make predictions on the test set
y_pred = model.predict(X_test)            # One row of class probabilities per test article
y_pred_classes = y_pred.argmax(axis=1)    # Get the index of the max probability
y_true_classes = y_test.argmax(axis=1)    # Undo the one-hot encoding of the true labels

# Step 12: Generate a classification report
# target_names maps the integer class ids back to the label strings learned by
# label_encoder, so each row is named after its sport instead of 0-4
# (same convention as the Task-01 report).
print("Classification Report:")
print(classification_report(y_true_classes, y_pred_classes, target_names=label_encoder.classes_))


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       1.00      0.96      0.98        25
           2       0.98      1.00      0.99        53
           3       1.00      1.00      1.00        30
           4       1.00      0.95      0.97        20

    accuracy                           0.99       148
   macro avg       0.99      0.98      0.98       148
weighted avg       0.99      0.99      0.99       148



Yes, the accuracy does vary a little from class to class in the BBC Sports classification task, but the differences are small.

**Reasons:**

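- **Only Moderate Imbalance:** Football is the largest class (265 articles), but the other four classes each have between 100 and 147 articles, so every class has enough examples to learn from and the model is not strongly biased toward any one category.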

- **Distinct Categories:** The classes (athletics, cricket, football, rugby, tennis) are different from each other, making it easier for the model to tell them apart.

- **Low Ambiguity:** Sports categories are clearer and less confusing than emotions, so the model performs more consistently across the classes.

- **High Precision and Recall:** The model has high precision and recall for all classes, meaning it does a good job of identifying each category.

