In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from google.colab import files
import io

uploaded = files.upload()
for file_name in uploaded.keys():
    raw_data = pd.read_csv(io.BytesIO(uploaded[file_name]), delimiter=';')

    print(f"Data from file: {file_name}")
    print(raw_data.head())

Saving Raw Data.csv to Raw Data.csv
Data from file: Raw Data.csv
                                             Comment  Emotion
0  a boyfriend with whom i split up with came ove...    anger
1  a certain friend tried to push me off a seat i...    anger
2         a father of children killed in an accident  sadness
3                                   a few monthe ago    anger
4  a friend of mine suggested that i become a fil...      joy


In [3]:
data = pd.read_csv('/content/Raw Data.csv', delimiter=';')
data.head(3)

Unnamed: 0,Comment,Emotion
0,a boyfriend with whom i split up with came ove...,anger
1,a certain friend tried to push me off a seat i...,anger
2,a father of children killed in an accident,sadness


**EDA**

In [4]:
print(data['Emotion'].unique())

['anger' 'sadness' 'joy' 'fear' 'anxious' 'love']


**Preprocessing Text Data**

Cleaning the text data by removing unnecessary characters, converting to lowercase, and tokenizing.

In [5]:
import re

def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = text.lower()               # Convert to lowercase
    text = text.strip()               # Remove leading and trailing spaces
    return text

data['Comment'] = data['Comment'].apply(clean_text)

**Split the Data**


In [12]:
X = data['Comment']
y = data['Emotion']

# First, split into training and temp (which will be further split)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Now split the temp data into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.6, random_state=42)

print("Training data shape:", X_train.shape)
print("Validation data shape:", X_val.shape)
print("Test data shape:", X_test.shape)


Training data shape: (14000,)
Validation data shape: (2400,)
Test data shape: (3600,)


**Convert Text to Numeric Data**

Use TF-IDF (Term Frequency-Inverse Document Frequency)

In [13]:
vectorizer = TfidfVectorizer()

X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)
X_test_vectorized = vectorizer.transform(X_test)

**Train the Naive Bayes Model**

Initialize and fit the Multinomial Naive Bayes model.

In [14]:
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

**Evaluate and Predict the Model**

Validation Data

In [19]:
y_val_pred = model.predict(X_val_vectorized)

print(f'Validation Accuracy: {accuracy_score(y_val, y_val_pred)}')
print(classification_report(y_val, y_val_pred))

Validation Accuracy: 0.6391666666666667
              precision    recall  f1-score   support

       anger       0.94      0.20      0.32       332
     anxious       0.00      0.00      0.00        95
        fear       0.92      0.12      0.21       287
         joy       0.61      0.98      0.75       844
        love       1.00      0.04      0.07       183
     sadness       0.65      0.91      0.76       659

    accuracy                           0.64      2400
   macro avg       0.69      0.37      0.35      2400
weighted avg       0.71      0.64      0.55      2400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test Data

In [18]:
y_test_pred = model.predict(X_test_vectorized)

print(f'Test Accuracy: {accuracy_score(y_test, y_test_pred)}')
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.6397222222222222
              precision    recall  f1-score   support

       anger       0.96      0.14      0.25       462
     anxious       0.00      0.00      0.00       132
        fear       0.95      0.15      0.26       415
         joy       0.59      0.98      0.74      1213
        love       1.00      0.01      0.03       312
     sadness       0.68      0.92      0.78      1066

    accuracy                           0.64      3600
   macro avg       0.70      0.37      0.34      3600
weighted avg       0.72      0.64      0.54      3600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Predicting New Comments**

To predict the emotion of a new comment, preprocess it and use the model.

In [20]:
new_comment = "Whats wrong with you!"
cleaned_comment = clean_text(new_comment)
vectorized_comment = vectorizer.transform([cleaned_comment])
predicted_emotion = model.predict(vectorized_comment)

print(f'The predicted emotion is: {predicted_emotion[0]}')

The predicted emotion is: joy


**Exporting the model**

Save the Model and Vectorizer

After training your model, you can save both the model and the vectorizer to disk

In [42]:
import joblib

# Save the model
joblib.dump(model, 'emotion_classifier_model.pkl')

# Save the vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

**Analyze new dataset**

In [43]:
from google.colab import files
import io

uploaded = files.upload()
for file_name in uploaded.keys():
    test_data = pd.read_csv(io.BytesIO(uploaded[file_name]), delimiter=',')

Saving Test Data.csv to Test Data.csv


In [44]:
new_data = pd.read_csv('/content/Test Data.csv', delimiter=',')
new_data.head(3)

Unnamed: 0,Comment
0,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong


In [45]:
import joblib

# Load the model and vectorizer
model = joblib.load('emotion_classifier_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')


# Clean the comments in the new dataset
def clean_text(text):
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    text = text.strip()
    return text

new_data['Comment'] = new_data['Comment'].apply(clean_text)

# Transform the comments using the loaded vectorizer
new_comments_vectorized = vectorizer.transform(new_data['Comment'])

# Make predictions
predicted_emotions = model.predict(new_comments_vectorized)

# Get the probabilities for each class
predicted_probabilities = model.predict_proba(new_comments_vectorized)

# Combine comments with predicted emotions and probabilities
results = pd.DataFrame({
    'Comment': new_data['Comment'],
    'Predicted Emotion': predicted_emotions,
    'Probability': predicted_probabilities.max(axis=1)  # Get the highest probability for each prediction
})

print(results)


                                                Comment Predicted Emotion  \
0                               i didnt feel humiliated             anger   
1     i can go from feeling so hopeless to so damned...             anger   
2      im grabbing a minute to post i feel greedy wrong             anger   
3     i am ever feeling nostalgic about the fireplac...               joy   
4                                  i am feeling grouchy             anger   
...                                                 ...               ...   
1203  i write which is what i consider my real profe...               joy   
1204  i feel honored to be witness to another s process               joy   
1205  i had a horrible tragedy something that i was ...              fear   
1206  i feel stupid and incapable and i dont know wh...             anger   
1207  i am writing this at a time when i have also h...             anger   

      Probability  
0        0.426026  
1        0.342433  
2        0.6884

**Export the results**

In [46]:
# Save results to a CSV file
results.to_csv('predicted_emotions.csv', index=False)

# Download the CSV file
from google.colab import files
files.download('predicted_emotions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Using BERT modelling**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Step 3: Load and Prepare Your Data

# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(data['Comment'], data['Emotion'], test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 4: Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
def tokenize_data(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='tf')

train_encodings = tokenize_data(X_train)
val_encodings = tokenize_data(X_val)
test_encodings = tokenize_data(X_test)

# Step 5: Convert Labels to TensorFlow Format
label_map = {label: idx for idx, label in enumerate(y_train.unique())}
y_train_numeric = y_train.map(label_map)
y_val_numeric = y_val.map(label_map)
y_test_numeric = y_test.map(label_map)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train_numeric)).batch(16)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), y_val_numeric)).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test_numeric)).batch(16)

# Step 6: Model Training
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the model
model.fit(train_dataset, validation_data=val_dataset, epochs=3)

# Step 7: Evaluate the Model
y_pred = model.predict(test_dataset)
y_pred_labels = tf.argmax(y_pred.logits, axis=1)

# Calculate accuracy and classification report
print(f'Test Accuracy: {accuracy_score(y_test_numeric, y_pred_labels)}')
print(classification_report(y_test_numeric, y_pred_labels, target_names=label_map.keys()))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3