In [2]:
import pandas as pd
import re

In [4]:
# Load the tweet-emotion dataset into a DataFrame.
# NOTE(review): absolute path, presumably the Colab working directory; the
# cell will not find the file when run from any other location.
data = pd.read_csv('/content/tweet_emotions.csv')

In [5]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [6]:
data.tail()

Unnamed: 0,tweet_id,sentiment,content
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...
39999,1753919049,love,@mopedronin bullet train from tokyo the gf ...


In [7]:
data.shape

(40000, 3)

In [8]:
# Call the method: the bare attribute only echoes the bound method's repr
# instead of printing the column/dtype/non-null summary.
data.info()

In [9]:
data.isnull().sum()

tweet_id     0
sentiment    0
content      0
dtype: int64

In [10]:
data.describe(include = 'all')

Unnamed: 0,tweet_id,sentiment,content
count,40000.0,40000,40000
unique,,13,39827
top,,neutral,I just received a mothers day card from my lov...
freq,,8638,14
mean,1845184000.0,,
std,118857900.0,,
min,1693956000.0,,
25%,1751431000.0,,
50%,1855443000.0,,
75%,1962781000.0,,


In [11]:
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from concurrent.futures import ProcessPoolExecutor
import torch

In [12]:
# Subset of five emotion labels.
# NOTE(review): this name is not read by any later cell in the visible notebook.
emotions = ['worry', 'happiness', 'love', 'sadness', 'hate']

In [13]:
# NOTE(review): this value is never read; `sentiments` is reassigned to a list
# of emotion labels in a later cell before its first use.
sentiments = ['positive', 'negative']

In [14]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder.
# Both are notebook-level globals that get_bert_embeddings reads directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [15]:
def get_bert_embeddings(text):
    """Return a mean-pooled BERT embedding for ``text`` as a flat NumPy array.

    The text is tokenized with the notebook-level ``tokenizer`` (truncating
    over-long input), passed through the notebook-level ``model``, and the
    last hidden state is averaged over the token dimension.

    Parameters
    ----------
    text : str
        The text to embed.

    Returns
    -------
    numpy.ndarray
        The pooled hidden state, flattened to one dimension.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Inference only: without no_grad() every call records an autograd graph,
    # which costs time and memory when this runs once per tweet.
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # average over tokens
    return embeddings.numpy().flatten()

In [16]:
def clean_text(text):
    """Strip URLs and non-letter characters from ``text`` and tidy whitespace.

    Steps, in order:
      1. drop anything that starts with ``http`` up to the next whitespace;
      2. delete every character that is neither an ASCII letter nor whitespace
         (nothing is inserted in its place);
      3. collapse runs of whitespace to single spaces and trim both ends.

    Returns the cleaned string.
    """
    without_urls = re.sub(r'http\S+', '', text)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_urls)
    tokens = letters_only.split()
    return ' '.join(tokens)

In [17]:
# Store a cleaned copy of every tweet alongside the raw text.
# NOTE(review): clean_text deletes punctuation without inserting a space, so
# words separated only by punctuation are merged (the preview below shows
# "ceremony...gloomy" becoming "ceremonygloomy").
data['cleaned_content'] = data['content'].apply(clean_text)

In [18]:
data.head()

Unnamed: 0,tweet_id,sentiment,content,cleaned_content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...,tiffanylue i know i was listenin to bad habit ...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...,Layin n bed with a headache ughhhhwaitin on yo...
2,1956967696,sadness,Funeral ceremony...gloomy friday...,Funeral ceremonygloomy friday
3,1956967789,enthusiasm,wants to hang out with friends SOON!,wants to hang out with friends SOON
4,1956968416,neutral,@dannycastillo We want to trade with someone w...,dannycastillo We want to trade with someone wh...


In [None]:
# Embed every cleaned tweet. get_bert_embeddings is applied row by row, so this
# runs one BERT forward pass per tweet (40,000 rows according to data.shape).
data['bert_embeddings'] = data['cleaned_content'].apply(get_bert_embeddings)

In [None]:
data.head()

In [None]:
import numpy as np

In [None]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train_sentiment, y_test_sentiment = train_test_split(
    data['bert_embeddings'].tolist(),
    data['sentiment'].tolist(),
    random_state=42,
    test_size=0.2,
)
# Turn each label list into a single-column 2-D array, the layout the
# multi-output wrapper is fitted with.
y_train_sentiment = np.asarray(y_train_sentiment).reshape(-1, 1)
y_test_sentiment = np.asarray(y_test_sentiment).reshape(-1, 1)

In [None]:
# Linear-kernel SVM used as the base estimator.
svm_sentiment = SVC(kernel='linear')

# Wrap the SVM in a multi-output classifier; the labels were reshaped to a
# single column above, and the fitted SVM is later reached through
# classifier_sentiment.estimators_[0].
classifier_sentiment = MultiOutputClassifier(svm_sentiment)

# Fit on the training embeddings and their sentiment labels.
classifier_sentiment.fit(X_train, y_train_sentiment)

In [None]:
accuracy_sentiment = classifier_sentiment.score(X_test, y_test_sentiment)

In [None]:
print(accuracy_sentiment)

In [None]:
def preprocess_real_time_data(text):
    """Prepare one raw text for the classifier.

    Runs the same two steps used on the training column: ``clean_text`` first,
    then ``get_bert_embeddings`` on the cleaned string. Returns whatever
    ``get_bert_embeddings`` returns.
    """
    return get_bert_embeddings(clean_text(text))

In [None]:
real_time_text = "I'm really very sad"
preprocessed_data = preprocess_real_time_data(real_time_text)

In [None]:
# Predict for the single embedded text; [0] selects the row for that one sample.
# NOTE(review): that row appears to be a length-1 array rather than a bare
# string (a later cell strips brackets and quotes from its str()) - confirm.
predicted_sentiment = classifier_sentiment.predict([preprocessed_data])[0]

In [None]:
sentiments = ['worry', 'happiness', 'love', 'sadness', 'hate']

In [None]:
# Read the label straight from the prediction. The classifier is trained on
# every value of data['sentiment'] (13 distinct labels), so it can predict a
# label that is absent from the shorter `sentiments` list; looking the
# prediction up with sentiments.index(...) raised ValueError in that case.
predicted_sentiment_label = str(np.ravel(predicted_sentiment)[0])

# Position of the label in `sentiments`, or None when it is not one of them.
predicted_sentiment_index = (
    sentiments.index(predicted_sentiment_label)
    if predicted_sentiment_label in sentiments
    else None
)

print("Predicted Sentiment:", predicted_sentiment_label)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Emotion labels drawn as pie wedges (one colour per label, same order).
labels = ['worry', 'happiness', 'love', 'sadness', 'hate']

# Printed form of the prediction; kept for inspection only.
predicted_sentiment_str = str(predicted_sentiment)

# Take the label value itself rather than slicing characters off the array's
# printed form, which depends on how NumPy happens to format the array.
predicted_sentiment_label = str(np.ravel(predicted_sentiment)[0])

if predicted_sentiment_label in labels:
    # The predicted label gets the whole pie; every other wedge has size 0.
    sizes = [1 if label == predicted_sentiment_label else 0 for label in labels]
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=['orange', 'lightgreen', 'pink', 'lightblue', 'red'])
    plt.axis('equal')
    plt.title('Predicted Sentiment')
    plt.show()
else:
    print("Predicted sentiment label not found in the list of labels.")


In [None]:
# Define a function to estimate word-level sentiment percentages
def word_sentiment_percentages(model, text):
    """Estimate each word's share of influence on the model's decision.

    Leave-one-out perturbation: every whitespace-separated word is removed in
    turn, the text is re-embedded, and the absolute change in the softmax of
    ``model.decision_function`` (summed over classes) is taken as that word's
    importance. Importances are then scaled to percentages summing to 100.

    Parameters
    ----------
    model : estimator with a ``decision_function`` method
        The fitted classifier that scores an embedding.
    text : str
        The sentence to analyse.

    Returns
    -------
    dict
        ``{word: percentage}``. A word that occurs more than once has the
        importances of all its occurrences added together. Returns ``{}``
        when ``text`` contains no words.
    """
    def softmax(scores):
        # ravel() keeps this usable when decision_function yields one score.
        scores = np.ravel(scores).astype(float)
        exp_scores = np.exp(scores - np.max(scores))
        return exp_scores / exp_scores.sum(axis=0)

    def class_probabilities(sentence):
        # Clean before embedding, exactly as preprocess_real_time_data does,
        # so the classifier is scored on the kind of input it was trained on.
        embedding = get_bert_embeddings(clean_text(sentence))
        return softmax(model.decision_function([embedding])[0])

    words = text.split()
    if not words:
        return {}

    # Baseline probabilities for the unperturbed text
    baseline_probs = class_probabilities(text)

    importance_by_word = {}
    for position, word in enumerate(words):
        # Drop only this occurrence. str.replace(word, '') would also delete
        # the same characters wherever they appear inside other words.
        perturbed_text = ' '.join(words[:position] + words[position + 1:])
        perturbed_probs = class_probabilities(perturbed_text)

        change = float(np.abs(baseline_probs - perturbed_probs).sum())
        importance_by_word[word] = importance_by_word.get(word, 0.0) + change

    total_importance = sum(importance_by_word.values())
    if total_importance == 0:
        # No removal changed the probabilities; share evenly instead of
        # dividing by zero.
        even_share = 100.0 / len(importance_by_word)
        return {word: even_share for word in importance_by_word}

    # Normalize so the percentages sum to 100
    return {
        word: (importance / total_importance) * 100
        for word, importance in importance_by_word.items()
    }

# Inputs used here, all defined in earlier cells:
#   classifier_sentiment - the fitted MultiOutputClassifier
#   real_time_text       - the example sentence to analyse
# get_bert_embeddings is called inside word_sentiment_percentages.

# Calculate word-level sentiment percentages for the real-time text.
# estimators_[0] passes the single underlying estimator, because the function
# calls decision_function on its `model` argument.
sentiment_percentages = word_sentiment_percentages(classifier_sentiment.estimators_[0], real_time_text)

# Print one "Word: xx.xx%" line per word
print("Word-Level Sentiment Percentages:")
for word, percentage in sentiment_percentages.items():
    print(f"{word.capitalize()}: {percentage:.2f}%")


In [None]:
# Unpack the word -> percentage mapping into parallel lists for matplotlib
labels = [*sentiment_percentages]
sizes = [*sentiment_percentages.values()]

# Draw one wedge per word on a square figure
plt.figure(figsize=(5, 5))
plt.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Word-Level Sentiment Percentages')
plt.axis('equal')  # keep the pie circular
plt.show()