In [6]:
# Import libraries
import pandas as pd
import numpy as np
import re
import os
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from scipy.special import softmax
from bertopic import BERTopic

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the dataset from train.csv (path is relative to the working directory).
# Later cells read the tweet text from its 'tweet' column.
data = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the loaded data
data.head()

In [None]:
# Preprocessing the dataset

In [None]:
# Define preprocessing function for tweets

def preprocess_tweet(tweet):
    """Normalize the user mentions and links in a tweet.

    Every ``@handle`` is replaced with the literal ``@user`` and every run of
    non-whitespace characters that starts with ``http`` is replaced with the
    literal ``http``.

    Args:
        tweet: Raw tweet text. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as an empty tweet.

    Returns:
        str: The normalized tweet text ('' for a missing tweet).

    Raises:
        TypeError: If ``tweet`` is any other non-string value (raised by
            ``re.sub``, as before).
    """
    # A missing value would otherwise make re.sub raise TypeError and abort
    # the whole Series.apply() pass over the DataFrame.
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ''

    # Replace Twitter handles with '@user'
    tweet = re.sub(r'@\w+', '@user', tweet)

    # Replace URLs with 'http'
    tweet = re.sub(r'http\S+', 'http', tweet)
    return tweet

# Normalize every raw tweet and store the result next to the original text
cleaned_tweets = data['tweet'].apply(preprocess_tweet)
data['clean_tweet'] = cleaned_tweets
data.head()

In [None]:
# Load sentiment analysis model and tokenizer.
# Both are loaded from the same checkpoint name so they stay paired.
roberta = "cardiffnlp/twitter-roberta-base-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)

In [None]:
# Check if GPU is available and set the device accordingly
# (falls back to the CPU when CUDA is not available).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# get_sentiment_scores moves its tokenized inputs to this same device.
model = model.to(device)

In [None]:
# Apply BERTopic

# Set environment variable for tokenizers to avoid parallelism issues.
# This runs before the model is fitted below.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use 'clean_tweet' for BERTopic and sentiment analysis:
# the same cleaned text feeds the topic model here and the sentiment cell later.
docs = data['clean_tweet'].tolist()

# Create a BERTopic model that embeds documents with "all-MiniLM-L6-v2".
# NOTE(review): no random seed is set anywhere in the visible cells, so topic
# assignments presumably differ between runs — confirm, and consider passing
# a seeded dimensionality-reduction model if reproducibility matters.
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

# Fit the model on all documents. `topics` is later assigned as a DataFrame
# column, so it is expected to hold one topic id per document in `docs` order.
# `probs` is not used by any visible cell.
topics, probs = topic_model.fit_transform(docs)

# Explore the topics (this variable is not read again in the visible cells;
# later cells call get_topic_info() afresh).
topic_info = topic_model.get_topic_info()

# Add the topics to the DataFrame
data['topic'] = topics

In [None]:
# Function to get sentiment scores
def get_sentiment_scores(tweet):
    """Score one tweet with the module-level sentiment model.

    The text is tokenized (truncated to at most 512 tokens), moved to the
    module-level ``device``, and run through ``model`` with gradient tracking
    disabled.

    Args:
        tweet: Text to score, passed straight to ``tokenizer``.

    Returns:
        numpy.ndarray: Softmax of the first row of the model's first output.
        The caller labels the entries negative / neutral / positive —
        presumably the checkpoint's class order; confirm against its model card.
    """
    inputs = tokenizer(tweet, truncation=True, max_length=512, return_tensors='pt')
    # The model lives on `device`, so every input tensor has to follow it there.
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        first_output = model(**inputs)[0]

    # Only one tweet is scored per call, so take the single row of the batch.
    raw_scores = first_output[0].detach().cpu().numpy()
    return softmax(raw_scores)

# Score every cleaned tweet and spread the three scores across named columns
sentiment_columns = ['negative', 'neutral', 'positive']
score_frame = data['clean_tweet'].apply(
    lambda text: pd.Series(get_sentiment_scores(text))
)
data[sentiment_columns] = score_frame

In [None]:
# Save the DataFrame with topics and sentiment analysis to a new CSV file.
# index=False leaves the row index out of the file.
data.to_csv('tweet_with_topics_sentiment.csv', index=False)

In [None]:
# Exploratory Data Analysis

In [None]:
# Per-topic summary table for the fitted model
topic_model.get_topic_info()

In [None]:
# Show the outlier row (topic -1), if there is one, followed by the top 10 topics
topic_info_df = topic_model.get_topic_info()

# A -1 row only exists when some documents were left unassigned, so select it
# by id instead of assuming it occupies the first position; otherwise
# head(11) would silently return 11 real topics.
outlier_rows = topic_info_df['Topic'] == -1
top_10_topics = pd.concat([
    topic_info_df[outlier_rows],
    topic_info_df[~outlier_rows].head(10),
])

# Trailing expression -> rich DataFrame rendering instead of plain-text print()
top_10_topics

In [None]:
# Inspect the model's entry for topic id 1
topic_model.get_topic(1)

In [None]:
# Representative documents the model holds for topic id 1
topic_model.get_representative_docs(1)

In [None]:
# Pair each document with its assigned topic id for side-by-side inspection
df = pd.DataFrame({"topic": topics, "document": docs})
df

In [None]:
# Render the fitted model's topics visualization
topic_model.visualize_topics()

In [None]:
# Render the fitted model's bar-chart visualization
topic_model.visualize_barchart()

In [None]:
# Per-document table for the same `docs` list the model was fitted on
topic_model.get_document_info(docs)