# NLP Challenge: Twitter Sentiment Analysis

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# from gensim.models import Word2Vec
# from gensim.test.utils import datapath
# from gensim import utils
# import gensim
import multiprocessing
cores = multiprocessing.cpu_count()
import os
from time import time
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader

In [2]:
# Import the machine learning model of your choice
from sklearn.linear_model import LogisticRegression
# Example: from sklearn.naive_bayes import MultinomialNB

In [None]:
# To notify when cell is complete
!pip install git+https://github.com/cphyc/jupyter-notify.git
%reload_ext jupyternotify

# Step 1: Load the Sentiment140 dataset

In [4]:
# Download the dataset from Kaggle and specify the file path.
# Column names are passed explicitly, so pandas treats the first line of the
# file as data rather than as a header.
df = pd.read_csv(
    '../training.1600000.processed.noemoticon.csv',
    names=['target', 'id', 'date', 'flag', 'user', 'tweet'],
)
df.head()

Unnamed: 0,target,id,date,flag,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# Step 2: Data Preprocessing

In [5]:
# Clean the text data, remove special characters, handle missing values, etc.
# First check: count missing values per column (all zeros means nothing to impute or drop).
df.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
tweet     0
dtype: int64

In [6]:
# Number of rows that are exact duplicates of an earlier row across all columns.
df.duplicated().sum()

0

In [None]:
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. Every token that starts with '@' and
    is longer than one character becomes '@user'; every token that starts
    with 'http' becomes 'http'. All other tokens are kept unchanged, and the
    tokens are re-joined with single spaces, so the original spacing survives.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with mentions and links replaced by placeholders.
    """
    def _mask(token):
        # A lone '@' is not a mention, hence the length check.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))
# Store the masked text next to the raw tweet; the function is passed directly,
# no wrapping lambda is needed.
df['processed'] = df['tweet'].map(preprocess)

In [8]:
# Keep only the label, the tweet id and the masked text.
# errors='ignore' makes the cell safe to re-run: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=['date', 'flag', 'user', 'tweet'], errors='ignore')

In [10]:
# Inspect the last rows to confirm which columns remain after the drop.
df.tail()

Unnamed: 0,target,id,processed
1599995,4,2193601966,Just woke up. Having no school is the best fee...
1599996,4,2193601969,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,happy #charitytuesday @user @user @user


# Pulling BERT from huggingface
tokenizer = AutoTokenizer.from_pretrained("activebus/BERT_Review")
model = AutoModel.from_pretrained("activebus/BERT_Review")

# Step 3: Feature Extraction

In [12]:
# Pulling BERT from huggingface
# One constant for the checkpoint so the tokenizer and the model cannot drift apart.
BERT_CHECKPOINT = "activebus/BERT_Review"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
# AutoModel loads the bare encoder. The "weights ... were not used" notice printed
# by this cell lists only `cls.predictions.*` entries, which this notebook never calls.
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

Some weights of the model checkpoint at activebus/BERT_Review were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Function to vectorize text with BERT
def get_embedding(text):
    """Return one fixed-size vector for a tweet by mean-pooling BERT token states.

    The text is run through `preprocess` first, so raw tweets can be passed
    directly (re-applying it to already masked text leaves the text unchanged).

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    numpy.ndarray
        1-D array: the mean over the token axis of the model's first output
        for this single input (presumably `last_hidden_state` of the bare
        encoder - confirm against the model class in use).
    """
    text = preprocess(text)
    # Truncate to the encoder's position limit so an unusually long input
    # cannot crash the forward pass.
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    # Inference only: skip autograd bookkeeping to save memory and time on
    # every one of the per-row calls.
    with torch.no_grad():
        features = model(**encoded_input)
    # features[0] is (batch, tokens, hidden); batch size is 1 here.
    token_states = features[0][0].numpy()
    return np.mean(token_states, axis=0)

In [None]:
# Applying function to dataframe:
# one model forward pass per row, stored as a numpy vector in a new column.
df['embeddings'] = df['processed'].map(get_embedding)
df.head(2)

In [None]:
# Stack the per-row embedding vectors into one 2-D float64 array (rows x dimensions)
vectors = np.array(df['embeddings'].tolist(), dtype=np.float64)
# One dataframe column per embedding dimension, named col1..colN
v_df = pd.DataFrame(
    vectors,
    columns=[f'col{dim}' for dim in range(1, vectors.shape[1] + 1)],
)
v_df.head(2)

In [None]:
# Joining the dataframes
# concat along axis=1 aligns rows by index label. `df` comes straight from
# read_csv (no rows removed since) and `v_df` was just built from its rows, so
# both carry the same default 0..n-1 index and line up one-to-one.
combined = pd.concat([df, v_df], axis=1)
combined.head(1)

In [None]:
# Features: everything except the label, the tweet id and the raw 'embeddings'
# column. 'embeddings' holds one numpy array per row (already expanded into the
# col1..colN columns); left in place it reaches the classifier as an object
# column that scikit-learn cannot convert to numbers.
# 'processed' is kept for now; it is removed after the train/test split.
X = combined.drop(columns=['target', 'id', 'embeddings'])
y = combined['target']

In [1]:
# Choose a feature extraction method (e.g., TF-IDF, Gensim, or a pretrained language model) and transform the text data into numerical features.
# tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
# X = tfidf_vectorizer.fit_transform(df['text'])
# y = df['target']
# YOUR CODE HERE!

# Step 4: Model Selection and Training

In [1]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Keep a copy of the test rows with the text column and the label side by side
# for later use. axis=1 adds y_test as a 'target' column; the default axis=0
# would append the labels as extra rows instead.
Ant_test = pd.concat([X_test, y_test], axis=1)
# The text column is not a numeric feature, so remove it before fitting.
X_train = X_train.drop(columns=['processed'])
X_test = X_test.drop(columns=['processed'])

In [None]:
# Choose a machine learning model (e.g., Logistic Regression) and train it.
# fit() returns the estimator itself, so construction and training can be chained.
Regression = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Show the fitted estimator as the cell output
Regression

In [None]:
# Mean accuracy on the training data and on the held-out data.
# (The original labels carried a stray ", " before the value.)
print(f'Train score: {Regression.score(X_train, y_train)}')
print(f'Test score: {Regression.score(X_test, y_test)}')

# Step 5: Sentiment Analysis

In [1]:
# Perform sentiment analysis on the dataset using your trained model
y_pred = Regression.predict(X_test)
# Per-class precision / recall / F1 for the held-out tweets
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Selecting only tweets containing 'Anthropic' (case-insensitive; missing text counts as no match).
# NOTE(review): this is a substring match, so words such as "philanthropic" are selected too.
anthropic = (
    Ant_test
    .loc[Ant_test['processed'].str.contains('anthropic', case=False, na=False)]
    .reset_index(drop=True)
)
anthropic.tail(3)

Using trained model to predict Anthropic tweets:

In [None]:
# Ant_test already carries the BERT feature columns for every test tweet, so the
# embeddings do not have to be recomputed for the filtered subset.
# Set X and y: drop the label and the text (and the raw 'embeddings' column when
# it is present) so that only the numeric feature columns reach the model.
a_X = anthropic.drop(columns=['target', 'processed', 'embeddings'], errors='ignore')
# The label comes from the filtered frame itself; `a_combined` was never defined.
a_y = anthropic['target']
# Make prediction with model
a_y_pred = Regression.predict(a_X)
print(classification_report(a_y, a_y_pred))

# Step 6: Visualizations

In [2]:
# Select an AI company or product of your choice and collect tweets related to it
# Use your trained model to predict sentiment on these tweets
# Create visualizations to showcase sentiment (e.g., bar charts, word clouds)

# Example: 
# - Visualize sentiment distribution using seaborn or matplotlib.
# - Create word clouds for positive and negative tweets.
# - Generate a bar chart showing sentiment scores for the chosen company/product.

# Additional Tips:
# - Experiment with hyperparameter tuning to improve model performance.
# - Use cross-validation for a more robust evaluation.
# - Write functions to encapsulate repetitive tasks and improve code organization.

sns.set(rc = {'figure.figsize':(6,3)})
# discrete=True gives each integer class label its own bar instead of binning
# the labels as if they were a continuous variable.
ax = sns.histplot(x=a_y, discrete=True)
ax.set_title('Actual labels - Anthropic tweets')
ax.set_xlabel('target')
# plt.savefig('Histogram of sentiment on Anthropic.png')
plt.show()

In [None]:
sns.set(rc = {'figure.figsize':(6,3)})
# discrete=True gives each integer class label its own bar instead of binning
# the labels as if they were a continuous variable.
ax = sns.histplot(x=a_y_pred, discrete=True)
ax.set_title('Predicted labels - Anthropic tweets')
ax.set_xlabel('predicted target')
plt.show()

In [None]:
sns.set(rc = {'figure.figsize':(6,3)})
# discrete=True gives each integer class label its own bar instead of binning
# the labels as if they were a continuous variable.
ax = sns.histplot(x=y_test, discrete=True)
ax.set_title('Label distribution - test set')
ax.set_xlabel('target')
plt.show()

In [None]:
sns.set(rc = {'figure.figsize':(6,3)})
# discrete=True gives each integer class label its own bar instead of binning
# the labels as if they were a continuous variable.
ax = sns.histplot(x=y_train, discrete=True)
ax.set_title('Label distribution - training set')
ax.set_xlabel('target')
plt.show()

In [None]:
sns.set(rc = {'figure.figsize':(6,3)})
# discrete=True gives each integer class label its own bar instead of binning
# the labels as if they were a continuous variable.
ax = sns.histplot(x=y, discrete=True)
ax.set_title('Label distribution - full dataset')
ax.set_xlabel('target')
plt.show()

# Evaluation

In [1]:
# Evaluate your model's performance using metrics like accuracy, precision, recall, and F1-score.

print(f'accuracy: {accuracy_score(y_test, y_pred)}')
# The report and the matrix span several lines; start each on its own line so
# the label does not push the first row out of alignment.
print(f' classification report:\n{classification_report(y_test, y_pred)}')
print(f' confusion matrix:\n{confusion_matrix(y_test, y_pred)}')

---