# NLP Final Project
Classifying tweets related to $AAPL as Bullish or Bearish
## Contents
1. Imports
2. Data Processing
3. Feature Extraction
4. Fine tuning
5. Specialised Model



# 1. Imports

In [None]:
# If running on Google Colab run here
# Tenserflow2.12 is required for the final pipeline implementation.
import os

IS_ON_COLAB = bool(os.getenv("COLAB_RELEASE_TAG"))

if IS_ON_COLAB:
    !pip install transformers==4.28.0
    !pip install tokenizers datasets sentencepiece huggingface_hub[cli] accelerate

pip install tensorflow==2.12

In [None]:
# Import Commands
# Standard Libraries
from time import time

# External Libraries
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import torch

# Transformers
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    logging,
)

# Datasets
from datasets import Dataset, DatasetDict

# `logging` is the transformers logging helper imported above; restrict it to
# error-level messages so library output does not flood the notebook.
logging.set_verbosity_error()

# 2. Data Processing
- Load dataset from GitHub Repo
- Clean dataset 

In [None]:
# This dataset was created by concatenating a Kaggle dataset which can be found here:
# https://www.kaggle.com/datasets/frankcaoyun/stocktwits-2020-2022-raw
# Set the AAPL_2021_CSV environment variable to the location of your copy of
# AAPL_2021.csv; the original author's local path is kept as the fallback so
# existing setups keep working.
AAPL_2021_CSV = os.getenv("AAPL_2021_CSV", "/Users/alecbennett/Downloads/AAPL_2021.csv")
AAPL_2021 = pd.read_csv(AAPL_2021_CSV)

In [None]:
# Define which columns are of interest: the message body, the 'entities'
# payload (parsed for the sentiment further down) and 'created_at'.
selected_columns = ['body', 'entities', 'created_at']

# Create a new DataFrame with only the selected columns.
# An explicit copy makes the later column assignment operate on an independent
# frame instead of a slice of the raw data (avoids SettingWithCopyWarning).
AAPL_2021 = AAPL_2021[selected_columns].copy()

In [None]:
# Define a function to clean up the sentiment values
def clean_sentiment(value):
    """Reduce a raw ``entities`` cell to ``'Bullish'``, ``'Bearish'`` or ``'N/A'``.

    Args:
        value: the cell content, expected to be the string form of a Python
            dict with a ``'sentiment'`` key whose value is either ``None`` or
            a mapping containing ``'basic'``.

    Returns:
        ``'Bullish'`` or ``'Bearish'`` when ``sentiment['basic']`` holds exactly
        that value; ``'N/A'`` for a missing sentiment, any other value, or any
        cell that cannot be parsed into the expected structure.
    """
    import ast  # local import: only this helper needs it

    try:
        # literal_eval accepts Python literals only, so code embedded in the
        # CSV is rejected instead of being executed as it would be by eval().
        value_dict = ast.literal_eval(value)
        sentiment = value_dict['sentiment']
        if sentiment is None:
            return 'N/A'
        if 'basic' in sentiment:
            basic = sentiment['basic']
            if basic == 'Bullish':
                return 'Bullish'
            if basic == 'Bearish':
                return 'Bearish'
        return 'N/A'
    except (SyntaxError, ValueError, KeyError, TypeError):
        # ValueError: not a literal (including NaN cells passed as floats);
        # TypeError: parsed value does not have the expected dict structure.
        return 'N/A'

# Replace every raw 'entities' payload with its cleaned sentiment value
AAPL_2021['entities'] = AAPL_2021['entities'].map(clean_sentiment)

In [None]:
# Rename the columns to the names the rest of the notebook works with:
# 'body' becomes 'text' and 'entities' becomes 'label'.
new_column_names = dict(body='text', entities='label')
AAPL_2021.rename(columns=new_column_names, inplace=True)

In [None]:
# Limiting to only the labelled data
label_AAPL_2021 = AAPL_2021[AAPL_2021['label'].isin(['Bullish', 'Bearish'])].copy()

# Encode the classes as integers: Bullish -> 1, Bearish -> 0.
# `map` with an explicit integer dtype is used instead of `replace`, which
# relied on pandas silently downcasting the object column (deprecated behaviour).
label_AAPL_2021['label'] = (
    label_AAPL_2021['label'].map({'Bullish': 1, 'Bearish': 0}).astype('int64')
)

# 3. Feature Extraction
1. By making use of the user initiated sentiment data as labels we can create a fully labelled dataset
2. Tokenise the dataset
3. Extract the last hidden state
4. Run a logistic regression model to create predictions on the test dataset
5. Assess the quality of the model

If you would like to use the entire dataset, do not run the cell below, which limits it to 4000 data points. If you use the entire dataset, your results may differ from the ones in the Results.md file.

In [None]:
# Processing power is limited, so the model is trained on a sample that holds
# the same number of Bullish and Bearish rows.
size = 2000  # rows drawn from each class

is_class_0 = label_AAPL_2021['label'] == 0
is_class_1 = label_AAPL_2021['label'] == 1
class_0_df = label_AAPL_2021.loc[is_class_0]
class_1_df = label_AAPL_2021.loc[is_class_1]

# Fixed random_state keeps the draw reproducible
sampled_class_0 = class_0_df.sample(n=size, random_state=42)
sampled_class_1 = class_1_df.sample(n=size, random_state=42)

# Stack the two samples into one balanced frame with a fresh index,
# then display the class counts as the cell output
label_AAPL_2021 = pd.concat([sampled_class_0, sampled_class_1], ignore_index=True)
label_AAPL_2021['label'].value_counts()


In [None]:
# Hold out 20% of the rows as the test split (fixed seed for reproducibility)
train, test = train_test_split(label_AAPL_2021, test_size=0.2, random_state=42)

# Give both frames a clean 0..n-1 index before converting them
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# Wrap each frame as a Dataset and bundle the two under their split names
train_AAPL = Dataset.from_pandas(train)
test_AAPL = Dataset.from_pandas(test)
label_AAPL_2021 = DatasetDict({'train': train_AAPL, 'test': test_AAPL})

In [None]:
# Prefer the GPU when one is present, otherwise fall back to the CPU
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

if not gpu_available:
    print('No GPU available, using the CPU instead.')
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

In [None]:
# `model` is only created further down the notebook, so on a fresh kernel there
# is nothing to move yet. Guard the lookup so that Restart & Run All does not
# stop here with a NameError on GPU machines; re-run this cell after the model
# has been loaded to place it on the GPU.
if torch.cuda.is_available() and "model" in globals():
    # Move the model to the GPU
    model = model.to("cuda")

In [None]:
# Tokenise the entire dataset
max_length = 322
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenise ``batch["text"]``, truncating and padding every row to ``max_length`` tokens.

    Args:
        batch: mapping with a "text" entry holding the texts of one batch.

    Returns:
        The tokenizer output for the batch.
    """
    # max_length has to be passed explicitly: with padding='max_length' alone
    # the tokenizer pads to the model's maximum input length and the limit
    # defined above is never applied.
    return tokenizer(
        batch["text"],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )


# Use the map function to tokenize the 'text' column
AAPL_2021_encoded = label_AAPL_2021.map(tokenize, batched=True)

In [None]:
# Load the pretrained checkpoint named by `model_name` via AutoModel; the
# extraction function defined below reads `last_hidden_state` from its output.
model = AutoModel.from_pretrained(model_name)

### Extracting the entire Last Hidden State

In [None]:
def extract_states(batch, model):
    # Placeholder only: the next cell defines extract_states again under the
    # same name (note its `noqa: F811`), and that later definition is the one
    # in effect when the notebook is run top to bottom.
    pass

In [None]:
def extract_states(batch, model):  # noqa: F811
    """Attach the attention-masked mean of the last hidden state to a batch.

    Args:
        batch: mapping with "input_ids" and "attention_mask", each a
            rectangular (batch_size, n_tokens) nested sequence of ints.
        model: callable taking ``(input_ids, attention_mask)`` tensors and
            returning an object with a ``last_hidden_state`` tensor of shape
            (batch_size, n_tokens, hidden_dim).

    Returns:
        The same ``batch`` object with an added "hidden_state" entry: a NumPy
        array of shape (batch_size, hidden_dim) holding, per row, the mean of
        the hidden vectors at positions where the attention mask is non-zero.
    """
    # Run the forward pass on whichever device the model lives on; building the
    # inputs on the CPU while the model sits on the GPU raises a device mismatch.
    parameters = getattr(model, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    input_ids = torch.tensor(batch["input_ids"]).to(model_device)
    attention_mask = torch.tensor(batch["attention_mask"]).to(model_device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)
        # Bring the result back to host memory for the NumPy post-processing
        lhs = output.last_hidden_state.cpu().numpy()

    valid = np.array(batch["attention_mask"]).astype(bool)

    # Expand the (batch, tokens) mask to the full (batch, tokens, hidden) shape
    batch_size, n_tokens, hidden_dim = lhs.shape
    valid = valid.reshape(batch_size, n_tokens, 1).repeat(hidden_dim, axis=-1)

    # Masked entries (padding) are excluded from the mean over the token axis
    masked_mean = np.ma.array(lhs, mask=~valid).mean(axis=1).data

    batch["hidden_state"] = masked_mean
    return batch

In [None]:
# Run the extractor over every split, 256 rows at a time, handing the loaded
# model to it through fn_kwargs
map_options = {
    "batched": True,
    "batch_size": 256,
    "fn_kwargs": {"model": model},
}
last_states = AAPL_2021_encoded.map(extract_states, **map_options)

### Use the LHS in sklearn
1. Split into arrays
2. Run a logit regression including penalties
3. Measure performance using `sklearn.metrics`


In [None]:
# Pull the features (X) and labels (y) of each split out as NumPy arrays
train_split = last_states["train"]
test_split = last_states["test"]

X_train, y_train = np.array(train_split["hidden_state"]), np.array(train_split["label"])
X_test, y_test = np.array(test_split["hidden_state"]), np.array(test_split["label"])

In [None]:
# Running with different hyperparameters to find the model with the best performance
# NOTE(review): candidates are scored on the test split, so the winning accuracy
# is an optimistic estimate; a validation split or cross-validation on the
# training data would give an unbiased choice.
# The unpenalised model is requested with None: the string 'none' was deprecated
# in scikit-learn 1.2 and is rejected from 1.4 onwards.
penalties = ['l2', None]
C_values = [0.1, 1.0, 10]
best_accuracy = 0.0
best_penalty = None
best_C = None

for penalty in penalties:
    # C has no effect without a penalty, so the unpenalised model is fitted
    # once (with the default C) instead of once per C value.
    candidate_Cs = C_values if penalty is not None else [1.0]
    for C in candidate_Cs:
        # Create and train the logistic regression model with the current penalty and C value
        logit = LogisticRegression(penalty=penalty, C=C, random_state=42)
        logit.fit(X_train, y_train)

        # Evaluate the model's accuracy on the test set
        accuracy = logit.score(X_test, y_test)

        # Keep the first combination that reaches the highest accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_penalty = penalty
            best_C = C

print(f"Best Penalty: {best_penalty}")
print(f"Best C: {best_C}")
print(f"Highest Accuracy: {best_accuracy}")

In [None]:
# Fit the classifier used for the reported metrics (L2 penalty, C=1);
# fit() returns the estimator itself, so construction and fitting are chained
logit = LogisticRegression(C=1, penalty='l2').fit(X_train, y_train)

# Accuracy Score
acc = logit.score(X_test, y_test)
print(f"Accuracy Score: {acc}")

# Keep the test-set predictions for the metric cells that follow
y_pred = logit.predict(X_test)

In [None]:
# F1 score
# average=None reports one F1 value per class instead of a single aggregate
f1_score(y_test, y_pred, average=None)

In [None]:
# Using a classification report to assess performance of the feature extraction
# The labels were encoded as Bearish -> 0 and Bullish -> 1, and target_names
# are matched positionally to `labels`, so Bearish has to be listed first
# (the previous order swapped the two class names in the report).
report = classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=['Bearish', 'Bullish'],
)
print(report)

In [None]:
# Confusion matrix to inspect false positives and false negatives
# Rows are the true classes and columns the predicted classes, both ordered by
# `labels`; normalize="true" scales each row to sum to 1.
confusion = confusion_matrix(y_test, y_pred, labels=[0, 1], normalize="true")

# Label 0 is Bearish and label 1 is Bullish, so the names follow that order
# (the previous order attached each name to the wrong row and column).
class_names = ['Bearish', 'Bullish']
confusion = pd.DataFrame(confusion, columns=class_names, index=class_names)
confusion

# 4. Fine Tuning DistilBERT
In order to finetune a pretrained model to the classification problem here, the below steps are taken.
1. Label the data with binary classification
2. Split the data into train, validation and test sets
3. Tokenise the data
4. Train the model using trainer API
5. Test the quality of the model
6. Analyse new data with the model

In [None]:
# Limiting to only the labelled data
label_AAPL_2021 = AAPL_2021[AAPL_2021['label'].isin(['Bullish', 'Bearish'])].copy()

# Encode the classes as integers: Bullish -> 1, Bearish -> 0.
# `map` with an explicit integer dtype is used instead of `replace`, which
# relied on pandas silently downcasting the object column (deprecated behaviour).
label_AAPL_2021['label'] = (
    label_AAPL_2021['label'].map({'Bullish': 1, 'Bearish': 0}).astype('int64')
)

In [None]:
# Do not run if you want to use entire dataset
# Processing power is limited, so the model is trained on a sample that holds
# the same number of Bullish and Bearish rows.
size = 2000  # rows drawn from each class

labels_column = label_AAPL_2021['label']
class_0_df = label_AAPL_2021.loc[labels_column == 0]
class_1_df = label_AAPL_2021.loc[labels_column == 1]

# Fixed random_state keeps the draw reproducible
sampled_class_0 = class_0_df.sample(n=size, random_state=42)
sampled_class_1 = class_1_df.sample(n=size, random_state=42)

# Stack the two samples into one balanced frame with a fresh index,
# then display the class counts as the cell output
label_AAPL_2021 = pd.concat([sampled_class_0, sampled_class_1], ignore_index=True)
label_AAPL_2021['label'].value_counts()

In [None]:
# 80% of the rows go to training; the remaining 20% is halved into the
# validation and final test splits (fixed seed for reproducibility)
train, temp = train_test_split(label_AAPL_2021, test_size=0.2, random_state=42)
validation, test = train_test_split(temp, test_size=0.5, random_state=42)

# Give every frame a clean 0..n-1 index
train = train.reset_index(drop=True)
validation = validation.reset_index(drop=True)
test = test.reset_index(drop=True)

# Wrap each frame as a Dataset
train_AAPL = Dataset.from_pandas(train)
validation_AAPL = Dataset.from_pandas(validation)
test_AAPL = Dataset.from_pandas(test)

# Bundle the three splits under their names
splits = {'train': train_AAPL, 'validation': validation_AAPL, 'test': test_AAPL}
label_AAPL_2021 = DatasetDict(splits)

In [None]:
# Checkpoint shared by the tokenizer and the classification model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Two output classes: bullish and bearish
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

In [None]:
# Tokenise
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenise ``batch["text"]`` with truncation, padding to the longest row of the batch."""
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding=True)


# Tokenize the 'text' column of every split; batch_size=None hands each split
# to `tokenize` as a single batch
AAPL_2021_encoded = label_AAPL_2021.map(tokenize, batch_size=None, batched=True)

# Expose only the columns needed for training, as torch tensors
AAPL_2021_encoded.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
AAPL_2021_encoded

In [None]:
# Use the first GPU when available; None leaves the model where it was loaded
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = None

num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model = model.to(device)

In [None]:
def compute_metrics(pred):
    """Return accuracy and weighted F1 for a ``(logits, labels)`` pair.

    The predicted class of each row is the argmax over the last axis of the logits.
    """
    scores, references = pred
    predicted = scores.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted, average="weighted"),
    }

In [None]:
batch_size = 32
# Number of whole batches in the training split, i.e. log about once per epoch
logging_steps = len(AAPL_2021_encoded["train"]) // batch_size

training_args = TrainingArguments(
    output_dir="results",
    # Optimisation
    optim="adamw_torch",
    num_train_epochs=4,
    per_device_train_batch_size=batch_size,
    # Evaluate and checkpoint after every epoch, then restore the checkpoint
    # with the best F1
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Reporting
    logging_steps=logging_steps,
    disable_tqdm=False,
)

In [None]:
# Train on the 'train' split and evaluate on the 'validation' split,
# scoring with compute_metrics
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=AAPL_2021_encoded["train"],
    eval_dataset=AAPL_2021_encoded["validation"],
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

In [None]:
# Implement the trainer API
# Output contains metrics to measure quality: the Trainer above was configured
# with compute_metrics, the validation split as eval_dataset and
# evaluation_strategy="epoch".
trainer.train()

In [None]:
# Feeding the fine-tuned model new unseen data
custom_text = "I feel very confident now and can't wait to buy more"

# Switch to inference mode (disables dropout) and place the input on the
# device the model is actually on, which is not necessarily `device`.
model.eval()
input_tensor = tokenizer.encode(custom_text, return_tensors="pt").to(model.device)
with torch.no_grad():
    logits = model(input_tensor).logits.cpu()
probs = scipy.special.softmax(logits.flatten().numpy())

# Class index 0 is Bearish and index 1 is Bullish (labels were encoded as
# Bearish -> 0, Bullish -> 1), so the bars are named in that order; the
# previous order attached each probability to the wrong class.
labels = ["Bearish", "Bullish"]
pd.Series(probs, index=labels).plot.barh()

# 5. Using a finetuned model Stock-Sentiment-Bert
Applying a model that was fine-tuned for a similar use case gives a benchmark of how well a different model applies to the data here.
1. Setting up the Pipeline
2. Implementing the Classifier on the Test dataset
3. Testing the quality of the model



In [None]:
# Using a pretrained model uploaded to HuggingFace which was pretrained for a similar use case
# Ran successfully on 2023-09-09; however, it may have been deprecated, as there were some issues with it on 2023-09-10
pipeline_device = 'cuda:0' if torch.cuda.is_available() else None
classifier = pipeline(
    task="text-classification",
    model='tarnformnet/Stock-Sentiment-Bert',
    device=pipeline_device,
)

In [None]:
# Convert the 'test' split of the HuggingFace DatasetDict into a pandas
# DataFrame so that the classifier results can be added as new columns
test_AAPL_2021 = label_AAPL_2021['test'].to_pandas()

In [None]:
# Running the classifier
def get_sentiment(text):
    """Classify one text and return the ``(label, score)`` of the first result."""
    top_result = classifier(text)[0]
    return top_result['label'], top_result['score']

# Classify every text in the dataframe, one at a time
predictions = [get_sentiment(text) for text in test_AAPL_2021['text']]

# Separate the (label, score) pairs into two parallel lists
sentiment_labels = [predicted_label for predicted_label, _ in predictions]
sentiment_scores = [predicted_score for _, predicted_score in predictions]

# Store both lists as new columns of the dataframe
test_AAPL_2021['sentiment_label'] = sentiment_labels
test_AAPL_2021['sentiment_score'] = sentiment_scores

In [None]:
# Convert the integer labels back into strings (1 -> 'Bullish', 0 -> 'Bearish')
# so that they can be compared to the model predictions
# NOTE(review): this assumes the classifier emits exactly 'Bullish'/'Bearish'
# as its label strings; confirm against the model's label configuration.
test_AAPL_2021['label']=test_AAPL_2021['label'].replace({1:'Bullish',0:'Bearish'})

In [None]:
# Accuracy and weighted F1 of the pipeline predictions against the true
# labels, for comparison with the previous models
true_labels = test_AAPL_2021['label']
predicted_labels = test_AAPL_2021['sentiment_label']

accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'F1-score: {f1}')

In [None]:
# Count the rows per predicted sentiment and per true label
sentiment_counts = test_AAPL_2021['sentiment_label'].value_counts()
print(sentiment_counts)
label_counts = test_AAPL_2021['label'].value_counts()
print(label_counts)

# Pie chart of the model's predictions
fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
sentiment_counts.plot.pie(ax=ax, autopct='%1.1f%%', startangle=270, fontsize=12, label="")
ax.set_title("Model Classification")

# Pie chart of the true labels
fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
label_counts.plot.pie(ax=ax, autopct='%1.1f%%', startangle=270, fontsize=12, label="")
ax.set_title("True Classification")