In [1]:
# !pip install datasets

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

In [3]:
# Load your financial sentiment dataset
stockemo_df = pd.read_csv("/content/processed_stockemo.csv")

In [4]:
stockemo_df

Unnamed: 0,id,date,ticker,senti_label,original,processed,industry
0,0,2020-08-31,AAPL,Bullish,$AAPL you better get back to $130 again 🤔,AAPL you better get back to again thinking face,Information Technology
1,1,2020-08-31,AAPL,Bullish,$AAPL now we just wait for power hour 😎💪🏾,AAPL now we just wait for power hour smiling f...,Information Technology
2,2,2020-08-31,AAPL,Bullish,$AAPL never thought I'd buy this 💰,AAPL never thought I d buy this money bag,Information Technology
3,3,2020-08-31,AAPL,Bullish,$AAPL bought my first option ever with Apple f...,AAPL bought my first option ever with Apple fo...,Information Technology
4,4,2020-08-31,AAPL,Bullish,"$AAPL Buy low, hold for another split in a cou...",AAPL Buy low hold for another split in a coupl...,Information Technology
...,...,...,...,...,...,...,...
50276,50276,2020-10-08,LOW,Bullish,$LOW Why isn’t this trending yet 🤔,LOW Why isn t this trending yet thinking face,Consumer Discretionary
50277,50277,2020-10-08,LOW,Bullish,$LOW Show us some strength and push over $170‼️,LOW Show us some strength and push over double...,Consumer Discretionary
50278,50278,2020-11-11,LOW,Bullish,$LOW retail investors will keep adding until ...,LOW retail investors will keep adding until ea...,Consumer Discretionary
50279,50279,2020-12-09,LOW,Bullish,$LOW in 10mins Squeeze Zone possible morning s...,LOW in mins Squeeze Zone possible morning spik...,Consumer Discretionary


In [5]:
# Map sentiment labels to numerical values (e.g., Bullish: 1, Bearish: 0)
sentiment_mapping = {"Bullish": 1, "Bearish": 0}
stockemo_df['senti_label'] = stockemo_df['senti_label'].map(sentiment_mapping)


In [6]:
stockemo_df

Unnamed: 0,id,date,ticker,senti_label,original,processed,industry
0,0,2020-08-31,AAPL,1,$AAPL you better get back to $130 again 🤔,AAPL you better get back to again thinking face,Information Technology
1,1,2020-08-31,AAPL,1,$AAPL now we just wait for power hour 😎💪🏾,AAPL now we just wait for power hour smiling f...,Information Technology
2,2,2020-08-31,AAPL,1,$AAPL never thought I'd buy this 💰,AAPL never thought I d buy this money bag,Information Technology
3,3,2020-08-31,AAPL,1,$AAPL bought my first option ever with Apple f...,AAPL bought my first option ever with Apple fo...,Information Technology
4,4,2020-08-31,AAPL,1,"$AAPL Buy low, hold for another split in a cou...",AAPL Buy low hold for another split in a coupl...,Information Technology
...,...,...,...,...,...,...,...
50276,50276,2020-10-08,LOW,1,$LOW Why isn’t this trending yet 🤔,LOW Why isn t this trending yet thinking face,Consumer Discretionary
50277,50277,2020-10-08,LOW,1,$LOW Show us some strength and push over $170‼️,LOW Show us some strength and push over double...,Consumer Discretionary
50278,50278,2020-11-11,LOW,1,$LOW retail investors will keep adding until ...,LOW retail investors will keep adding until ea...,Consumer Discretionary
50279,50279,2020-12-09,LOW,1,$LOW in 10mins Squeeze Zone possible morning s...,LOW in mins Squeeze Zone possible morning spik...,Consumer Discretionary


In [7]:
# Split the dataset into train and validation sets
train_df, val_df = train_test_split(stockemo_df, test_size=0.2, random_state=42)


In [8]:
train_df

Unnamed: 0,id,date,ticker,senti_label,original,processed,industry
36657,36657,2020-09-22,TSLA,1,$TSLA Turning into Utility Company as well..👍👍👍👍,TSLA Turning into Utility Company as well thum...,Consumer Discretionary
8891,8891,2020-08-31,AAPL,1,$AAPL I’m not buying more! I like my avg. if i...,AAPL I m not buying more I like my avg if it h...,Information Technology
41496,41496,2020-12-10,TSLA,0,$TSLA I might pick this one up around 90. Anot...,TSLA I might pick this one up around Another o...,Consumer Discretionary
8249,8249,2020-08-24,AAPL,1,$AAPL Up $1000 already on 4 contracts let’s ke...,AAPL Up already on contracts let s keep this c...,Information Technology
22089,22089,2020-01-30,PYPL,1,$PYPL That dip was pun intended 😄,PYPL That dip was pun intended grinning face w...,Information Technology
...,...,...,...,...,...,...,...
11284,11284,2020-02-05,BA,1,$BA 🤗🛫 DGAF buy time mofossssss certify and FL...,BA smiling face with open hands airplane depar...,Industrials
44732,44732,2020-01-23,TSLA,1,$TSLA I can feel and I can touch it and I know...,TSLA I can feel and I can touch it and I know ...,Consumer Discretionary
38158,38158,2020-10-08,TSLA,1,$TSLA so far futures looking good 🤘🏻💀🤘🏻,TSLA so far futures looking good sign of the h...,Consumer Discretionary
860,860,2020-09-14,AAPL,1,$AAPL Thank god I loaded up the calls last Fri...,AAPL Thank god I loaded up the calls last Frid...,Information Technology


In [9]:
val_df

Unnamed: 0,id,date,ticker,senti_label,original,processed,industry
8718,8718,2020-08-30,AAPL,1,$AAPL what is the opening price on Monday👁,AAPL what is the opening price on Monday eye,Information Technology
46385,46385,2020-02-06,TSLA,0,"$TSLA tomorrow will reach $1,000 🤪😜🤪😜😜😂😂😂😂😂",TSLA tomorrow will reach zany face winking fac...,Consumer Discretionary
36747,36747,2020-09-22,TSLA,1,$TSLA \n🚨 🚨🚨🚨🚨🚨🚨🚨\nAll you thought you can pul...,TSLA police car light police car light police ...,Consumer Discretionary
41288,41288,2020-12-09,TSLA,1,$TSLA at this point short sellers opinions ar...,TSLA at this point short sellers opinions are ...,Consumer Discretionary
3431,3431,2020-10-29,AAPL,0,$AAPL Gonna be such a big rug pull at earnings...,AAPL Gonna be such a big rug pull at earnings ...,Information Technology
...,...,...,...,...,...,...,...
21373,21373,2020-12-22,MSFT,1,$MSFT ...Currently has a Buy rating | ⭐ 4.0 (3...,MSFT Currently has a Buy rating star reviews u...,Information Technology
19902,19902,2020-08-11,JPM,1,$JPM all we need now is congress to agree on a...,JPM all we need now is congress to agree on a ...,Financials
35583,35583,2020-09-16,TSLA,1,$TSLA haha like always. As soon as u buy more ...,TSLA haha like always As soon as u buy more it...,Consumer Discretionary
18729,18729,2020-12-03,JNJ,1,$JNJ jnj is up next step aside moderna in a fe...,JNJ jnj is up next step aside moderna in a few...,Health Care


In [10]:
# Define a function to tokenize the text data using RoBERTa tokenizer
def tokenize_data(data):
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    return data.apply(lambda row: tokenizer(row["processed"], padding=True, truncation=True), axis=1)

train_tokenized = tokenize_data(train_df)
val_tokenized = tokenize_data(val_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
train_tokenized

36657    [input_ids, attention_mask]
8891     [input_ids, attention_mask]
41496    [input_ids, attention_mask]
8249     [input_ids, attention_mask]
22089    [input_ids, attention_mask]
                    ...             
11284    [input_ids, attention_mask]
44732    [input_ids, attention_mask]
38158    [input_ids, attention_mask]
860      [input_ids, attention_mask]
15795    [input_ids, attention_mask]
Length: 40224, dtype: object

In [12]:
val_tokenized

8718     [input_ids, attention_mask]
46385    [input_ids, attention_mask]
36747    [input_ids, attention_mask]
41288    [input_ids, attention_mask]
3431     [input_ids, attention_mask]
                    ...             
21373    [input_ids, attention_mask]
19902    [input_ids, attention_mask]
35583    [input_ids, attention_mask]
18729    [input_ids, attention_mask]
5579     [input_ids, attention_mask]
Length: 10057, dtype: object

In [13]:
# Convert the tokenized data to a dictionary with string values
train_dict = {k: [tuple(x)[i] for x in train_tokenized] for k, i in zip(["input_ids", "attention_mask"], [0, 1])}
val_dict = {k: [tuple(x)[i] for x in val_tokenized] for k, i in zip(["input_ids", "attention_mask"], [0, 1])}

# Add the sentiment labels to the dictionaries
train_dict["labels"] = train_df["senti_label"].tolist()
val_dict["labels"] = val_df["senti_label"].tolist()

# Convert the dictionaries to Dataset objects
train_dataset = Dataset.from_dict(train_dict)
val_dataset = Dataset.from_dict(val_dict)

In [14]:
# Define the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [15]:
# # Prepare the input data for the model
# def preprocess_function(examples):
#     # Tokenize the input text
#     inputs = tokenizer(examples["processed"], truncation=True, padding=True)

#     # Convert the input data to a dictionary
#     inputs["input_ids"] = inputs["input_ids"].pop("input_ids")
#     inputs["attention_mask"] = inputs["input_ids"].pop("attention_mask")
#     inputs["labels"] = examples["labels"]

#     return inputs

# train_dataset = train_dataset.map(preprocess_function, batched=True)
# val_dataset = val_dataset.map(preprocess_function, batched=True)

In [16]:
# !pip install transformers[torch]

In [17]:
# !pip install accelerate

In [18]:
# !pip install transformers

In [19]:
# !pip uninstall torch
# !pip uninstall transformers
!pip install torch
# !pip install transformers

[0m

In [20]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

In [21]:
# Instantiate the RoBERTa model
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [23]:
val_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10057
})

In [24]:
# Fine-tune the model
trainer.train()

ValueError: You have to specify either input_ids or inputs_embeds

In [None]:
# After trainer.train(), you can evaluate the fine-tuned model on the validation set
trainer.evaluate()

In [None]:
# Now, you can use this fine-tuned RoBERTa model to make predictions on new financial text data
# Define a function to predict sentiment using RoBERTa
def predict_sentiment(text):
    tokenizer = RobertaTokenizer.from_pretrained(model_path)
    model = RobertaForSequenceClassification.from_pretrained(model_path)
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    outputs = model(**inputs)
    predicted_probabilities = outputs.logits.softmax(dim=-1)
    predicted_class = predicted_probabilities.argmax().item()
    return predicted_class

In [None]:
# Example usage
text_example = "AAPL stock is expected to rise significantly in the coming months."
predicted_sentiment = predict_sentiment(text_example)
if predicted_sentiment == 1:
    print("Bullish sentiment")
else:
    print("Bearish sentiment")