In [1]:
!pip install --upgrade accelerate datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (f

In [1]:
 # Pytorch Deep Learning
import torch
# Pandas+Numpy
import numpy as np
import pandas as pd
# Sklearn metrics
from sklearn.metrics import balanced_accuracy_score,accuracy_score

# Hugging Face Transformer Libraries
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline,Trainer, TrainingArguments
# Hugging Face Datasets
from datasets import Dataset

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
df = pd.read_csv("/content/tsla_sentimentfinal.csv")
df.head()

Unnamed: 0,time_published,title,url,authors,summary,banner_image,source,category_within_source,source_domain,topics,overall_sentiment_score,overall_sentiment_label,ticker_sentiment,ticker_relevance_TSLA,ticker_sentiment_TSLA,num_tickers,detailed_original_label,label
0,20230129T061700,US investigates self-driving claims made by El...,https://www.business-standard.com/article/inte...,['IANS'],US investigates self-driving claims made by El...,https://bsmedia.business-standard.com/_media/b...,Business Standard,GoogleRSS,www.business-standard.com,"[{'topic': 'Financial Markets', 'relevance_sco...",0.052691,Neutral,"[{'ticker': 'TSLA', 'relevance_score': '0.8749...",0.874993,0.145067,1,Neutral,Neutral
1,20230129T114500,Why 2023 Is Tesla's Year To Prove Itself,https://www.fool.com/investing/2023/01/29/tesl...,['Travis Hoium and Jason Hall'],Tesla's fourth-quarter report showed both the ...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.3...",0.07328,Neutral,"[{'ticker': 'TSLA', 'relevance_score': '0.9850...",0.985002,0.175709,1,Somewhat_Bullish,Bullish
2,20230129T124300,Elon Musk Finally Reveals Sales Increase Follo...,https://www.fool.com/investing/2023/01/29/elon...,"['CFA', 'Parkev Tatevosian']",There was much speculation among investors on ...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Manufacturing', 'relevance_score':...",0.143396,Neutral,"[{'ticker': 'TSLA', 'relevance_score': '0.9470...",0.947069,0.36792,1,Bullish,Bullish
3,20230129T140857,Tesla 'spontaneously' catches fire on Californ...,https://www.foxbusiness.com/technology/tesla-s...,[],"A Tesla Model S ""spontaneously"" burst into fla...",https://static.foxbusiness.com/foxbusiness.com...,Fox Business News,,www.foxbusiness.com,"[{'topic': 'Manufacturing', 'relevance_score':...",-0.183315,Somewhat-Bearish,"[{'ticker': 'TSLA', 'relevance_score': '0.5742...",0.574235,0.164179,1,Somewhat_Bullish,Bullish
4,20230129T211841,Would You Sell Your House For Tesla Stock? Thi...,https://www.benzinga.com/news/23/01/30615938/w...,['Chris Katje'],Last year marked Tesla Inc's TSLA worst stock ...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,News,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.8...",0.246707,Somewhat-Bullish,"[{'ticker': 'TSLA', 'relevance_score': '0.9737...",0.973732,0.434486,1,Bullish,Bullish


In [4]:
# Convert time_published to datetime format
df['time_published'] = pd.to_datetime(df['time_published'], format='%Y%m%dT%H%M%S')


# Sort by time
df = df.sort_values('time_published')

# Define sentiment label mapping function
def _get_label_sentiment(x):
    if x <= -0.35:
        return 'Bearish', 'Bearish'
    elif -0.35 < x <= -0.15:
        return 'Somewhat-Bearish', 'Bearish'
    elif -0.15 < x < 0.15:
        return 'Neutral', 'Neutral'
    elif 0.15 <= x < 0.35:
        return 'Somewhat_Bullish', 'Bullish'
    else:  # x >= 0.35
        return 'Bullish', 'Bullish'

# Apply sentiment classification
df[['Original_Sentiment_Label_Text', 'Sentiment_Label_Text']] = df['ticker_sentiment_TSLA'].apply(
    lambda x: pd.Series(_get_label_sentiment(x))
)

# Add exponentially weighted moving average of ticker sentiment
df['sentiment_ewm'] = df['ticker_sentiment_TSLA'].ewm(span=10, adjust=False).mean()

# Map final sentiment labels to integers
# 1 = Negative, 2 = Neutral, 3 = Positive
label_map = {
    'Bearish': 1,
    'Neutral': 2,
    'Bullish': 3
}
df['Label'] = df['Sentiment_Label_Text'].map(label_map)

# Convert num_tickers to float if needed
df['num_tickers'] = df['num_tickers'].astype(float)

# Reorder columns to match target format
final_cols = [
    'title', 'url', 'time_published', 'authors', 'summary', 'banner_image',
    'source', 'category_within_source', 'source_domain', 'topics',
    'overall_sentiment_score', 'overall_sentiment_label', 'ticker_sentiment',
    'ticker_relevance_TSLA', 'ticker_sentiment_TSLA', 'num_tickers',
    'sentiment_ewm', 'Original_Sentiment_Label_Text',
    'Sentiment_Label_Text', 'Label'
]

df_final = df[final_cols]

# Save to new CSV file
df_final.to_csv("tsla_sentiment_final.csv", index=False)

print("Dataset transformed and saved as tsla_sentiment_final.csv")


Dataset transformed and saved as tsla_sentiment_final.csv


In [5]:
df_final.head(5)

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,overall_sentiment_score,overall_sentiment_label,ticker_sentiment,ticker_relevance_TSLA,ticker_sentiment_TSLA,num_tickers,sentiment_ewm,Original_Sentiment_Label_Text,Sentiment_Label_Text,Label
0,US investigates self-driving claims made by El...,https://www.business-standard.com/article/inte...,2023-01-29 06:17:00,['IANS'],US investigates self-driving claims made by El...,https://bsmedia.business-standard.com/_media/b...,Business Standard,GoogleRSS,www.business-standard.com,"[{'topic': 'Financial Markets', 'relevance_sco...",0.052691,Neutral,"[{'ticker': 'TSLA', 'relevance_score': '0.8749...",0.874993,0.145067,1.0,0.145067,Neutral,Neutral,2
1,Why 2023 Is Tesla's Year To Prove Itself,https://www.fool.com/investing/2023/01/29/tesl...,2023-01-29 11:45:00,['Travis Hoium and Jason Hall'],Tesla's fourth-quarter report showed both the ...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.3...",0.07328,Neutral,"[{'ticker': 'TSLA', 'relevance_score': '0.9850...",0.985002,0.175709,1.0,0.150638,Somewhat_Bullish,Bullish,3
2,Elon Musk Finally Reveals Sales Increase Follo...,https://www.fool.com/investing/2023/01/29/elon...,2023-01-29 12:43:00,"['CFA', 'Parkev Tatevosian']",There was much speculation among investors on ...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Manufacturing', 'relevance_score':...",0.143396,Neutral,"[{'ticker': 'TSLA', 'relevance_score': '0.9470...",0.947069,0.36792,1.0,0.190144,Bullish,Bullish,3
3,Tesla 'spontaneously' catches fire on Californ...,https://www.foxbusiness.com/technology/tesla-s...,2023-01-29 14:08:57,[],"A Tesla Model S ""spontaneously"" burst into fla...",https://static.foxbusiness.com/foxbusiness.com...,Fox Business News,,www.foxbusiness.com,"[{'topic': 'Manufacturing', 'relevance_score':...",-0.183315,Somewhat-Bearish,"[{'ticker': 'TSLA', 'relevance_score': '0.5742...",0.574235,0.164179,1.0,0.185423,Somewhat_Bullish,Bullish,3
4,Would You Sell Your House For Tesla Stock? Thi...,https://www.benzinga.com/news/23/01/30615938/w...,2023-01-29 21:18:41,['Chris Katje'],Last year marked Tesla Inc's TSLA worst stock ...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,News,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.8...",0.246707,Somewhat-Bullish,"[{'ticker': 'TSLA', 'relevance_score': '0.9737...",0.973732,0.434486,1.0,0.230707,Bullish,Bullish,3


In [6]:
from sklearn.utils import resample

# Desired number of rows
target_size = 500

# Calculate number of samples per class (roughly balanced)
num_classes = df_final['Sentiment_Label_Text'].nunique()
samples_per_class = target_size // num_classes

# Stratified sampling: randomly sample from each class
df_sampled = (
    df_final.groupby('Sentiment_Label_Text', group_keys=False)
    .apply(lambda x: x.sample(n=min(len(x), samples_per_class), random_state=42))
    .reset_index(drop=True)
)

# If there's leftover quota due to rounding or class size, sample extra rows randomly
leftover = target_size - len(df_sampled)
if leftover > 0:
    remaining = df_final.drop(df_sampled.index)
    df_extra = remaining.sample(n=leftover, random_state=42)
    df_sampled = pd.concat([df_sampled, df_extra]).reset_index(drop=True)

# Shuffle the final DataFrame
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Done! Check class distribution
print(df_sampled['Sentiment_Label_Text'].value_counts())
print(f"Final shape: {df_sampled.shape}")


Sentiment_Label_Text
Neutral    168
Bearish    166
Bullish    166
Name: count, dtype: int64
Final shape: (500, 20)


  .apply(lambda x: x.sample(n=min(len(x), samples_per_class), random_state=42))


In [7]:
df_final = df_sampled

In [8]:
df_final.rename({"summary":"text"},axis=1,inplace=True)

In [9]:
df_final[['Original_Sentiment_Label_Text','Sentiment_Label_Text']].value_counts(normalize=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
Original_Sentiment_Label_Text,Sentiment_Label_Text,Unnamed: 2_level_1
Neutral,Neutral,0.336
Somewhat-Bearish,Bearish,0.22
Somewhat_Bullish,Bullish,0.206
Bullish,Bullish,0.126
Bearish,Bearish,0.112


In [10]:
df_final['time_published'].min(),df['time_published'].max()

(Timestamp('2023-01-31 16:18:45'), Timestamp('2025-03-21 14:09:03'))

### MANUAL APPROACH


In [11]:
# Model name from Model Hub
model_name = 'yiyanghkust/finbert-tone'
# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [12]:
model.config

BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "yiyanghkust/finbert-tone",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Neutral",
    "1": "Positive",
    "2": "Negative"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Negative": 2,
    "Neutral": 0,
    "Positive": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30873
}

In [13]:
id_2_label = model.config.id2label
id_2_label

{0: 'Neutral', 1: 'Positive', 2: 'Negative'}

In [14]:
sentence = "The market outlook is very positive thanks to the new economic policies."

inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

inputs

{'input_ids': tensor([[   3,    6,   52,  954,   17,  190,  483, 1237,    9,    6,   56,  289,
          693,   48,    4]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [15]:
# To make all tensors on same device
inputs = {k: v.to(device) for k, v in inputs.items()}

In [16]:
with torch.no_grad():
    outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-7.4354, 12.1102, -6.3745]]), hidden_states=None, attentions=None)

In [17]:
predictions = np.argmax(outputs.logits.cpu().numpy(), axis=1)

predictions

array([1])

Map this class to its meaning

0: Positive

1: Negative

2: Neutral

In [18]:
 # Map the array elements using a list comprehension
mapped_array = [id_2_label[element] for element in predictions]

print(mapped_array)

['Positive']


### TRANSFORMER PIPELINE APPROACH

In [19]:
# Model name from Model Hub
model_name = 'yiyanghkust/finbert-tone'

sentiment_pipeline = pipeline(task="sentiment-analysis", model=model_name,batch_size=128,device=device)

Device set to use cpu


In [20]:
sentiment_pipeline(sentence)

[{'label': 'Positive', 'score': 1.0}]

In [21]:
sentence = "The market outlook is negative thanks to the new economic policies."

In [22]:
sentiment_pipeline(sentence)

[{'label': 'Negative', 'score': 0.9999996423721313}]

In [23]:
preds = sentiment_pipeline(df_final['text'].tolist())

In [24]:
preds[0:20]

[{'label': 'Neutral', 'score': 0.9999996423721313},
 {'label': 'Negative', 'score': 0.8154864311218262},
 {'label': 'Negative', 'score': 0.9998575448989868},
 {'label': 'Neutral', 'score': 0.9999549388885498},
 {'label': 'Negative', 'score': 0.9998363256454468},
 {'label': 'Negative', 'score': 0.9999653100967407},
 {'label': 'Neutral', 'score': 0.9991375207901001},
 {'label': 'Positive', 'score': 1.0},
 {'label': 'Negative', 'score': 0.9995935559272766},
 {'label': 'Neutral', 'score': 0.999977707862854},
 {'label': 'Neutral', 'score': 0.9999475479125977},
 {'label': 'Neutral', 'score': 0.5540375113487244},
 {'label': 'Neutral', 'score': 0.9955099821090698},
 {'label': 'Negative', 'score': 0.7210379838943481},
 {'label': 'Neutral', 'score': 0.9994350075721741},
 {'label': 'Negative', 'score': 0.9970982074737549},
 {'label': 'Neutral', 'score': 0.9964435696601868},
 {'label': 'Neutral', 'score': 0.8603602647781372},
 {'label': 'Neutral', 'score': 0.9973063468933105},
 {'label': 'Positive

In [25]:
df_final['prediction']=[pred['label'] for pred in preds]

In [26]:
df_final.groupby(['Original_Sentiment_Label_Text','prediction']).size()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Original_Sentiment_Label_Text,prediction,Unnamed: 2_level_1
Bearish,Negative,17
Bearish,Neutral,35
Bearish,Positive,4
Bullish,Negative,3
Bullish,Neutral,27
Bullish,Positive,33
Neutral,Negative,43
Neutral,Neutral,104
Neutral,Positive,21
Somewhat-Bearish,Negative,41


In [27]:
df_final.groupby(['Sentiment_Label_Text','prediction']).size()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Sentiment_Label_Text,prediction,Unnamed: 2_level_1
Bearish,Negative,58
Bearish,Neutral,97
Bearish,Positive,11
Bullish,Negative,19
Bullish,Neutral,86
Bullish,Positive,61
Neutral,Negative,43
Neutral,Neutral,104
Neutral,Positive,21


In [28]:
print(df_final['Sentiment_Label_Text'].unique())
print(df_final['prediction'].unique())


['Neutral' 'Bearish' 'Bullish']
['Neutral' 'Negative' 'Positive']


In [29]:
label_map = {
    'Positive': 'Bullish',
    'Negative': 'Bearish',
    'Neutral': 'Neutral'
}

df_final.loc[:, 'prediction_mapped'] = df_final['prediction'].map(label_map)


In [30]:
print(df_final.columns)


Index(['title', 'url', 'time_published', 'authors', 'text', 'banner_image',
       'source', 'category_within_source', 'source_domain', 'topics',
       'overall_sentiment_score', 'overall_sentiment_label',
       'ticker_sentiment', 'ticker_relevance_TSLA', 'ticker_sentiment_TSLA',
       'num_tickers', 'sentiment_ewm', 'Original_Sentiment_Label_Text',
       'Sentiment_Label_Text', 'Label', 'prediction', 'prediction_mapped'],
      dtype='object')


In [31]:
from sklearn.metrics import classification_report

In [32]:
print(classification_report(
    df_final['Sentiment_Label_Text'],
    df_final['prediction_mapped']
))

# Balanced Accuracy Score
print("Balanced Accuracy:", balanced_accuracy_score(
    df_final['Sentiment_Label_Text'],
    df_final['prediction_mapped']
))

              precision    recall  f1-score   support

     Bearish       0.48      0.35      0.41       166
     Bullish       0.66      0.37      0.47       166
     Neutral       0.36      0.62      0.46       168

    accuracy                           0.45       500
   macro avg       0.50      0.45      0.44       500
weighted avg       0.50      0.45      0.44       500

Balanced Accuracy: 0.44530502964237906


In [33]:
model.config.label2id

{'Positive': 1, 'Negative': 2, 'Neutral': 0}

In [34]:
label_map = {
    'Bullish': 'Positive',
    'Bearish': 'Negative',
    'Neutral': 'Neutral'
}

# Map to model-compatible label names
df_final['model_label'] = df_final['Sentiment_Label_Text'].map(label_map)



In [35]:
df_final['label'] = df_final['model_label'].map(model.config.label2id)

In [36]:
df_final['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,168
2,166
1,166


In [37]:
df_final

Unnamed: 0,title,url,time_published,authors,text,banner_image,source,category_within_source,source_domain,topics,...,ticker_sentiment_TSLA,num_tickers,sentiment_ewm,Original_Sentiment_Label_Text,Sentiment_Label_Text,Label,prediction,prediction_mapped,model_label,label
0,Elon Musk meeting on PM Modi's US agenda as In...,https://www.business-standard.com/india-news/e...,2023-06-20 17:51:23,['Business Standard'],"The meeting, which will potentially include mo...",https://bsmedia.business-standard.com/_media/b...,Business Standard,GoogleRSS,www.business-standard.com,"[{'topic': 'Financial Markets', 'relevance_sco...",...,0.014138,1.0,0.158386,Neutral,Neutral,2,Neutral,Neutral,Neutral,0
1,Tesla To Provide Over-The-Air Software Update ...,https://www.benzinga.com/news/24/02/37280599/t...,2024-02-23 10:12:41,['Anan Ashraf'],EV giant Tesla Inc TSLA will issue a software ...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,News,www.benzinga.com,"[{'topic': 'Manufacturing', 'relevance_score':...",...,-0.199118,1.0,0.029789,Somewhat-Bearish,Bearish,1,Negative,Bearish,Negative,2
2,Tesla Slides On Q3 Deliveries: What The Number...,https://www.benzinga.com/news/23/10/35031133/t...,2023-10-02 13:13:52,['Adam Eckert'],Tesla Inc TSLA shares are trading lower on Mon...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,News,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.6...",...,0.085196,1.0,0.139840,Neutral,Neutral,2,Negative,Bearish,Neutral,0
3,World War 2-Era 550 Pound Bomb Near Tesla's Ge...,https://www.benzinga.com/news/24/06/39544540/w...,2024-06-28 10:48:35,['Anan Ashraf'],A 550-pound bomb from World War 2 has reported...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,News,www.benzinga.com,"[{'topic': 'Manufacturing', 'relevance_score':...",...,-0.299802,1.0,-0.028498,Somewhat-Bearish,Bearish,1,Neutral,Neutral,Negative,2
4,Blood all over! Tesla robot rips engineer's ba...,https://www.financialexpress.com/life/technolo...,2023-12-28 07:27:53,['Zerneela Mohammed Wakil'],A Tesla engineer faced a perilous encounter wi...,https://www.financialexpress.com/wp-content/up...,The Financial Express,,www.financialexpress.com,"[{'topic': 'Manufacturing', 'relevance_score':...",...,-0.354768,1.0,-0.067574,Bearish,Bearish,1,Negative,Bearish,Negative,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Tesla CEO Elon Musk Denies Human Rights Abuses...,https://www.benzinga.com/news/24/04/38452299/t...,2024-04-26 06:49:34,['Anan Ashraf'],Tesla Inc CEO Elon Musk on Thursday denied all...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,News,www.benzinga.com,"[{'topic': 'Manufacturing', 'relevance_score':...",...,-0.213837,1.0,-0.001885,Somewhat-Bearish,Bearish,1,Negative,Bearish,Negative,2
496,Buy Points Get A Historic Change. How Tesla's ...,https://www.investors.com/how-to-invest/invest...,2023-06-07 13:50:00,"[""Investor's Business Daily"", 'VIDYA RAMAKRISH...",Buy Points Get A Historic Change. How Tesla's ...,https://www.investors.com/wp-content/uploads/2...,Investors Business Daily,,www.investors.com,"[{'topic': 'Financial Markets', 'relevance_sco...",...,0.369176,1.0,0.224211,Bullish,Bullish,3,Positive,Bullish,Positive,1
497,Tesla Stock Faces Selling Pressure Ahead Of Q3...,https://www.benzinga.com/markets/esg/24/10/414...,2024-10-23 13:22:54,['Surbhi Jain'],With Tesla Inc's TSLA third-quarter earnings s...,https://editorial-assets.benzinga.com/wp-conte...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.7...",...,0.148080,1.0,0.189542,Neutral,Neutral,2,Negative,Bearish,Neutral,0
498,"Forget Tesla, Buy This Magnificent Auto Stock ...",https://www.fool.com/investing/2024/03/05/forg...,2024-03-05 15:03:00,['Neil Patel'],High-end luxury has a place in the automotive ...,https://g.foolcdn.com/editorial/images/767668/...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.8...",...,0.131223,1.0,-0.014933,Neutral,Neutral,2,Neutral,Neutral,Neutral,0


In [38]:
train_end_point = int(df_final.shape[0]*0.6)
val_end_point = int(df_final.shape[0]*0.8)
df_final_train = df_final.iloc[:train_end_point,:]
df_final_val = df_final.iloc[train_end_point:val_end_point,:]
df_final_test = df_final.iloc[val_end_point:,:]
print(df_final_train.shape, df_final_test.shape, df_final_val.shape)

(300, 24) (100, 24) (100, 24)


In [44]:
print("y_true classes:", df_final_test['Sentiment_Label_Text'].unique())
print("y_pred classes:", df_final_test['prediction_mapped'].unique())

y_true classes: ['Bullish' 'Neutral' 'Bearish']
y_pred classes: ['Positive' 'Negative' 'Neutral']


In [48]:
# Step 1: Run predictions
preds = sentiment_pipeline(df_final_test['text'].tolist())

# Step 2: Map model predictions to your dataset’s label scheme
label_map = {
    "Positive": "Bullish",
    "Negative": "Bearish",
    "Neutral": "Neutral"
}
df_final_test.loc[:, 'prediction'] = [pred['label'] for pred in preds]
df_final_test.loc[:, 'prediction_mapped'] = df_final_test['prediction'].map(label_map)

# Step 3: Check for any unmapped predictions
unmapped = df_final_test[df_final_test['prediction_mapped'].isna()]
if not unmapped.empty:
    print("Unmapped predictions found:\n", unmapped['prediction'].value_counts())

# Step 4: Compute accuracy
from sklearn.metrics import balanced_accuracy_score
acc = balanced_accuracy_score(
    df_final_test['Sentiment_Label_Text'],
    df_final_test['prediction_mapped']
)
print("Balanced Accuracy:", acc)

Balanced Accuracy: 0.4814337664236244


In [51]:
accuracy_score(df_final_test['Sentiment_Label_Text'],df_final_test['prediction_mapped'])

0.48

In [52]:
print(df_final['Sentiment_Label_Text'].unique())
print(df_final['label'].value_counts())


['Neutral' 'Bearish' 'Bullish']
label
0    168
2    166
1    166
Name: count, dtype: int64


In [54]:
label_map = {'Bearish': 0, 'Bullish': 1, 'Neutral': 2}
df_final['label'] = df_final['Sentiment_Label_Text'].map(label_map)

# Now split into train, val, test as df_train, df_val, df_test

In [56]:
# Converting pandas DataFrames into Hugging Face Dataset objects:
dataset_train = Dataset.from_pandas(df_final_train)
dataset_val = Dataset.from_pandas(df_final_val)
dataset_test = Dataset.from_pandas(df_final_test)

# Tokenizing the datasets:
dataset_train = dataset_train.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=128), batched=True)
dataset_val = dataset_val.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=128), batched=True)
dataset_test = dataset_test.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length' , max_length=128), batched=True)

# Setting the dataset format: (needed for Pytorch?)
dataset_train.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
dataset_val.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
dataset_test.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])


# Shuffle the training dataset
dataset_train_shuffled = dataset_train.shuffle(seed=42)  # Using a seed for reproducibility

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [57]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'balanced_accuracy' : balanced_accuracy_score(predictions, labels),'accuracy':accuracy_score(predictions,labels)}

args = TrainingArguments(
    output_dir='temp/',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy="steps",  # Log every X steps
    logging_steps=50,  # Log every 50 steps
    learning_rate=2e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model='balanced_accuracy',
)

trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=args,                  # training arguments, defined above
        train_dataset=dataset_train_shuffled,         # training dataset
        eval_dataset=dataset_val,            # evaluation dataset
        compute_metrics=compute_metrics
)




In [58]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mghewdeaarsh[0m ([33mghewdeaarsh-stevens-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Balanced Accuracy,Accuracy
1,No log,3.978826,0.476296,0.44
2,No log,3.647978,0.43987,0.43
3,No log,3.552585,0.43987,0.43


TrainOutput(global_step=30, training_loss=3.7351615905761717, metrics={'train_runtime': 1773.0578, 'train_samples_per_second': 0.508, 'train_steps_per_second': 0.017, 'total_flos': 59200518988800.0, 'train_loss': 3.7351615905761717, 'epoch': 3.0})

In [59]:
predictions = trainer.predict(dataset_test)
predictions

PredictionOutput(predictions=array([[-3.1494660e+00,  5.6402507e+00, -3.8260262e+00],
       [-4.5389485e+00,  9.5338554e+00, -6.9017477e+00],
       [-3.8018987e+00, -2.3777041e+00,  9.8093452e+00],
       [ 2.3845239e+00, -5.3509814e-01, -3.1499381e+00],
       [ 7.6283865e+00, -5.7152948e+00, -3.0533316e+00],
       [ 1.2466038e+00, -1.6914306e+00,  6.3963354e-01],
       [-4.7914681e+00,  9.6140604e+00, -5.5916681e+00],
       [ 7.5624771e+00, -5.0763760e+00, -3.3669524e+00],
       [ 1.9521462e+00, -5.2061648e+00,  3.8958936e+00],
       [-5.5853930e+00,  1.0329281e+01, -6.7049646e+00],
       [-1.4201190e+00, -2.6235056e+00,  5.6848211e+00],
       [-2.2372942e-01, -2.5734792e+00,  1.5114694e+00],
       [ 1.2114265e+00,  2.8441443e+00, -5.1778784e+00],
       [-2.1624269e+00, -4.9972254e-01,  3.1472592e+00],
       [-4.0207119e+00,  4.3232636e+00, -2.6918738e+00],
       [ 5.3043413e+00, -5.3913336e+00, -2.8390343e+00],
       [ 5.1365185e+00, -1.8682963e+00, -4.6091743e+00],
  

In [60]:
model_path = "path/to/save/model"


# Save the model
trainer.model.save_pretrained(model_path)

# Save the tokenizer associated with the model
# Save the tokenizer
tokenizer.save_pretrained(model_path)

('path/to/save/model/tokenizer_config.json',
 'path/to/save/model/special_tokens_map.json',
 'path/to/save/model/vocab.txt',
 'path/to/save/model/added_tokens.json',
 'path/to/save/model/tokenizer.json')

In [61]:
trained_pipeline = pipeline("text-classification", model=model_path, tokenizer=model_path,device=device)

Device set to use cpu


In [63]:
preds = trained_pipeline(df_final_test['text'].tolist())
df_final_test.loc[:, 'prediction_mapped'] = [pred['label'] for pred in preds]

In [69]:
# Map model predictions to dataset labels
label_map = {
    "Positive": "Bullish",
    "Negative": "Bearish",
    "Neutral": "Neutral"
}

# Map predictions safely
df_final_test.loc[:, 'prediction_mapped'] = df_final_test['prediction'].map(label_map)

# Calculate score correctly (no labels param needed here)
score = balanced_accuracy_score(
    df_final_test['Sentiment_Label_Text'],
    df_final_test['prediction_mapped']
)
print(f"Balanced Accuracy Score: {score}")


Balanced Accuracy Score: 0.4814337664236244


In [73]:
score = accuracy_score(df_final_test['Sentiment_Label_Text'], df_final_test['prediction_mapped'])
print(f"Accuracy Score: {score}")

Accuracy Score: 0.48
