# Installing the required Libraries

In [None]:
!pip install -U accelerate
!pip install -U transformers

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

#Import the required libraries

In [5]:
import pandas as pd
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

# For Removing links, dollar signs before characters, and special characters in the text

In [6]:
def preprocess_text(text):
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\$(?=[a-zA-Z])', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s.]', '', text)
    return text

# Load the training Data

In [None]:
import pandas as pd
data_to_train = pd.read_csv('/content/sample_data/training_data.csv') #Change the path according to the dataset you want to train
data_to_train.head()

Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0


#Preprocess the Data

In [None]:
data_to_train['text'] = data_to_train['text'].str.lower()
data_to_train['text'] =data_to_train['text'].apply(preprocess_text)

# Creating custom dataset for creating enecodings to feed the DistillBERTSequenceClassifier model

In [7]:
class CustomDataset(Dataset):
  def __init__(self,texts,labels,tokenizer, max_len = 512):
    self.texts=texts
    self.labels=labels
    self.tokenizer=tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self,idx):
    text = str(self.texts[idx])
    label = torch.tensor(self.labels[idx])
    encoding = self.tokenizer(text,truncation=True, padding = "max_length",max_length=self.max_len)
    return {
        'input_ids':encoding['input_ids'],
        'attention_mask':encoding['attention_mask'],
        'labels':label

    }

In [8]:
checkpoint = 'distilbert-base-uncased'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint,num_labels=3).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Creating label2id and id2label according to our usecase

In [None]:
X = data_to_train['text'].tolist() #Convert to list for feeding the model

label2id  = {
    "Bearish":0,
    "Bullish":1,
    "Neutral":2
}

id2label ={
    0: "Bearish",
    1: "Bullish",
    2: "Neutral"
}

y = data_to_train['label'].tolist()

dataset = CustomDataset(X,y,tokenizer)

# Performing Train Test Split

In [None]:
training_dataset, testing_dataset = train_test_split(dataset,test_size=0.2,random_state = 42)

#Creating function for Computing Metrics for the Model

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metric(example):
  labels = example.label_ids
  preds = example.predictions.argmax(-1)
  f1 = f1_score(labels,preds,average='weighted')
  acc = accuracy_score(labels,preds)

  return {"accuracy":acc,"f1":f1}

#Here We define the batch_size, the model name for this model and arguments required for the training

In [None]:
batch_size = 16
model_name = "distilbert_finetuned_stock_sentiment"

args = TrainingArguments(
   output_dir = 'output',
   per_device_train_batch_size=batch_size,
   per_device_eval_batch_size=batch_size,
   learning_rate=2e-5,
   num_train_epochs=10,
   evaluation_strategy='epoch'
)

#Here we call use theh trainer to train the model

In [None]:
trainer = Trainer(model=model,
                  args=args,
                  train_dataset=training_dataset,
                  eval_dataset=testing_dataset,
                  compute_metrics=compute_metric,
                  tokenizer=tokenizer)

In [None]:
trainer.train()

In [None]:
trainer.save_model(model_name)

# Finding the predictions for the validation data

In [None]:
data_to_validate = pd.read_csv('/content/sample_data/MSFT.csv') #Change the path according to the dataset you want to validate


In [None]:
data_to_validate['text'] = data_to_validate['text'].str.lower()
data_to_validate['text'] =data_to_validate['text'].apply(preprocess_text)

In [None]:
data_to_validate.head()

Unnamed: 0,Date,text,code
0,5/5/2014,earnings alert microsoft q3 eps 0.61 vs. 0.51 ...,MSFT
1,5/2/2014,breaking microsoft earnings beat eps 0.61 vs 0...,MSFT
2,5/1/2014,breaking microsoft earnings beat eps 0.61 vs 0...,MSFT
3,4/30/2014,microsoft reports quarterly earnings 0.61share...,MSFT
4,4/29/2014,rt cnbcnow earnings alert microsoft q3 eps 0.6...,MSFT


#Predictions

In [None]:
def predict_sentiment(df, tokenizer, model):
    # Preprocess the text data
    texts = df['text'].tolist()
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    # Move inputs to the appropriate device
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    # Predict sentiment labels
    with torch.no_grad():
        outputs = model(**inputs)

    # Map predicted label IDs to sentiment labels
    predicted_labels = torch.argmax(outputs.logits, dim=1).cpu().numpy()
    predicted_sentiments = [id2label[label_id] for label_id in predicted_labels]

    # Add predicted sentiment labels to the DataFrame
    df['Sentiment'] = predicted_sentiments

    return df

# Assuming you have a DataFrame named 'data_to_predict' containing text data

# Run the pipeline


In [None]:
predictions_df = predict_sentiment(data_to_validate, tokenizer, model)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


# Save your predictions in CSV format

In [None]:

predicted_df_aapl.to_csv('predictions.csv', index=False)
