In [1]:
import pandas as pd

# Location of the labelled dataset (CSV) and the frame loaded from it.
path = "stock_dataset.csv"
df = pd.read_csv(path)

# Boolean mask: True for every row that repeats an earlier row.
duplicates = df.duplicated()

# Report the dataset dimensions.
row_count, column_count = df.shape
print(f"Total rows: {row_count}, Total columns: {column_count}")

# Report how many rows were flagged as repeats.
duplicate_total = duplicates.sum()
print(f"Number of duplicate rows: {duplicate_total}")

# Class balance of the 'label' column.
# NOTE: value_counts() leaves out missing labels by default, so the counts
# can add up to fewer rows than the total reported above.
label_counts = df['label'].value_counts()
print(label_counts)

Total rows: 1312, Total columns: 2
Number of duplicate rows: 2
label
0.0    789
1.0    522
Name: count, dtype: int64


In [3]:

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer
from transformers import TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation, stratified on the label so
# both splits keep the same class ratio. Rows without a label are excluded.
labeled_df = df.dropna(subset=['label'])
train_df, val_df = train_test_split(
    labeled_df,
    test_size=0.2,
    random_state=42,
    stratify=labeled_df['label'],
)

# Tokenizer belonging to the same checkpoint the classifier is loaded from.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

class TextDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with integer labels.

    The whole ``'text'`` column is tokenized once at construction time and the
    ``'label'`` column is cast to ``int``; indexing only converts the stored
    values into tensors.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        """Tokenize every text in ``dataframe`` and store its labels.

        Args:
            dataframe: Frame with a ``'text'`` column and a ``'label'`` column
                whose values can be cast to ``int``.
            tokenizer: Callable invoked as
                ``tokenizer(texts, truncation=True, padding=True, max_length=max_len)``;
                its result must expose ``.items()`` yielding per-field sequences
                indexable by sample position.
            max_len: Value forwarded to the tokenizer as ``max_length``.
        """
        texts = dataframe['text'].tolist()
        self.encodings = tokenizer(
            texts, truncation=True, padding=True, max_length=max_len
        )
        self.labels = dataframe['label'].astype(int).tolist()

    def __len__(self):
        """Return the number of samples (one per stored label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the target under ``'labels'``."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Tokenize both splits up front so training only has to index into them.
train_dataset = TextDataset(dataframe=train_df, tokenizer=tokenizer)
val_dataset = TextDataset(dataframe=val_df, tokenizer=tokenizer)

# Pretrained DistilBERT with a two-class sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

training_args = TrainingArguments(
    output_dir='./results',
    # Schedule: a single pass over the training split, batches of 8.
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate on the validation split once per epoch; write no checkpoints.
    eval_strategy="epoch",
    save_strategy="no",
    # Emit a log record every 10 optimizer steps, without progress bars.
    logging_steps=10,
    disable_tqdm=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune; the call's return value is the cell's displayed result.
trainer.train()

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.5755, 'grad_norm': 8.977652549743652, 'learning_rate': 4.656488549618321e-05, 'epoch': 0.07633587786259542}
{'loss': 0.2623, 'grad_norm': 0.9497588276863098, 'learning_rate': 4.2748091603053435e-05, 'epoch': 0.15267175572519084}
{'loss': 0.1016, 'grad_norm': 0.4006009101867676, 'learning_rate': 3.8931297709923666e-05, 'epoch': 0.22900763358778625}
{'loss': 0.0921, 'grad_norm': 17.418718338012695, 'learning_rate': 3.511450381679389e-05, 'epoch': 0.3053435114503817}
{'loss': 0.0161, 'grad_norm': 0.12344273924827576, 'learning_rate': 3.129770992366413e-05, 'epoch': 0.3816793893129771}
{'loss': 0.0065, 'grad_norm': 0.10540562123060226, 'learning_rate': 2.7480916030534355e-05, 'epoch': 0.4580152671755725}
{'loss': 0.0254, 'grad_norm': 0.09454644471406937, 'learning_rate': 2.3664122137404583e-05, 'epoch': 0.5343511450381679}
{'loss': 0.0042, 'grad_norm': 0.24802830815315247, 'learning_rate': 1.984732824427481e-05, 'epoch': 0.6106870229007634}
{'loss': 0.0083, 'grad_norm': 19.07642

TrainOutput(global_step=131, training_loss=0.0846335306134473, metrics={'train_runtime': 68.2276, 'train_samples_per_second': 15.36, 'train_steps_per_second': 1.92, 'train_loss': 0.0846335306134473, 'epoch': 1.0})

In [8]:
# Classify one raw sentence with the fine-tuned model.
raw_text = "Tell me about the profit margin of Apple in the last quarter."
inputs = tokenizer(raw_text, return_tensors='pt', truncation=True, padding=True)
# The Trainer may have moved the model to an accelerator; the inputs must live
# on the same device or the forward pass raises a device-mismatch error.
inputs = inputs.to(model.device)

# Inference only: make sure dropout is disabled and skip building the
# autograd graph, so the prediction is deterministic and cheaper to compute.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Index of the highest logit is the predicted label id.
predicted_class = outputs.logits.argmax(dim=1).item()
print(f"Predicted class: {predicted_class}")

Predicted class: 1
