In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import pipeline

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the data
# paths used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the input parquet file on the mounted Drive.
file_path = '/content/drive/My Drive/data/data_part_1.parquet'

# Read the parquet file into a DataFrame.
data = pd.read_parquet(path=file_path)

# Show the first five rows as a quick sanity check of the load.
data.head(n=5)

In [None]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
data.shape

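## Load FinBERT Model and Tokenizer
We'll use Hugging Face's `transformers` package to load a pre-trained FinBERT model and tokenizer. **Note:** only a DistilBERT loader is defined in the cells that follow; no FinBERT loader appears in this section.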

## Load DistilBERT Model and Tokenizer

In [None]:
def setup_model_distil(model_name: str = 'distilbert-base-uncased-finetuned-sst-2-english'):
    """Build a Hugging Face sentiment-analysis pipeline backed by DistilBERT.

    The tokenizer and the classification model are loaded from the same
    checkpoint, so the vocabulary and tokenizer settings always match the
    weights that are being used for inference.

    Args:
        model_name: Hub id or local path of a DistilBERT sequence-classification
            checkpoint. The default is the checkpoint this notebook already
            used for the model weights.

    Returns:
        A ``transformers`` pipeline for the "sentiment-analysis" task.
    """
    # Load both pieces from one checkpoint instead of pairing the fine-tuned
    # model with the tokenizer of a different (base) checkpoint.
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    model = DistilBertForSequenceClassification.from_pretrained(model_name)

    # Wrap model and tokenizer in the sentiment-analysis pipeline.
    return pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

## Choose model

In [None]:
# Build the sentiment pipeline once and keep it in `nlp` for the scoring cell below.
nlp = setup_model_distil()

In [None]:

# Score only the first 10 rows; every other row ends up with NaN in the
# sentiment columns after the merge below.
sample = data.head(10)
sentiment = nlp(sample['MixedTEXT'].to_list(), truncation=True, max_length=512)

# Give the results the index labels of the rows they were computed from, so the
# merge lines up row-for-row even when `data` does not carry a 0..n-1 index.
df_sentiment = pd.DataFrame(sentiment).rename(
    columns={'label': 'distil_sentiment', 'score': 'distil_sentiment confidence'}
)
df_sentiment.index = sample.index

# Drop sentiment columns left over from a previous run of this cell before
# attaching the fresh ones; otherwise re-running would duplicate the columns.
data = data.drop(columns=df_sentiment.columns, errors='ignore').join(df_sentiment)


In [None]:
# Inspect the first 10 rows (the rows passed to the sentiment pipeline above)
# to check the added sentiment columns.
data.head(10)

In [None]:
# Persist the full DataFrame, including the sentiment columns added above, to Drive.
# NOTE(review): the file name says "finbert", but the sentiment columns in this
# notebook come from the DistilBERT pipeline — confirm the intended output name.
data.to_parquet('/content/drive/My Drive/data/finbert_btc.parquet')