In [17]:
import os
import pandas as pd
from datasets import load_dataset
from transformers import GPT2TokenizerFast
import torch
import hopsworks


## Functions

In [31]:
def create_sentiment_csv(file_path, file_name='sentiment.csv'):
    """Convert a ``sentence@sentiment`` text file into a two-column CSV.

    Each non-blank line of ``file_path`` (decoded as latin1) is expected to end
    with ``@negative``, ``@positive`` or ``@neutral``. The text before the last
    ``@`` is written to the ``text`` column unchanged, and the sentiment is
    written to the ``label`` column as 0 (negative), 1 (positive) or 2 (neutral).

    Args:
        file_path: Path of the input text file.
        file_name: Path of the CSV file to write (comma separated, no index).

    Returns:
        None. The result is written to ``file_name``.

    Raises:
        ValueError: If a non-blank line contains no ``@`` separator.
        KeyError: If the sentiment after the last ``@`` is not one of the
            three known labels.
    """
    sentiment_map = {"negative": 0, "positive": 1, "neutral": 2}
    rows = []

    with open(file_path, 'r', encoding="latin1") as file:
        for line_number, line in enumerate(file, start=1):
            # Blank lines (e.g. a trailing empty line) carry no record.
            if not line.strip():
                continue

            # Split on the LAST "@" only: the sentence itself may contain "@"
            # (e-mail addresses, handles), while the label never does.
            sentence, separator, sentiment = line.rpartition("@")
            if not separator:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'sentence@sentiment', got {line!r}"
                )

            # strip() removes the trailing newline / whitespace around the label.
            rows.append([sentence, sentiment_map[sentiment.strip()]])

    df = pd.DataFrame(rows, columns=["text", "label"])
    df.to_csv(file_name, index=False, sep=',')

def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame.

    This is a thin wrapper around ``pd.read_csv`` called with default options.
    """
    return pd.read_csv(file_path)

def get_embedding(dataset, embedding_object):
    """Encode every entry of ``dataset["text"]`` and store the results in place.

    Args:
        dataset: Mapping-like container (e.g. a DataFrame) with a ``"text"``
            entry to iterate over. It is mutated: an ``"embeddings"`` entry is
            added or overwritten.
        embedding_object: Any object exposing ``encode(text)``. Whatever that
            call returns is stored as-is, one result per text, in order.

    Returns:
        None. The result lives in ``dataset["embeddings"]``.
    """
    dataset["embeddings"] = [
        embedding_object.encode(text) for text in dataset["text"]
    ]


In [10]:
# Read the three source CSVs from the relative "base-data" directory into
# DataFrames via load_data (a pd.read_csv wrapper). Paths are relative, so the
# notebook must be run from the directory that contains "base-data".
# NOTE(review): "all-data-75-above.csv" is presumably the output of
# create_sentiment_csv on the raw FinancialPhraseBank file — confirm, since no
# cell in view generates it.
financial_phrase_bank_df = load_data(os.path.join("base-data", "FinancialPhraseBank", "all-data-75-above.csv"))
zeroshot_train_df = load_data(os.path.join("base-data", "twitter-financial-news-sentiment", "sent_train.csv"))
zeroshot_valid_df = load_data(os.path.join("base-data", "twitter-financial-news-sentiment", "sent_valid.csv"))

In [18]:
# Load the tokenizer published under 'Xenova/text-embedding-ada-002'.
# The captured output of this cell shows a class-mismatch warning: the
# checkpoint declares 'GPT4Tokenizer' while it is loaded through
# GPT2TokenizerFast.
# NOTE(review): confirm against the checkpoint's model card that loading it
# this way yields the intended tokenization.
tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/text-embedding-ada-002')

tokenizer_config.json: 100%|██████████| 233/233 [00:00<00:00, 77.3kB/s]
vocab.json: 100%|██████████| 2.01M/2.01M [00:00<00:00, 5.82MB/s]
merges.txt: 100%|██████████| 917k/917k [00:00<00:00, 7.98MB/s]
tokenizer.json: 100%|██████████| 4.23M/4.23M [00:00<00:00, 26.7MB/s]
special_tokens_map.json: 100%|██████████| 98.0/98.0 [00:00<00:00, 32.6kB/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT4Tokenizer'. 
The class this function is called from is 'GPT2TokenizerFast'.


In [32]:
# Adds an "embeddings" column to financial_phrase_bank_df in place
# (get_embedding returns None, so this cell displays nothing).
# NOTE(review): the object passed here is a tokenizer, so the stored values are
# presumably token ids rather than embedding vectors — confirm this is intended.
get_embedding(financial_phrase_bank_df, tokenizer)

In [None]:
# Log in to Hopsworks and get a handle to the project's feature store.
# NOTE(review): login() is called without arguments, so credentials presumably
# come from the environment or an interactive prompt — confirm before running
# this notebook unattended. This cell has no execution count in the saved
# notebook, i.e. it was not run.
hopsworks_project = hopsworks.login() 
fs = hopsworks_project.get_feature_store()