In [1]:
import os
import pandas as pd
from datasets import load_dataset
from transformers import GPT2TokenizerFast
import torch
import hopsworks


  from .autonotebook import tqdm as notebook_tqdm


## Functions

In [17]:
def create_sentiment_csv(file_path, file_name='sentiment.csv'):
    """Convert a ``sentence@sentiment`` text file into a two-column CSV.

    Each non-blank line of ``file_path`` is split on its LAST "@" so that
    sentences which themselves contain "@" are kept intact. The sentiment
    word is mapped to an integer label (negative=0, positive=1, neutral=2)
    and the result is written to ``file_name`` with columns ``text`` and
    ``label``.

    Args:
        file_path: Path of the input file, read with latin1 encoding.
        file_name: Path of the CSV file to write (no index column).

    Raises:
        ValueError: If a non-blank line contains no "@" separator.
        KeyError: If the sentiment is not negative/positive/neutral.
    """
    sentiment_map = {"negative": 0, "positive": 1, "neutral": 2}
    data = []

    with open(file_path, 'r', encoding="latin1") as file:
        for line in file:
            if not line.strip():
                # Blank lines (e.g. a trailing newline at EOF) carry no sample.
                continue
            # Split from the right: only the final "@" separates the label,
            # any earlier "@" belongs to the sentence text.
            sentence, sentiment = line.rsplit("@", 1)
            sentiment = sentiment.strip()  # remove the newline / trailing whitespace
            data.append([sentence, sentiment_map[sentiment]])

    df = pd.DataFrame(data, columns=["text", "label"])
    df.to_csv(file_name, index=False, sep=',')

def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)

def get_embedding(dataset, embedding_object):
    """Replace the ``text`` column of ``dataset`` with an ``embeddings`` column.

    Every value of ``dataset["text"]`` is passed to
    ``embedding_object.encode`` and the results are stored, row by row, in a
    new ``embeddings`` column. The input frame is not modified; a copy
    without the ``text`` column is returned.

    Args:
        dataset: DataFrame with a ``text`` column.
        embedding_object: Any object exposing ``encode(text)``.

    Returns:
        A new DataFrame holding the remaining columns plus ``embeddings``.
    """
    encoded_rows = [embedding_object.encode(text) for text in dataset["text"]]

    result = dataset.copy()
    result["embeddings"] = encoded_rows
    return result.drop(columns=["text"])

def get_decoding(dataset, embedding_object):
    """Replace the ``embeddings`` column of ``dataset`` with a ``text`` column.

    Every value of ``dataset["embeddings"]`` is passed to
    ``embedding_object.decode`` and the results are stored, row by row, in a
    new ``text`` column. The input frame is not modified; a copy without the
    ``embeddings`` column is returned.

    Args:
        dataset: DataFrame with an ``embeddings`` column.
        embedding_object: Any object exposing ``decode(value)``.

    Returns:
        A new DataFrame holding the remaining columns plus ``text``.
    """
    decoded_rows = [embedding_object.decode(encoded) for encoded in dataset["embeddings"]]

    result = dataset.copy()
    result["text"] = decoded_rows
    return result.drop(columns=["embeddings"])

In [3]:
# Load the three source CSVs from the local "base-data" folder: the
# FinancialPhraseBank file and the train/validation splits of the
# twitter-financial-news-sentiment dataset.
financial_phrase_bank_df = load_data(os.path.join("base-data", "FinancialPhraseBank", "all-data-75-above.csv"))
zeroshot_train_df = load_data(os.path.join("base-data", "twitter-financial-news-sentiment", "sent_train.csv"))
zeroshot_valid_df = load_data(os.path.join("base-data", "twitter-financial-news-sentiment", "sent_valid.csv"))

In [4]:
# Load the tokenizer published under the 'Xenova/text-embedding-ada-002' checkpoint.
# NOTE(review): the checkpoint declares its tokenizer class as 'GPT4Tokenizer', so
# loading it through GPT2TokenizerFast emits a class-mismatch warning — confirm the
# resulting tokenization is the intended one.
tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/text-embedding-ada-002')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT4Tokenizer'. 
The class this function is called from is 'GPT2TokenizerFast'.


In [5]:
# Encode every FinancialPhraseBank sentence with the tokenizer. The "embeddings"
# column holds whatever tokenizer.encode returns (lists of integer token ids in
# the displayed output), and the original "text" column is dropped.
embedded_financial_phrase_bank_df = get_embedding(financial_phrase_bank_df, tokenizer)

In [6]:
# Display the encoded frame (label + embeddings) as the cell output.
embedded_financial_phrase_bank_df

Unnamed: 0,label,embeddings
0,2,"[11439, 311, 27622, 1174, 279, 2883, 706, 912,..."
1,1,"[2409, 279, 502, 5788, 6136, 279, 2883, 1053, ..."
2,1,"[2520, 279, 1566, 8502, 315, 220, 508, 605, 11..."
3,1,"[644, 279, 4948, 8502, 315, 220, 508, 605, 117..."
4,1,"[59247, 11626, 16392, 311, 38188, 220, 1032, 1..."
...,...,...
3448,0,"[59247, 1121, 369, 279, 220, 717, 12, 10460, 4..."
3449,0,"[1837, 7416, 11898, 40, 65332, 17961, 482, 452..."
3450,0,"[43, 87228, 8152, 14581, 1198, 12037, 7729, 96..."
3451,0,"[59247, 11626, 11299, 311, 38188, 220, 1758, 1..."


In [7]:
# Log in to Hopsworks and get a handle to the project's feature store.
hopsworks_project = hopsworks.login() 
fs = hopsworks_project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/197784
Connected. Call `.close()` to terminate connection gracefully.


In [10]:
# Get (or create) version 1 of the online-enabled "test" feature group and
# insert the encoded FinancialPhraseBank frame into it.
# NOTE(review): the primary key includes the list-valued "embeddings" column —
# confirm that this is an intended key for the feature group.
embedding_fg = fs.get_or_create_feature_group(name="test", version=1, description="test", primary_key=["label", "embeddings"], online_enabled=True)
embedding_fg.insert(embedded_financial_phrase_bank_df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/197784/fs/197703/fg/322918


Uploading Dataframe: 100.00% |██████████| Rows 3453/3453 | Elapsed Time: 00:06 | Remaining Time: 00:00


Launching job: test_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/197784/jobs/named/test_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x1549023caf0>, None)

In [12]:
# Open a separate hsfs connection, look up the feature store by its name and
# fetch version 1 of the "test" feature group.
# NOTE: this rebinds `fs`, which an earlier cell already assigned from
# hopsworks_project.get_feature_store().
connection = hopsworks.hsfs.connection()
fs = connection.get_feature_store(name="id2223labs_featurestore")
fg = fs.get_feature_group('test', version=1)

Connected. Call `.close()` to terminate connection gracefully.


In [16]:
# Read a 5-row sample of the "embeddings" and "label" features back from the
# feature group (show(5)); used below to check the round trip.
temp = fg.select(["embeddings", "label"]).show(5)

Finished: Reading data from Hopsworks, using ArrowFlight (1.43s) 


From the original data:

The company's order book stood at 1.5 bln euro \$ 2.2 bln on September 30, 2007, up by 24.2 pct on the year, with international orders amounting to 365 mln euro \$ 534.3 mln.

The same sentence, decoded from the embedded (tokenized) data read back from the feature store:

In [25]:
# Decode the rows read back from the feature store to text with the same
# tokenizer, then print the entry labelled 0 to compare with the original sentence.
decoded = get_decoding(temp, tokenizer)
print(decoded["text"][0])

The company's order book stood at 1.5 bln euro $ 2.2 bln on September 30, 2007, up by 24.2 pct on the year, with international orders amounting to 365 mln euro $ 534.3 mln.
