In [1]:
import os
import pandas as pd
from datasets import load_dataset
from transformers import GPT2TokenizerFast
import torch
import hopsworks
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer published under the 'Xenova/text-embedding-ada-002' checkpoint.
# Loading it through GPT2TokenizerFast emits a class-mismatch warning because the
# checkpoint declares its tokenizer class as 'GPT4Tokenizer'.
# NOTE(review): confirm the resulting tokenization is the intended one.
tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/text-embedding-ada-002')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT4Tokenizer'. 
The class this function is called from is 'GPT2TokenizerFast'.


## Functions

In [3]:
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame.

    The file is parsed with ``pd.read_csv`` using its default options.
    """
    return pd.read_csv(file_path)

def get_embedding(dataset, embedding_object):
    """Replace the ``text`` column of ``dataset`` with its encoded form.

    Every value of ``dataset["text"]`` is passed to ``embedding_object.encode``
    and the results are stored, in row order, in a new ``embeddings`` column.
    The input frame is left untouched; the work happens on a copy.

    Args:
        dataset: DataFrame containing a ``text`` column.
        embedding_object: any object exposing ``encode(text)``.

    Returns:
        A copy of ``dataset`` with an added ``embeddings`` column and the
        ``text`` column removed.
    """
    encoded_rows = [embedding_object.encode(entry) for entry in dataset["text"]]

    result = dataset.copy()
    result["embeddings"] = encoded_rows
    return result.drop(columns=["text"])

In [4]:
# Load the three source CSVs from the local "base-data" directory, in the same
# order as the names on the left-hand side.
financial_phrase_bank_df, zeroshot_train_df, zeroshot_test_df = (
    load_data(os.path.join("base-data", *relative_parts))
    for relative_parts in (
        ("FinancialPhraseBank", "all-data-75-above.csv"),
        ("twitter-financial-news-sentiment", "sent_train.csv"),
        ("twitter-financial-news-sentiment", "sent_test.csv"),
    )
)

In [5]:

# Fixed seed so the train/test partition (and therefore the data uploaded to
# the feature store) is the same on every Restart & Run All.
RANDOM_SEED = 42

# Pool the three source datasets into a single frame. ignore_index=True gives
# every row a unique index label; without it each CSV keeps its own default
# 0..n-1 range and index labels repeat across sources.
df = pd.concat(
    [financial_phrase_bank_df, zeroshot_train_df, zeroshot_test_df],
    ignore_index=True,
)

# Number of rows per label in the pooled data (kept for inspection).
label_counts = df['label'].value_counts()

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=RANDOM_SEED,
)

# X_train / y_train hold the training texts and their labels;
# X_test / y_test hold the held-out texts and their labels.

## Sanity check of label distribution

In [7]:
# Share of each label in the training split.
y_train_df = y_train.to_frame()
train_value_counts = y_train_df.value_counts()
print(train_value_counts.div(train_value_counts.sum()))

# Share of each label in the test split; it should closely match the training
# split because the split was stratified on the label.
y_test_df = y_test.to_frame()
test_value_counts = y_test_df.value_counts()
print(test_value_counts.div(test_value_counts.sum()))


label
2        0.642886
1        0.213537
0        0.143577
Name: count, dtype: float64
label
2        0.642834
1        0.213520
0        0.143646
Name: count, dtype: float64


In [11]:
# Put each split's texts and labels side by side in one frame.
train_dataset_df, test_dataset_df = (
    pd.concat([texts, labels], axis=1)
    for texts, labels in ((X_train, y_train), (X_test, y_test))
)

# Encode the text column of each split with the tokenizer; get_embedding
# returns a copy holding the encoded column in place of the raw text.
train_dataset_df_embedded, test_dataset_df_embedded = (
    get_embedding(split_df, tokenizer)
    for split_df in (train_dataset_df, test_dataset_df)
)

# Last expression of the cell: display the encoded training frame.
train_dataset_df_embedded

Unnamed: 0,label,embeddings
3010,2,"[791, 2883, 706, 264, 19815, 1205, 369, 95851,..."
4845,2,"[41651, 2467, 47738, 483, 11, 279, 1176, 8954,..."
7246,2,"[4438, 690, 279, 11650, 66512, 6541, 279, 6355..."
1475,2,"[37, 26919, 75967, 551, 323, 12584, 267, 84397..."
238,1,"[35982, 3105, 364, 82, 4272, 11626, 369, 279, ..."
...,...,...
2693,2,"[791, 6130, 374, 2254, 2883, 469, 51137, 75142..."
6675,0,"[6, 47, 32370, 12, 38837, 6, 27851, 8813, 6740..."
5307,2,"[2028, 374, 1405, 330, 32132, 596, 7054, 480, ..."
3123,2,"[59247, 4814, 27212, 520, 38188, 2137, 76, 273..."


In [13]:
# Log in to Hopsworks and keep a handle to the project.
# No credentials are passed explicitly here — presumably login() obtains them
# from the environment or an interactive prompt; TODO confirm.
hopsworks_project = hopsworks.login() 
# Feature store handle used below to fetch/create the feature groups.
fs = hopsworks_project.get_feature_store()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/197784
Connected. Call `.close()` to terminate connection gracefully.


In [16]:
# Fetch version 1 of the training feature group, creating it if it does not
# exist yet.
# NOTE(review): the primary key is the (label, embeddings) pair; there is no
# dedicated id column, so rows with the same label and identical encoded text
# share a key — confirm this is intended.
fg_train = fs.get_or_create_feature_group(
    name="news_sentiment_traindata",
    version=1,
    description="Training data and labels for financial news sentiment prediction model",
    primary_key=["label", "embeddings"],
    online_enabled=True,
)
# Upload the encoded training split; left as the last expression so the
# returned job handle is displayed by the cell.
fg_train.insert(train_dataset_df_embedded)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/197784/fs/197703/fg/324944


Uploading Dataframe: 100.00% |██████████| Rows 12307/12307 | Elapsed Time: 00:09 | Remaining Time: 00:00


Launching job: news_sentiment_traindata_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/197784/jobs/named/news_sentiment_traindata_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x1dd242dbf40>, None)

In [17]:
# Fetch version 1 of the test feature group, creating it if it does not exist
# yet.
# NOTE(review): the primary key is the (label, embeddings) pair; there is no
# dedicated id column, so rows with the same label and identical encoded text
# share a key — confirm this is intended.
fg_test = fs.get_or_create_feature_group(
    name="news_sentiment_testdata",
    version=1,
    description="Test data and labels for financial news sentiment prediction model",
    primary_key=["label", "embeddings"],
    online_enabled=True,
)
# Upload the encoded test split; left as the last expression so the returned
# job handle is displayed by the cell.
fg_test.insert(test_dataset_df_embedded)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/197784/fs/197703/fg/322923


Uploading Dataframe: 100.00% |██████████| Rows 3077/3077 | Elapsed Time: 00:05 | Remaining Time: 00:00


Launching job: news_sentiment_testdata_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/197784/jobs/named/news_sentiment_testdata_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x1dd2438a7c0>, None)