# Encoding sentences using Modern BERT

### Setup

In [1]:
# Optional: uncomment the lines below to install the pinned dependency versions.
#!pip install transformers==4.48.0
#!pip install numpy==1.24.1

In [17]:
import sys
sys.path.append('../')
import numpy as np
import pandas as pd
import torch
from functools import partial
import gc
import os
from datasets import load_dataset
from src.embeddings import ModernBertEncoding
from sklearn.metrics import f1_score
from tqdm import tqdm
from transformers import AutoTokenizer, ModernBertModel
from sklearn.metrics.pairwise import cosine_similarity
# Turn off the Hugging Face tokenizers' own parallelism for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Libraries options
# Register `progress_apply` on pandas objects (used by the encoding cells below).
tqdm.pandas()
# Display up to 999 rows and up to 1000 characters per column when rendering frames.
# Full option names are used on purpose: pandas warns that abbreviated names
# (e.g. 'max_colwidth') may break if a future version adds a similarly named option.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_colwidth', 1000)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Data Access

In [2]:
# Read the data pre-processed by the notebook Datasets/data_treatment.ipynb
dataframe = pd.read_parquet(
    '../Datasets/pre_processed_dataframes/embeddings_df.parquet'
)
# Number of rows loaded (displayed as the cell output)
dataframe.shape[0]

5070

In [3]:
# Show one random row to compare the raw short description with its cleaned version.
# No random_state is passed, so a different row is drawn on every run.
dataframe[['description', 'cleaned_description']].sample()

Unnamed: 0,description,cleaned_description
4512,The Hidden Treasures trio will let you put literal works of art on your wrist.,hidden treasures trio let put literal works art wrist


In [4]:
# Show one random row to compare the raw full description with its cleaned version.
# No random_state is passed, so a different row is drawn on every run.
dataframe[['full_description', 'cleaned_full_description']].sample()


Unnamed: 0,full_description,cleaned_full_description
3399,"S&P has upgraded Netflix to investment grade, admiring the streamer’s financial profile as it continues to lead the market in an increasingly competitive OTT landscape. The ratings agency said Monday it sees the company remaining the dominant player and it took its senior unsecured debt rating from BB+ to BBB blue chip status with outlook stable. Shares of the company behind Squid Game are up about 1.4% at $674. In its report, the agency gave a shout-out to solid revenue growth, rising margins, reduced leverage and positive free operating cash flow in 2022 and beyond. It said its assessment of Netflix is now similar to how it sees Disney’s business risk, although it still views the latter’s overall business as more favorable given iconic franchises monetized across movies, television and theme parks. Related Story 'Virgin River': Mark Ghanimé & Kai Bradbury Join Season 4 As New Series Regulars Netflix’s massive investments in content to get where it is today had led it to weaker ca...",sp upgraded netflix investment grade admiring streamers financial profile continues lead market increasingly competitive ott landscape ratings agency said monday sees company remaining dominant player took senior unsecured debt rating bb bbb blue chip status outlook stable shares company behind squid game report agency gave shoutout solid revenue growth rising margins reduced leverage positive free operating cash flow beyond said assessment netflix similar sees disneys business risk although still views latters overall business favorable given iconic franchises monetized across movies television theme parks related story virgin river mark ghanim kai bradbury join season new series regulars netflixs massive investments content get today led weaker cash flow compared entertainment rivals spending begun moderate peers still early stages building scaled ott services netflix said latest earnings last week expects sustainable positive free cash flow starting back january noted would 
long...


### Encoding Data

In [5]:
# Instantiate the project's ModernBertEncoding helper (from src.embeddings),
# passing the torch device selected in the setup cell.
m_bert = ModernBertEncoding(device=device)

In [6]:
# Sanity check: encode a Portuguese sentence and its English counterpart so
# their embeddings can be compared in the next cell.
embeddings_1 = m_bert.encode('trabalhando com codificação e similaridade de texto')
embeddings_2 = m_bert.encode('working on coding and text similarity')

In [11]:
# cosine_similarity works on 2-D inputs, so each embedding is wrapped in a list
# (assumes m_bert.encode returns a 1-D vector — TODO confirm).
# The result is a 1x1 matrix; [0][0] extracts the single similarity score.
cosine_similarity([embeddings_1], [embeddings_2])[0][0]

0.79073924

In [13]:
# Encode every cleaned short description, calling m_bert.encode once per row with
# a tqdm progress bar, and store the results in a new column.
# The recorded run took about 6 minutes for 5070 rows.
dataframe['m_bert_embeddings_description'] = dataframe['cleaned_description'].progress_apply(m_bert.encode)

100%|██████████| 5070/5070 [05:48<00:00, 14.53it/s]


In [14]:
# Encode every cleaned full description, calling m_bert.encode once per row with
# a tqdm progress bar, and store the results in a new column.
# The recorded run took about 40 minutes for 5070 rows.
dataframe['m_bert_embeddings_full_description'] = dataframe['cleaned_full_description'].progress_apply(m_bert.encode)

100%|██████████| 5070/5070 [40:24<00:00,  2.09it/s]  


#### Save Processed Dataframe

In [16]:
# Persist the dataframe, now including the two embedding columns, without the index.
# NOTE(review): this is the same file the notebook reads its input from, so the
# input is overwritten in place — confirm this is intended before re-running.
dataframe.to_parquet('../Datasets/pre_processed_dataframes/embeddings_df.parquet', index=False)