# Encoding sentences using Google BERT

### Setup

In [1]:
import sys
# Add the parent directory to the import search path; this has to run before the
# `src.embeddings` import below (presumably `src` lives there — confirm against the repo layout).
sys.path.append('../')
from src.embeddings import BertEncoding
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from tqdm import tqdm
# Register tqdm with pandas so that `progress_apply` (used by the encoding cells) is available.
tqdm.pandas()

# Display options: allow up to 999 rows and up to 1000 characters per cell so that
# long text columns are not truncated when a frame is displayed.
# Full option keys are used on purpose: abbreviated keys are resolved by pattern
# matching and can become ambiguous if pandas adds options with similar names.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_colwidth', 1000)

  from scipy.sparse.base import spmatrix


### Data Access

In [2]:
# Read the data that was processed in the notebook Datasets/data_treatment.ipynb,
# then display its number of rows.
clean_df_path = '../Datasets/pre_processed_dataframes/clean_df.parquet'
dataframe = pd.read_parquet(clean_df_path)
len(dataframe.index)

5070

In [3]:
# Show one random row with the raw description next to its cleaned counterpart.
description_columns = ['description', 'cleaned_description']
dataframe[description_columns].sample(n=1)

Unnamed: 0,description,cleaned_description
1037,"Con la terza generazione, gli auricolari della Mela cambiano design e ai arricchiscono di nuove funzioni, come lo Spatial Audio. Non hanno la cancellazione del rumore, ma per il resto rimangono tra le migliori scelte possibili per chi usa iPhone o altri dispositivi Apple",terza generazione auricolari mela cambiano design arricchiscono nuove funzioni spatial audio cancellazione rumore resto rimangono migliori scelte possibili usa iphone altri dispositivi apple


In [4]:
# Show one random row with the raw full description next to its cleaned counterpart.
full_description_columns = ['full_description', 'cleaned_full_description']
dataframe[full_description_columns].sample(n=1)


Unnamed: 0,full_description,cleaned_full_description
3690,"The Duke Blue Devils (3-4, 0-0 ACC) are 16.5-point underdogs in a road ACC matchup with the No. 13 Wake Forest Demon Deacons (7-0, 0-0 ACC) on Saturday, October 30, 2021 at Truist Field. An over/under of 70 is set for the contest. For more great betting and fantasy insight, join the SI Winners Club Newsletter . 43.1 Avg. Points Scored 27 26.3 Avg. Points Allowed 31.6 469.6 Avg. Total Yards 472.3 437.4 Avg. Total Yards Allowed 443.3 6 Giveaways 14 16 Takeaways 10",duke blue devils acc point underdogs road acc matchup wake forest demon deacons acc saturday october truist field overunder set contest great betting fantasy insight join si winners club newsletter avg points scored avg points allowed avg total yards avg total yards allowed giveaways takeaways


### Encoding Data

In [None]:
# Instantiate the project's BertEncoding wrapper, pointing it at the
# multilingual, uncased BERT model identifier.
bert_model_path = "google-bert/bert-base-multilingual-uncased"
bert = BertEncoding(model_path=bert_model_path)

In [10]:
# Sanity check: encode one sentence in Portuguese and one in English and compare them.
embeddings_1 = bert.encode('trabalhando com codificação e similaridade de texto')
embeddings_2 = bert.encode('working on coding and text similarity')

# cosine_similarity works on 2-D inputs, so each embedding is wrapped in a list
# (assumes bert.encode returns a single 1-D vector — confirm in src.embeddings).
# The result is a 1x1 matrix; its only entry is displayed.
pairwise_similarity = cosine_similarity([embeddings_1], [embeddings_2])
pairwise_similarity[0][0]

0.95860857

In [11]:
# Encode every cleaned description, one bert.encode call per row with a progress bar,
# and store the results in a new column.
description_embeddings = dataframe['cleaned_description'].progress_apply(bert.encode)
dataframe['embeddings_description'] = description_embeddings

100%|██████████| 5070/5070 [02:58<00:00, 28.40it/s]


In [12]:
# Encode every cleaned full description, one bert.encode call per row with a progress bar,
# and store the results in a new column.
full_description_embeddings = dataframe['cleaned_full_description'].progress_apply(bert.encode)
dataframe['embeddings_full_description'] = full_description_embeddings

100%|██████████| 5070/5070 [19:22<00:00,  4.36it/s]


In [16]:
# Spot-check one random row: the stored embedding next to the text it was computed from.
embedding_preview_columns = ['embeddings_description', 'cleaned_description']
dataframe[embedding_preview_columns].sample(n=1)

Unnamed: 0,embeddings_description,cleaned_description
643,"[0.13930863, 0.02107478, 0.17793927, 0.13717964, 0.19489713, 0.40748084, 0.15471444, -0.109299876, -0.15556882, 0.25827262, -0.22628173, -0.19271514, 0.24328078, -0.15290435, -0.19517908, 0.05396582, 0.1683832, 0.10641208, 0.24336971, 0.10384193, -0.047929145, -0.087368645, 0.11448673, -0.016624652, 0.2564484, -0.12062563, 0.2663201, 0.122429, 0.33047232, 0.2845122, 0.2219186, 0.22002906, 0.23417999, -0.08621383, 0.20073403, -0.04206367, 0.0029858006, 0.12263694, 0.24665397, 0.063056156, 0.116892956, 0.39674792, 0.14863832, -0.09163023, -0.3144099, 0.19757235, -0.17352544, -0.08221583, 0.9999861, 0.17563765, 0.10817127, -0.13731626, 0.096274935, -0.2991946, 0.23775183, 0.99999034, -0.32492906, -0.24051991, 0.022516152, -0.11128705, -0.17710134, 0.07313303, 0.27487308, 0.12452492, -0.10343348, 0.05483115, -0.09359155, 0.34586966, 0.018977689, 0.11809938, 0.08833186, -0.10347227, 0.21836795, 0.2974961, -0.14496973, 0.06181199, -0.16611166, -0.1412799, -0.14549284, 0.17117874, 0.17288...",lots unpack


#### Save Processed Dataframe

In [14]:
# Write the frame to parquet without the index.
embeddings_df_path = '../Datasets/pre_processed_dataframes/embeddings_df.parquet'
dataframe.to_parquet(embeddings_df_path, index=False)