#### Library Imports

In [4]:
import duckdb
import pandas as pd
import concurrent.futures
import numpy as np
import nltk
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from keybert import KeyBERT
from typing import List
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

#### Model Import

In [3]:
# One Hugging Face checkpoint id feeds both the tokenizer and the classifier.
finbert_checkpoint = 'lucas-leme/FinBERT-PT-BR'
# The variable name `tonenizer` (sic) is kept because the pipeline cell refers to it.
tonenizer = BertTokenizer.from_pretrained(finbert_checkpoint)
model = BertForSequenceClassification.from_pretrained(finbert_checkpoint)
# Sentence-embedding model handed to the title-embedding pipeline step further down.
# NOTE(review): presumably an English-centric model; confirm it suits Portuguese titles.
title_embedd_model = SentenceTransformer('all-MiniLM-L6-v2')

# Portuguese stop-word list (downloaded on first use) passed to KeyBERT in get_keywords.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')
# Keyword extractor constructed with its default settings.
kw_model = KeyBERT()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\julio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Pipeline Definition

In [3]:
# Sentiment classifier built from the model and tokenizer loaded in the previous cell.
# get_sentiment and process_sentiment_batch call this object directly.
sentiment_pipeline = pipeline('sentiment-analysis', model=model, tokenizer=tonenizer)

Device set to use cpu


#### Feature Store Functions

In [8]:
def get_sentiment(text:str):
    """Classify the sentiment of a single piece of text.

    Args:
        text: Text to classify, or ``None`` when there is nothing to classify.

    Returns:
        A ``(label, score)`` tuple read from the first result returned by
        ``sentiment_pipeline``. Missing text yields ``("NEUTRAL", 0)``.
    """
    if text is None:
        # Spell the fallback like the labels the pipeline itself emits
        # (upper-case "NEUTRAL"); the previous value "Neural" was a typo and
        # would have created a spurious extra category downstream.
        return "NEUTRAL", 0
    result = sentiment_pipeline(text)[0]
    return result['label'], result['score']


def process_sentiment_batch(titles_batch):
    """Classify the sentiment of every text in a batch.

    Missing entries (``None`` or NaN) are not sent to the model; they receive
    the same neutral fallback that ``get_sentiment`` uses for missing text.
    An empty batch returns two empty lists without calling the model.

    Args:
        titles_batch: Iterable of texts (for example a slice of a DataFrame column).

    Returns:
        A ``(labels, scores)`` tuple of two lists, each with one entry per
        input text and in the same order as the input.
    """
    def _is_missing(value):
        # NaN is the only float that is not equal to itself.
        return value is None or (isinstance(value, float) and value != value)

    texts = list(titles_batch)
    # Pre-fill with the fallback so missing rows keep their position.
    labels = ["NEUTRAL"] * len(texts)
    scores = [0.0] * len(texts)

    present = [pos for pos, text in enumerate(texts) if not _is_missing(text)]
    if present:
        results = sentiment_pipeline([texts[pos] for pos in present])
        # Results come back one per submitted text; write them to their original slots.
        for pos, result in zip(present, results):
            labels[pos] = result['label']
            scores[pos] = result['score']
    return labels, scores


def get_keywords(text:str) -> List[tuple]:
    """Extract up to five single-word keywords from one text with KeyBERT.

    Args:
        text: Document to analyse. ``None``, NaN and blank strings count as
            "no text" and are never passed to the model.

    Returns:
        Whatever ``kw_model.extract_keywords`` returns for the text
        (presumably ``(keyword, score)`` tuples; confirm against the KeyBERT
        documentation), or an empty list when there is no text.
    """
    no_text = (
        text is None
        or (isinstance(text, float) and text != text)  # NaN from a missing cell
        or (isinstance(text, str) and not text.strip())
    )
    if no_text:
        return []

    return kw_model.extract_keywords(
        docs=text,
        stop_words=stopwords,
        use_maxsum=True,
        keyphrase_ngram_range=(1, 1),  # single words only
        nr_candidates=20,
        top_n=5
    )


def process_keywords_batch(titles_batch):
    """Run ``get_keywords`` on each text of a batch.

    Args:
        titles_batch: Iterable of texts; it is materialised into a list first.

    Returns:
        A list with one ``get_keywords`` result per input text, in input order.
    """
    materialised = list(titles_batch)
    keywords_per_text = []
    for text in materialised:
        keywords_per_text.append(get_keywords(text))
    return keywords_per_text


def embbed_title(df: pd.DataFrame, model_to_embbed: SentenceTransformer, column_to_embbed: str, target_column: str) -> pd.DataFrame:
    """Add a whitespace-stripped title column and its embeddings to ``df``.

    The frame is modified in place and also returned, so the function can be
    used as a ``FunctionTransformer`` step.

    Args:
        df: Frame containing a ``'title'`` column of strings.
        model_to_embbed: Object whose ``encode(list_of_str, convert_to_tensor=False)``
            result can be split into one embedding per input text.
        column_to_embbed: Name of the column that receives the stripped titles;
            the text in this column is what gets embedded.
        target_column: Name of the column that receives the embeddings.

    Returns:
        The same ``df`` object with the two columns added (or overwritten).
    """
    # Honour the caller-supplied column name. It used to be ignored in favour
    # of a hard-coded 'cleaned_title'.
    df[column_to_embbed] = df['title'].str.strip()
    embeddings = model_to_embbed.encode(df[column_to_embbed].tolist(), convert_to_tensor=False)
    # list(...) splits the encoder output into one entry per row, so each cell
    # of the target column holds a single vector.
    df[target_column] = list(embeddings)

    return df


def drop_columns(df: pd.DataFrame, columns_to_drop: list) -> pd.DataFrame:
    """Return ``df`` without the listed columns.

    Args:
        df: Source frame; it is not modified.
        columns_to_drop: Column labels to remove.

    Returns:
        A new DataFrame holding the remaining columns.
    """
    # `columns=` already selects the column axis, so no separate axis argument is needed.
    return df.drop(columns=columns_to_drop)

### Data Processing with DuckDB

#### Title Sentiment Analysis

In [5]:
# Read the raw items parquet file into a pandas DataFrame through DuckDB.
filepath = 'local/itens.parquet'
conn = duckdb.connect()
load_query = f"SELECT * FROM '{filepath}'"
df = conn.execute(load_query).fetch_df()

In [6]:
# Cut the title column into consecutive slices of at most `batch_size` rows.
batch_size = 100
title_column = df['title']
batches = [
    title_column[start:start + batch_size]
    for start in range(0, len(df), batch_size)
]

In [7]:
# Flat accumulators for the title sentiment results, filled batch by batch in the next cell.
sentiment_labels, sentiment_scores = [], []

In [8]:
# Start from empty accumulators inside this cell: the loop only appends, so
# re-running the cell on its own would otherwise stack a second copy of the
# results and the lists would no longer match the number of rows in `df`.
sentiment_labels = []
sentiment_scores = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map yields results in the order of `batches`, so the flattened
    # lists stay aligned with the rows the batches were sliced from.
    for labels, scores in tqdm(executor.map(process_sentiment_batch, batches), total=len(batches), desc='Processing Sentiment Analysis', unit='batch'):
        sentiment_labels.extend(labels)
        sentiment_scores.extend(scores)

Processing Sentiment Analysis: 100%|██████████| 2557/2557 [1:33:49<00:00,  2.20s/batch]


In [None]:
# Keep an independent copy of the frame as it is before the sentiment columns are attached.
df_backup = df.copy()

In [10]:
# Attach the flattened title sentiment results as two new columns, then preview.
for column_name, column_values in (
    ('title_sentiment_label', sentiment_labels),
    ('title_sentiment_score', sentiment_scores),
):
    df[column_name] = column_values
df.head()

Unnamed: 0,page,url,issued,modified,title,body,caption,title_sentiment_label,title_sentiment_score
0,13db0ab1-eea2-4603-84c4-f40a876c7400,http://g1.globo.com/am/amazonas/noticia/2022/0...,2022-06-18 20:37:45+00:00,2023-04-15 00:02:08+00:00,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"Após audiência de custódia, a Justiça do Amazo...",Jeferson da Silva Lima foi escoltado por agent...,NEGATIVE,0.813053
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,http://g1.globo.com/pa/santarem-regiao/noticia...,2019-06-20 17:19:52+00:00,2023-06-16 20:19:15+00:00,Linguajar dos santarenos é diferenciado e chei...,Vista aérea de Santarém\nÁdrio Denner/ AD Prod...,As expressões santarenas não significam apenas...,NEUTRAL,0.450133
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,http://g1.globo.com/mundo/noticia/2022/07/08/e...,2022-07-08 08:55:52+00:00,2023-04-15 04:25:39+00:00,Ex-premiê Shinzo Abe morre após ser baleado no...,Novo vídeo mostra que assassino de Shinzo Abe ...,Ex-primeiro-ministro foi atingido por tiros de...,NEGATIVE,0.840971
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,http://g1.globo.com/politica/noticia/2021/09/0...,2021-09-09 19:06:46+00:00,2023-06-07 17:44:54+00:00,"Relator no STF, Fachin vota contra marco tempo...","Relator no STF, Fachin vota contra marco tempo...",Ministro defendeu que posse indígena é diferen...,NEGATIVE,0.721313
4,9dff71eb-b681-40c7-ac8d-68017ac36675,http://g1.globo.com/politica/noticia/2021/09/1...,2021-09-15 19:16:13+00:00,2023-06-07 17:43:39+00:00,"\nApós 2 votos, pedido de vista suspende julga...",Após um pedido de vista (mais tempo para análi...,"Pelo marco temporal, índios só podem reivindic...",NEGATIVE,0.844091


In [11]:
# Checkpoint: persist the frame with the title sentiment columns (row index not written).
out_filepath = 'local/itens_1.parquet'
df.to_parquet(path=out_filepath, index=False)

#### Caption Sentiment Analysis

In [12]:
# Reload the checkpoint written after the title sentiment step.
filepath_1 = 'local/itens_1.parquet'
conn = duckdb.connect()
load_query = f"SELECT * FROM '{filepath_1}'"
df = conn.execute(load_query).fetch_df()

In [15]:
# Preview the first rows of the reloaded frame.
df.head()

Unnamed: 0,page,url,issued,modified,title,body,caption,title_sentiment_label,title_sentiment_score
0,13db0ab1-eea2-4603-84c4-f40a876c7400,http://g1.globo.com/am/amazonas/noticia/2022/0...,2022-06-18 20:37:45+00:00,2023-04-15 00:02:08+00:00,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"Após audiência de custódia, a Justiça do Amazo...",Jeferson da Silva Lima foi escoltado por agent...,NEGATIVE,0.813053
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,http://g1.globo.com/pa/santarem-regiao/noticia...,2019-06-20 17:19:52+00:00,2023-06-16 20:19:15+00:00,Linguajar dos santarenos é diferenciado e chei...,Vista aérea de Santarém\nÁdrio Denner/ AD Prod...,As expressões santarenas não significam apenas...,NEUTRAL,0.450133
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,http://g1.globo.com/mundo/noticia/2022/07/08/e...,2022-07-08 08:55:52+00:00,2023-04-15 04:25:39+00:00,Ex-premiê Shinzo Abe morre após ser baleado no...,Novo vídeo mostra que assassino de Shinzo Abe ...,Ex-primeiro-ministro foi atingido por tiros de...,NEGATIVE,0.840971
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,http://g1.globo.com/politica/noticia/2021/09/0...,2021-09-09 19:06:46+00:00,2023-06-07 17:44:54+00:00,"Relator no STF, Fachin vota contra marco tempo...","Relator no STF, Fachin vota contra marco tempo...",Ministro defendeu que posse indígena é diferen...,NEGATIVE,0.721313
4,9dff71eb-b681-40c7-ac8d-68017ac36675,http://g1.globo.com/politica/noticia/2021/09/1...,2021-09-15 19:16:13+00:00,2023-06-07 17:43:39+00:00,"\nApós 2 votos, pedido de vista suspende julga...",Após um pedido de vista (mais tempo para análi...,"Pelo marco temporal, índios só podem reivindic...",NEGATIVE,0.844091


In [13]:
# Cut the caption column into consecutive slices of at most `batch_size` rows.
batch_size = 100
caption_column = df['caption']
batches = [
    caption_column[start:start + batch_size]
    for start in range(0, len(df), batch_size)
]

In [16]:
# Flat accumulators for the caption sentiment results.
caption_sentiment_labels, caption_sentiment_scores = [], []

In [17]:
# Collect into the caption-specific lists. The loop used to extend
# `sentiment_labels` / `sentiment_scores` (the title results), which left the
# caption lists empty for the column assignment that follows.
# The lists are (re)created here so the cell can be re-run without appending
# duplicate results.
caption_sentiment_labels = []
caption_sentiment_scores = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map yields results in the order of `batches`, keeping the
    # flattened lists aligned with the rows the batches were sliced from.
    for labels, scores in tqdm(executor.map(process_sentiment_batch, batches), total=len(batches), desc='Processing Sentiment Analysis', unit='batch'):
        caption_sentiment_labels.extend(labels)
        caption_sentiment_scores.extend(scores)

Processing Sentiment Analysis: 100%|██████████| 2557/2557 [1:39:59<00:00,  2.35s/batch]  


In [18]:
# Attach the flattened caption sentiment results as two new columns, then preview.
for column_name, column_values in (
    ('caption_sentiment_label', caption_sentiment_labels),
    ('caption_sentiment_score', caption_sentiment_scores),
):
    df[column_name] = column_values
df.head()

Unnamed: 0,page,url,issued,modified,title,body,caption,title_sentiment_label,title_sentiment_score,caption_sentiment_label,caption_sentiment_score
0,13db0ab1-eea2-4603-84c4-f40a876c7400,http://g1.globo.com/am/amazonas/noticia/2022/0...,2022-06-18 20:37:45+00:00,2023-04-15 00:02:08+00:00,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"Após audiência de custódia, a Justiça do Amazo...",Jeferson da Silva Lima foi escoltado por agent...,NEGATIVE,0.813053,NEGATIVE,0.60757
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,http://g1.globo.com/pa/santarem-regiao/noticia...,2019-06-20 17:19:52+00:00,2023-06-16 20:19:15+00:00,Linguajar dos santarenos é diferenciado e chei...,Vista aérea de Santarém\nÁdrio Denner/ AD Prod...,As expressões santarenas não significam apenas...,NEUTRAL,0.450133,NEUTRAL,0.68394
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,http://g1.globo.com/mundo/noticia/2022/07/08/e...,2022-07-08 08:55:52+00:00,2023-04-15 04:25:39+00:00,Ex-premiê Shinzo Abe morre após ser baleado no...,Novo vídeo mostra que assassino de Shinzo Abe ...,Ex-primeiro-ministro foi atingido por tiros de...,NEGATIVE,0.840971,NEGATIVE,0.845363
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,http://g1.globo.com/politica/noticia/2021/09/0...,2021-09-09 19:06:46+00:00,2023-06-07 17:44:54+00:00,"Relator no STF, Fachin vota contra marco tempo...","Relator no STF, Fachin vota contra marco tempo...",Ministro defendeu que posse indígena é diferen...,NEGATIVE,0.721313,NEUTRAL,0.766454
4,9dff71eb-b681-40c7-ac8d-68017ac36675,http://g1.globo.com/politica/noticia/2021/09/1...,2021-09-15 19:16:13+00:00,2023-06-07 17:43:39+00:00,"\nApós 2 votos, pedido de vista suspende julga...",Após um pedido de vista (mais tempo para análi...,"Pelo marco temporal, índios só podem reivindic...",NEGATIVE,0.844091,NEUTRAL,0.708964


In [19]:
# Checkpoint: persist the frame with title and caption sentiment columns (row index not written).
out_filepath = 'local/itens_2.parquet'
df.to_parquet(path=out_filepath, index=False)

#### Body Keywords

In [4]:
# Reload the checkpoint that already carries both sentiment steps.
filepath_1 = 'local/itens_2.parquet'
conn = duckdb.connect()
load_query = f"SELECT * FROM '{filepath_1}'"
df = conn.execute(load_query).fetch_df()

In [5]:
# First 50 rows as a small sample frame; no later cell visible here reads `test`.
test = df.head(50)

In [13]:
# Cut the body column into consecutive slices of at most `batch_size` rows.
batch_size = 100
body_column = df['body']
batches_body = [
    body_column[start:start + batch_size]
    for start in range(0, len(df), batch_size)
]
# Accumulator for the per-article keyword results.
keywords_list = []

In [None]:
# Recreate the accumulator in this cell: executor.map always restarts from the
# first batch, so re-running after an interrupted or finished run would
# otherwise append duplicate results on top of the earlier ones.
keywords_list = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Results arrive in the order of `batches_body`, one list of keyword
    # results per batch; extend() flattens them to one entry per article.
    for batch_keywords in tqdm(
        executor.map(process_keywords_batch, batches_body),
        total=len(batches_body),
        desc='Processing Keyword Extraction',
        unit='batch'
    ):
        keywords_list.extend(batch_keywords)

Processing Keyword Extraction:   2%|▏         | 48/2557 [15:04<12:35:54, 18.08s/batch] 

#### Itens Text Pipeline

In [7]:
# Settings for the title-embedding pipeline defined in the next cell.
column_to_embbed = 'cleaned_title'   # column name for the cleaned title text
target_column = 'embbed_title'       # column name for the embedding vectors
# Columns removed by the final pipeline step.
itens_columns_to_drop_text = [
    'modified',
    'body',
    'url',
    'issued',
    'title',
]


In [9]:
# Step 1: clean the titles and attach their embeddings.
embedding_step = FunctionTransformer(
    embbed_title,
    kw_args={
        'model_to_embbed': title_embedd_model,
        'column_to_embbed': column_to_embbed,
        'target_column': target_column,
    },
)
# Step 2: remove the columns listed in the configuration cell.
drop_step = FunctionTransformer(
    drop_columns,
    kw_args={'columns_to_drop': itens_columns_to_drop_text},
)

itens_pipeline_text = Pipeline(
    steps=[
        ('create_embbeding', embedding_step),
        ('drop_unused_columns', drop_step),
    ]
)

In [10]:
# Load the sentiment-enriched checkpoint that the text pipeline will transform.
filepath_1 = 'local/itens_2.parquet'
conn = duckdb.connect()
load_query = f"SELECT * FROM '{filepath_1}'"
df = conn.execute(load_query).fetch_df()

In [11]:
# Run both pipeline steps on the loaded frame. Despite its name, `text_pipeline`
# holds the transformed DataFrame (embedding columns added, unused columns dropped),
# not the Pipeline object.
text_pipeline = itens_pipeline_text.transform(df)
text_pipeline.head()

Unnamed: 0,page,caption,title_sentiment_label,title_sentiment_score,caption_sentiment_label,caption_sentiment_score,cleaned_title,embbed_title
0,13db0ab1-eea2-4603-84c4-f40a876c7400,Jeferson da Silva Lima foi escoltado por agent...,NEGATIVE,0.813053,NEGATIVE,0.60757,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"[0.028708808, 0.07910229, -0.04915501, 0.02104..."
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,As expressões santarenas não significam apenas...,NEUTRAL,0.450133,NEUTRAL,0.68394,Linguajar dos santarenos é diferenciado e chei...,"[0.05066423, 0.053637918, -0.06246449, -0.0591..."
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,Ex-primeiro-ministro foi atingido por tiros de...,NEGATIVE,0.840971,NEGATIVE,0.845363,Ex-premiê Shinzo Abe morre após ser baleado no...,"[-0.04080393, 0.08898491, -0.041833814, -0.010..."
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,Ministro defendeu que posse indígena é diferen...,NEGATIVE,0.721313,NEUTRAL,0.766454,"Relator no STF, Fachin vota contra marco tempo...","[-0.02016749, 0.13216536, 0.03784579, -0.01504..."
4,9dff71eb-b681-40c7-ac8d-68017ac36675,"Pelo marco temporal, índios só podem reivindic...",NEGATIVE,0.844091,NEUTRAL,0.708964,"Após 2 votos, pedido de vista suspende julgame...","[0.031508345, 0.10205304, 0.03989872, 0.017708..."


In [12]:
# Persist the transformed frame, including the embedding column (row index not written).
out_filepath = 'local/itens_text.parquet'
text_pipeline.to_parquet(path=out_filepath, index=False)