In [1]:
# (Re)load IPython's autoreload extension; %reload_ext is safe to run repeatedly.
%reload_ext autoreload
# Mode 2: reload imported modules before each cell execution, so edits to the
# local `elections` package are picked up without restarting the kernel.
%autoreload 2

In [24]:
import json

from typing import Optional

import sqlite3
import instructor
import pandas as pd

from tqdm import tqdm
from openai import OpenAI
from pydantic import ValidationError

from elections import constants
from elections.utils import full_logger
from elections.data_schemas import ArticleSentiment
from elections.scrapers.news_scraper import NewsScraper
from elections.prompts.templates import sentiment_template


#logger = full_logger(constants.LOG_LVL, constants.SENTIMENT_LOG_FN, to_console=False)


class SentimentAnalysis:
    """Extract structured sentiment from news articles with an OpenAI model.

    Typical flow: ``load_articles()`` fills ``articles_df``, then
    ``get_sentiments()`` sends every article to the model and either appends
    the results to the ``article_sentiments`` table or returns them.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # OpenAI client patched by instructor so that `response_model=` can be
        # passed to `chat.completions.create`.
        self.client = instructor.patch(OpenAI())
        # Articles queued for analysis (filled by `load_articles`).
        self.articles_df = pd.DataFrame()
        # One-row DataFrames produced since the last flush to the database.
        self.sentiments = []
        # Number of articles successfully analysed in the current run.
        self.articles_counter = 0

    def load_articles(self, n_articles=None, refresh=False, query=None) -> None:
        """Populate ``self.articles_df`` through ``NewsScraper.load_articles``.

        Args:
            n_articles: Optional cap appended to the query as ``LIMIT``.
                Ignored when ``query`` is given.
            refresh: When True select every article; otherwise only articles
                that have no stored analysis in ``article_sentiments``.
            query: Custom SQL that is run verbatim; overrides the other
                arguments.
        """
        if query is not None:
            self.articles_df = NewsScraper.load_articles(query)
            return
        if refresh:
            query = "SELECT article_id, title, description, text FROM articles"
        else:
            query = """
                SELECT 
                    atc.article_id, title, description, text
                FROM articles atc
                LEFT JOIN article_sentiments atc_s
                ON atc.article_id = atc_s.article_id
                WHERE atc_s.analysis IS NULL
            """
        if n_articles is not None:
            # int() guarantees that only a number is ever spliced into the SQL.
            query = f"{query} LIMIT {int(n_articles)}"
        self.articles_df = NewsScraper.load_articles(query)

    def get_article_sentiment(self, title, description, text) -> pd.DataFrame:
        """Ask the model for the sentiment of a single article.

        Returns:
            A one-row DataFrame with columns ``analysis`` (the parsed
            ``ArticleSentiment``, or ``None`` when the response failed
            validation), ``system_prompt`` and ``user_prompt``.
        """
        system_prompt = sentiment_template.SYSTEM_PROMPT
        user_prompt = sentiment_template.USER_PROMPT.format(title=title, description=description, text=text)

        try:
            # to see the raw response: resp._raw_response.model_dump_json(indent=2)
            resp = self.client.chat.completions.create(
                model=constants.OPENAI_GPT_MODEL,
                response_model=ArticleSentiment,
                max_retries=constants.MAX_RETRIES,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )
        except ValidationError:
            # The caller recognises a failed extraction by `analysis` being None.
            resp = None

        return pd.DataFrame({"analysis": [resp], "system_prompt": [system_prompt], "user_prompt": [user_prompt]})

    def get_sentiments(self, freq=5, save=True) -> Optional[list]:
        """Analyse every loaded article.

        Articles whose analysis could not be obtained are skipped (and
        reported), so they never reach the database and are selected again by
        the default ``load_articles()`` query.

        Args:
            freq: Flush to the database (``save=True``) or print progress
                (``save=False``) after every ``freq`` successful articles.
            save: When True append results to ``article_sentiments`` in
                ``constants.NEWS_DB``; when False keep them in memory.

        Returns:
            ``None`` when ``save`` is True, otherwise the list of one-row
            DataFrames (also available as ``self.sentiments``).

        Raises:
            AssertionError: If no articles have been loaded.
        """
        assert not self.articles_df.empty, "No articles loaded"
        N = len(self.articles_df)

        engine = sqlite3.connect(constants.NEWS_DB) if save else None

        # Start every run from a clean slate so progress messages and the
        # `freq` batching are not skewed by a previous call.
        self.sentiments = []
        self.articles_counter = 0
        try:
            for _, row in tqdm(self.articles_df.iterrows(), total=N):
                sentiment = self.get_article_sentiment(row["title"], row["description"], row["text"])
                # The frame always has one row, so `.empty` cannot detect a
                # failure; a failed extraction shows up as a null `analysis`.
                if sentiment["analysis"].isna().all():
                    print(f"Skipping article {row['article_id']}: no valid analysis returned")
                    continue
                sentiment.insert(loc=0, column="article_id", value=row["article_id"])
                self.sentiments.append(sentiment)
                self.articles_counter += 1
                if self.articles_counter % freq == 0:
                    if save:
                        self._save_sentiments(engine)
                    else:
                        print(f"Extracted {self.articles_counter} of {N}")
            # Final flush / report for the last partial batch. Done after the
            # loop so it does not depend on the last article having succeeded.
            if save:
                self._save_sentiments(engine)
            elif self.articles_counter % freq != 0:
                print(f"Extracted {self.articles_counter} of {N}")
        finally:
            # Always release the connection, even on an error or interrupt.
            # Unsaved results stay in `self.sentiments` for inspection.
            if engine is not None:
                engine.close()

        if save:
            return None
        return self.sentiments

    def _save_sentiments(self, engine) -> None:
        """Append the buffered sentiments to ``article_sentiments`` and clear the buffer.

        Does nothing when the buffer is empty.
        """
        if not self.sentiments:
            return
        sentiments_df = pd.concat(self.sentiments)
        # Store the pydantic object as JSON text.
        sentiments_df["analysis"] = sentiments_df["analysis"].apply(lambda x: x.model_dump_json())
        sentiments_df.to_sql("article_sentiments", con=engine, if_exists="append", index=False)
        self.sentiments = []
        print(f"Saved in DB {self.articles_counter} of {len(self.articles_df)}")

    @staticmethod
    def load_article_sentiments(query=None) -> pd.DataFrame:
        """Read stored sentiments from ``constants.NEWS_DB``.

        Callable on the class or on an instance.

        Args:
            query: Optional SQL string. Anything that is not a string (for
                example an instance passed explicitly, the legacy call shape)
                falls back to selecting the whole ``article_sentiments`` table.

        Returns:
            The query result; an ``analysis`` column, when present, is parsed
            back into ``ArticleSentiment`` objects.
        """
        if not isinstance(query, str):
            query = "SELECT * FROM article_sentiments"
        # `with sqlite3.connect(...)` only manages the transaction, it does
        # not close the connection, so close it explicitly.
        engine = sqlite3.connect(constants.NEWS_DB)
        try:
            df = pd.read_sql(query, con=engine)
        finally:
            engine.close()
        if "analysis" in df.columns:
            df["analysis"] = df["analysis"].apply(lambda x: ArticleSentiment.model_validate_json(x))
        return df
        

        

In [25]:
# Build the analyser and queue the articles to process. With the default
# arguments only articles without a stored analysis are selected.
sentiment_analysis = SentimentAnalysis()
sentiment_analysis.load_articles() # optional: n_articles=<int> caps the batch, refresh=True selects every article
# Display the queued articles as the cell output.
sentiment_analysis.articles_df

Unnamed: 0,article_id,title,description,text
0,3,Pedro Nuno Santos vs André Ventura: oiça aqui ...,Pedro Nuno Santos vs André Ventura: oiça aqui ...,Ouça o frente a frente entre o secretário-gera...
1,4,"Podcast. Afinal, Pedro Nuno Santos não está de...","Podcast. Afinal, Pedro Nuno Santos não está de...",A maior trapalhada do líder do PS deu espaço a...


In [26]:
# Calls the OpenAI API once per queued article. save defaults to True, so with
# freq=1 results are written to the database one article at a time and the
# call returns None.
sentiment_analysis.get_sentiments(freq=1)

  0%|          | 0/2 [00:00<?, ?it/s]

03/04/2024 04:21:29 PM - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


> [0;32m/var/folders/q3/m5wwnysd3kx0n1_6yvsrqghh0000gn/T/ipykernel_37555/390412390.py[0m(68)[0;36mget_article_sentiment[0;34m()[0m
[0;32m     66 [0;31m            [0mresp[0m [0;34m=[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     67 [0;31m[0;34m[0m[0m
[0m[0;32m---> 68 [0;31m        [0;32mreturn[0m [0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m([0m[0;34m{[0m[0;34m"analysis"[0m[0;34m:[0m [0;34m[[0m[0mresp[0m[0;34m][0m[0;34m,[0m [0;34m"system_prompt"[0m[0;34m:[0m [0;34m[[0m[0msystem_prompt[0m[0;34m][0m[0;34m,[0m [0;34m"user_prompt"[0m[0;34m:[0m [0;34m[[0m[0muser_prompt[0m[0;34m][0m[0;34m}[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     69 [0;31m[0;34m[0m[0m
[0m[0;32m     70 [0;31m[0;34m[0m[0m
[0m
ArticleSentiment(sentiments=[Sentiment(name='Pedro Nuno Santos', score=None, citations=[]), Sentiment(name='André Ventura', score=None, citations=[])])
'{\n  "id": "chatcmpl-8z5M7uItpaA1gnYbhZsTHT7WGlWU0",

  0%|          | 0/2 [01:26<?, ?it/s]
