In [1]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# running each cell), so edits to imported project modules are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [276]:
import numpy as np
import pandas as pd

from elections.data_schemas import ArticleSentiment
from elections.sentiment_analysis import SentimentAnalysis

In [248]:
# Load the stored sentiment records. Later cells use the result as a
# DataFrame with (at least) the columns article_id, analysis, system_prompt
# and error_message, where `analysis` holds parsed objects exposing
# `.sentiments` (or null when parsing failed).
sentiments_df = SentimentAnalysis.load_article_sentiments()

In [233]:
import sqlite3
from contextlib import closing

from elections import constants

# Read the raw (unparsed) sentiment table straight from the news database.
query = "SELECT * FROM article_sentiments"

# A sqlite3 connection used directly as a context manager only commits or
# rolls back the open transaction on exit; it does NOT close the connection.
# closing() guarantees the handle is released when the block ends.
with closing(sqlite3.connect(constants.NEWS_DB)) as engine:
    df = pd.read_sql(query, con=engine)

In [246]:
# Sanity check: the raw JSON stored for article 2 should parse back into the
# ArticleSentiment schema.
ArticleSentiment.model_validate_json(
    df.loc[df["article_id"] == 2, "analysis"].iloc[0]
)

ArticleSentiment(sentiments=[Sentiment(name='Pedro Nuno Santos', score=0.6, citations=[Citation(quote='Pedro Nuno Santos tenta o equilíbrio impossível entre a defesa de uma herança — com tantos ministros a dar a cara, seria difícil de outra forma — que deixou a saúde, a educação e a habitação em situação muito difícil, e a mudança. Mas inova pouco, tem pouca ambição, joga pelo seguro, mas não entusiasma', score=0.7, author=None), Citation(quote='Pedro Nuno Santos vai apostar tudo no voto útil, já se viu isso nos debates com os seus ‘camaradas’ de esquerda, o próprio pressente que está no limite do seu potencial eleitoral e começou a dramatização', score=0.5, author='Text')]), Sentiment(name='Luís Montenegro', score=0.7, citations=[Citation(quote='A primeira semana de debates, não é arriscado dizer, foi favorável a Luís Montenegro. Embalado pelas sondagens, conseguiu marcar pontos em todos os debates realizados, esteve seguro, sem particulares rasgos, mas consistente', score=0.8, author

In [247]:
import pandas as pd


def expand_analysis(row):
    """Flatten one article's parsed sentiment analysis into one row per citation.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row["analysis"]``, an object with a ``sentiments``
        list, and ``row["article_id"]``. Each sentiment is read for its
        ``name``, ``score`` and ``citations``; each citation for its
        ``quote`` and ``score``.

    Returns
    -------
    pandas.DataFrame
        Always has the columns ``quote``, ``quote_score``, ``name``,
        ``score`` and ``article_id``, in that order. A sentiment without
        citations contributes a single row with missing
        ``quote``/``quote_score``. An analysis without sentiments yields a
        single placeholder row in which only ``article_id`` is set.
    """
    # Fixed schema: every call returns the same columns regardless of which
    # values happen to be missing, so downstream concatenation is predictable.
    columns = ["quote", "quote_score", "name", "score", "article_id"]
    article_id = row["article_id"]
    sentiments = row["analysis"].sentiments

    if not sentiments:
        # Keep the article visible downstream even though nothing was extracted.
        return pd.DataFrame([{"article_id": article_id}], columns=columns)

    records = []
    for sent in sentiments:
        # A sentiment with no citations still gets one row (empty quote fields)
        # so that its name and overall score are not lost.
        for cite in (sent.citations or [None]):
            records.append(
                {
                    "quote": None if cite is None else cite.quote,
                    "quote_score": None if cite is None else cite.score,
                    "name": sent.name,
                    "score": sent.score,
                    "article_id": article_id,
                }
            )

    # Build the frame once from plain records instead of concatenating one
    # small DataFrame per sentiment.
    return pd.DataFrame(records, columns=columns)
        

In [251]:
# Number of distinct articles that have a sentiment record
sentiments_df["article_id"].nunique()

686

In [261]:
# Records with no parsed analysis (in the output these rows carry an
# error_message instead). The mask is computed inline so this cell does not
# rely on `article_with_sentiments`, which is only assigned in a later cell
# and would raise NameError on a fresh Restart & Run All.
sentiments_df.loc[sentiments_df["analysis"].isnull()]

Unnamed: 0,sentiment_id,creation_datetime,article_id,analysis,system_prompt,user_prompt,error_message
22,25,2024-03-05 16:23:16,23,,# TASK\nYou will be provided with a document d...,"# ARTICLE:\n""""""\n## Title\nR.A.P. sobre Pedro ...","[{'type': 'value_error', 'loc': ['sentiments',..."
27,30,2024-03-05 16:23:44,28,,# TASK\nYou will be provided with a document d...,"# ARTICLE:\n""""""\n## Title\nAfinal, o que disse...","[{'type': 'value_error', 'loc': ['sentiments',..."
35,38,2024-03-05 16:24:43,36,,# TASK\nYou will be provided with a document d...,"# ARTICLE:\n""""""\n## Title\nPedro Nuno Santos c...","[{'type': 'missing', 'loc': ['sentiments', 2, ..."
41,44,2024-03-05 16:25:13,42,,# TASK\nYou will be provided with a document d...,"# ARTICLE:\n""""""\n## Title\nPedro Nuno Santos, ...","[{'type': 'float_type', 'loc': ['sentiments', ..."
47,50,2024-03-05 16:26:41,48,,# TASK\nYou will be provided with a document d...,"# ARTICLE:\n""""""\n## Title\nPedro Nuno Santos v...","[{'type': 'value_error', 'loc': ['sentiments',..."
...,...,...,...,...,...,...,...
642,645,2024-03-05 17:26:21,643,,# TASK\nYou will be provided with a document d...,"# ARTICLE:\n""""""\n## Title\nMariana Mortágua dá...","[{'type': 'value_error', 'loc': ['sentiments',..."
658,661,2024-03-05 17:27:39,659,,# TASK\nYou will be provided with a document d...,"# ARTICLE:\n""""""\n## Title\nInês Sousa Real dei...","[{'type': 'value_error', 'loc': ['sentiments',..."
670,673,2024-03-05 17:28:18,671,,# TASK\nYou will be provided with a document d...,"# ARTICLE:\n""""""\n## Title\nDEBATES 2024. Inês ...","[{'type': 'value_error', 'loc': ['sentiments',..."
680,683,2024-03-05 17:29:19,681,,# TASK\nYou will be provided with a document d...,"# ARTICLE:\n""""""\n## Title\n“A esquerda tem de ...","[{'type': 'value_error', 'loc': ['sentiments',..."


In [249]:
# Boolean mask: True where the analysis column holds a parsed value
article_with_sentiments = sentiments_df["analysis"].notnull()

In [253]:
# Expand every successfully parsed record into per-citation rows, then stack
# the per-article frames into a single table.
article_with_sentiments = sentiments_df["analysis"].notnull()
expanded = (
    sentiments_df
    .loc[article_with_sentiments, ["article_id", "analysis"]]
    .apply(expand_analysis, axis=1)
)
# Drop all-NA columns from each piece before concatenating.
expanded_sentiments = pd.concat(
    [frame.dropna(axis=1, how='all') for frame in expanded],
    ignore_index=True,
)
expanded_sentiments

Unnamed: 0,quote,quote_score,name,score,article_id
0,Luís Montenegro manteve aquilo que faz dele um...,0.8,Luís Montenegro,0.5,1
1,Pedro Nuno Santos disse: ‘Eu não negoceio sobr...,0.3,Luís Montenegro,0.5,1
2,Pedro Nuno Santos esteve a descansar este temp...,0.7,Pedro Nuno Santos,0.6,1
3,Pedro Nuno Santos entrou ao ataque. A frase da...,0.7,Pedro Nuno Santos,0.6,1
4,"Pedro Nuno Santos não teve debates brilhantes,...",0.4,Pedro Nuno Santos,0.6,1
...,...,...,...,...,...
1672,"António José Teixeira, diretor de informação d...",0.4,Luís Montenegro,0.5,685
1673,"Perante a sugestão da AD, que 'entendeu propor...",0.3,Luís Montenegro,0.5,685
1674,'O que esteve sempre na base deste modelo foi ...,0.5,Luís Montenegro,0.5,685
1675,,,Paulo Raimundo,,685


In [263]:
# Inspect the expanded rows of a single article (id 3)
expanded_sentiments.loc[expanded_sentiments["article_id"] == 3]

Unnamed: 0,quote,quote_score,name,score,article_id
13,,,Pedro Nuno Santos,,3
14,,,André Ventura,,3


In [296]:
# Mapping of canonical politician name -> list of accepted aliases.
# NOTE(review): the alias 'porta voz do Partido Aniamis e Natureza' shown in
# the output looks misspelled ('Animais'?) — confirm and fix in
# elections.constants if so.
constants.POLITICIAN_ALIASES

{'Pedro Nuno Santos': ['PNS',
  'P.N.S.',
  'Pedro Nuno',
  'líder do PS',
  'líder do P.S.',
  'líder do Partido Socialista',
  'secretário-geral do PS'],
 'Luís Montenegro': ['Montenegro',
  'presidente do PSD',
  'presidente do P.S.D.',
  'presidente do Partido Social Democrata'],
 'André Ventura': ['Ventura'],
 'Rui Rocha': ['Rocha'],
 'Mariana Mortágua': ['Mortágua',
  'coordenadora do BE',
  'coordenadora do Bloco de Esquerda'],
 'Paulo Raimundo': ['Raimundo', 'secretário-geral do PCP'],
 'Inês de Sousa Real': ['Inês Sousa Real',
  'Sousa Real',
  'porta voz do PAN',
  'porta voz do Partido Aniamis e Natureza'],
 'Rui Tavares': ['Tavares', 'líder do Livre', 'líder do partido Livre']}

In [301]:
# Invert POLITICIAN_ALIASES into a flat alias -> canonical-name lookup.
alias_to_politician = dict(
    (alias, canonical_name)
    for canonical_name, alias_list in constants.POLITICIAN_ALIASES.items()
    for alias in alias_list
)
alias_to_politician

{'PNS': 'Pedro Nuno Santos',
 'P.N.S.': 'Pedro Nuno Santos',
 'Pedro Nuno': 'Pedro Nuno Santos',
 'líder do PS': 'Pedro Nuno Santos',
 'líder do P.S.': 'Pedro Nuno Santos',
 'líder do Partido Socialista': 'Pedro Nuno Santos',
 'secretário-geral do PS': 'Pedro Nuno Santos',
 'Montenegro': 'Luís Montenegro',
 'presidente do PSD': 'Luís Montenegro',
 'presidente do P.S.D.': 'Luís Montenegro',
 'presidente do Partido Social Democrata': 'Luís Montenegro',
 'Ventura': 'André Ventura',
 'Rocha': 'Rui Rocha',
 'Mortágua': 'Mariana Mortágua',
 'coordenadora do BE': 'Mariana Mortágua',
 'coordenadora do Bloco de Esquerda': 'Mariana Mortágua',
 'Raimundo': 'Paulo Raimundo',
 'secretário-geral do PCP': 'Paulo Raimundo',
 'Inês Sousa Real': 'Inês de Sousa Real',
 'Sousa Real': 'Inês de Sousa Real',
 'porta voz do PAN': 'Inês de Sousa Real',
 'porta voz do Partido Aniamis e Natureza': 'Inês de Sousa Real',
 'Tavares': 'Rui Tavares',
 'líder do Livre': 'Rui Tavares',
 'líder do partido Livre': 'Rui T

In [306]:
# Names exactly as stored in the expanded table (no alias replacement applied here)
expanded_sentiments["name"]

0         Luís Montenegro
1         Luís Montenegro
2       Pedro Nuno Santos
3       Pedro Nuno Santos
4       Pedro Nuno Santos
              ...        
1672      Luís Montenegro
1673      Luís Montenegro
1674      Luís Montenegro
1675       Paulo Raimundo
1676          Rui Tavares
Name: name, Length: 1677, dtype: object

In [312]:
# Fold known aliases into their canonical names, then count rows per name.
politician_mentions = expanded_sentiments["name"].replace(alias_to_politician).value_counts()
# reindex (instead of label indexing) so that a tracked politician with zero
# mentions is reported as 0 rather than raising KeyError.
politician_mentions.reindex(constants.POLITICIANS, fill_value=0).sort_values(ascending=False)

name
Pedro Nuno Santos     319
Mariana Mortágua      251
Luís Montenegro       241
André Ventura         225
Rui Rocha             162
Rui Tavares           146
Paulo Raimundo        136
Inês de Sousa Real     89
Name: count, dtype: int64

In [300]:
# Row counts per name exactly as stored, i.e. without replacing aliases,
# so variants of the same politician are counted separately.
expanded_sentiments["name"].value_counts()

name
Pedro Nuno Santos            317
Mariana Mortágua             250
Luís Montenegro              233
André Ventura                220
Rui Rocha                    161
                            ... 
AD                             1
Passos Coelho                  1
Carlos Guimarães Pinto         1
João Cotrim de Figueiredo      1
Matteo Salvini                 1
Name: count, Length: 84, dtype: int64

In [284]:
# Articles for which none of the expanded rows carries a sentiment score
mask_has_score = expanded_sentiments["score"].notnull()
article_ids_with_score = expanded_sentiments.loc[mask_has_score, "article_id"].unique()
article_ids_no_score_set = set(expanded_sentiments["article_id"].unique()).difference(
    article_ids_with_score
)
article_ids_no_score = np.array(sorted(article_ids_no_score_set))
article_ids_no_score

array([  3,   4,   5,   6,   7,   8,  10,  11,  14,  16,  17,  19,  21,
        25,  26,  27,  29,  30,  34,  35,  37,  40,  43,  45,  53,  54,
        55,  56,  59,  61,  63,  65,  69,  71,  73,  74,  76,  78,  79,
        80,  82,  83,  84,  86,  88,  90,  91,  92,  94,  97,  99, 100,
       103, 104, 107, 110, 111, 117, 118, 120, 122, 123, 124, 125, 126,
       128, 129, 130, 133, 134, 135, 136, 137, 139, 140, 141, 142, 143,
       146, 150, 151, 153, 154, 155, 156, 157, 160, 161, 163, 164, 166,
       169, 170, 172, 173, 175, 177, 178, 179, 180, 184, 185, 186, 188,
       191, 193, 194, 200, 203, 204, 205, 209, 215, 218, 224, 225, 229,
       230, 233, 237, 243, 245, 247, 249, 250, 251, 253, 254, 255, 260,
       261, 263, 264, 267, 269, 271, 272, 274, 277, 279, 287, 288, 292,
       293, 295, 296, 299, 301, 304, 310, 311, 315, 317, 318, 319, 320,
       327, 331, 332, 333, 350, 351, 353, 354, 364, 366, 367, 370, 372,
       373, 376, 381, 382, 383, 385, 387, 389, 390, 391, 392, 39

In [290]:
# All expanded rows belonging to the articles that have no score at all
expanded_sentiments.loc[expanded_sentiments["article_id"].isin(article_ids_no_score)]

Unnamed: 0,quote,quote_score,name,score,article_id
13,,,Pedro Nuno Santos,,3
14,,,André Ventura,,3
15,,,Pedro Nuno Santos,,4
16,,,Montenegro,,4
17,,,Pedro Nuno Santos,,5
...,...,...,...,...,...
1663,,,Luís Montenegro,,680
1664,,,Rui Tavares,,680
1665,,,Luís Montenegro,,682
1666,,,Rui Tavares,,682


In [259]:
# Number of distinct articles that have at least one expanded row with a null
# score (this also counts articles where only some rows lack a score).
expanded_sentiments.loc[expanded_sentiments["score"].isnull(), "article_id"].nunique()

365

In [157]:
# print() rather than a bare expression so the prompt's newlines are rendered
print(
    sentiments_df.loc[sentiments_df["article_id"] == 2, "system_prompt"].iloc[0]
)

# TASK
You will be provided with a document delimited by triple quotes and a question. Your task is to answer the question using only the provided document and to cite the passage(s) of the document used to answer the question. If the document does not contain the information needed to answer this question, then simply write: [].

## PROCESS
Take your time to answer the question and go through the following steps:
1. Parse Document: Extract relevant information from the document, such as quotes and mentions of the politicians provided
   by the user
2. Break down passages: if a quote is conveying multiple pieces of information, break it down into smaller parts.
3. Preserve quotes: never alter a passage, even it there are spelling or grammatical mistakes. If you skip part of the text
   then use ... to indicate the skipped part.
4. Filter quotes: For each matched politician, attach in 'citations' all the corresponding quotes that contain an opinion
   (hence not just factual).
5. Scorin