In [1]:
import os
import glob
import pandas as pd
import re

In [2]:
conjunto_dados = pd.DataFrame()

SPEAKER_REGEX = re.compile(r'[A|O] Sr\.ª?( [\w\-,]+)+([\-\w]+)?( \([\w\s\.,\-]+\))*(?=:)')
MULTILINE_SPEAKER_REGEX = re.compile("^" + SPEAKER_REGEX.pattern, flags=re.M) 

for file in os.scandir("./discursos"):
    speech_by_speaker = []

    with open(file, encoding="utf8") as fp:
        content = fp.read()

        #1)remove dates and numbers

        datas_texto = re.compile("\d{1,2} [A-Z]+ [A-Z]+ [A-Z]+ \d{4} (\n\n\d+)?")
        datas_texto.sub(" ", content)

        series_numeros = re.compile("[A-Z]+ SÉRIE \W NÚMERO \d+ (\n+\d+)?")
        series_numeros.sub(" ", content)

        #2) criar dicionario de intervencoes para criar dataframe

        #2.1) identificar os intervenientes e intervencoes correspondentes

        deputadxs = MULTILINE_SPEAKER_REGEX.finditer(content) 
        deputadxs = [match.group() for match in deputadxs]
        deputadxs = list(set(deputadxs))

        #deputadxs = [deputadxs[5]]
        #print(deputadxs)

        content_on_one_line = ''.join(content.splitlines())

        for speaker in deputadxs:
            content_split_by_speaker = content_on_one_line.split(speaker + ':')
            # Get rid of everything before they speak
            content_split_by_speaker.pop(0)

            # what_they_say = [speech for speech in speaker_regex]
            # for each line in content_split_by_speaker
            # use the speaker regex to find the next speaker and get everything before that
            for line in content_split_by_speaker:
                start_of_next_speaker = SPEAKER_REGEX.finditer(line)

                # either an index or None
                # if it's None, either:
                # - the regex isn't accurate enough; or
                # - the speaker is the only person to speak]
                next_speaker_indexes = [match.start() for match in start_of_next_speaker]
                next_speaker_index = next_speaker_indexes[0] if len(next_speaker_indexes) > 0 else None

                speech = line[0:next_speaker_index] if next_speaker_index is not None else line
                speech_by_speaker.append((speaker, speech))

                # if "):" in speech:
                #     print(speech)
        
        df = pd.DataFrame.from_records(speech_by_speaker, columns=["speaker", "speech"])#.reset_index()
        df['filename'] = os.path.basename(file)
        #print(df)
                
        conjunto_dados = pd.concat([conjunto_dados, df])
conjunto_dados.to_pickle('conjunto_dados.pkl')  
#conjunto_dados = pd.read_pickle('conjunto_dados.pkl')          
#conjunto_dados#.to_csv("deputados.csv")#, index=False)
    

In [7]:
conjunto_dados.filename.nunique()

619

In [4]:
conjunto_dados.tail(3)

Unnamed: 0,speaker,speech,filename
211,O Sr. António Filipe (PCP),"— Sr. Presidente, Srs. Membros do Governo, Sr...",darl14sl01n042.txt
212,O Sr. João Pinho de Almeida (CDS-PP),— Muito bem!,darl14sl01n042.txt
213,O Sr. João Pinho de Almeida (CDS-PP),— Muito bem!,darl14sl01n042.txt


In [5]:
conjunto_dados["speech"] = [string.lstrip("— ") for string in conjunto_dados["speech"]]


partidos_raw = conjunto_dados["speaker"].str.split("(", n=1, expand=True)
partidos_raw.columns = ["speaker", "partido"]
partidos_raw["partido"] = partidos_raw["partido"].str.rstrip(")")
partidos_raw["speech"] = conjunto_dados["speech"]
partidos_raw["filename"] = conjunto_dados["filename"]
partidos_raw["speaker"] = partidos_raw["speaker"].str.replace(r"[A|O] Sr\.ª? ", "")
partidos_raw["speaker"] = partidos_raw["speaker"].str.rstrip(" ")
#conjunto_dados = conjunto_dados.merge(partidos_raw, how="right", on="speaker") it was breaking (maybe memory issues?)

#partidos_raw.to_csv("deputados.csv", index=False)

  partidos_raw["speaker"] = partidos_raw["speaker"].str.replace(r"[A|O] Sr\.ª? ", "")


In [6]:
partidos_raw['number']=[ii[11:14] for ii in partidos_raw.filename]
partidos_raw['session']=[ii[8:10] for ii in partidos_raw.filename]
partidos_raw['legislatura']=[ii[4:6] for ii in partidos_raw.filename]
partidos_raw['speaker'] = partidos_raw['speaker'].str.lower()
partidos_raw.head()

Unnamed: 0,speaker,partido,speech,filename,number,session,legislatura
0,presidente,,"Sr.as e Srs. Deputados, ao assumir a direção d...",darl13sl01n001.txt,1,1,13
1,presidente,,Tem a palavra o Sr. Deputado Ferro Rodrigues.,darl13sl01n001.txt,1,1,13
2,presidente,,"Sr.as e Srs. Deputados, o Sr. Secretário vai p...",darl13sl01n001.txt,1,1,13
3,presidente,,"Srs. Deputados, o projeto de resolução está em...",darl13sl01n001.txt,1,1,13
4,presidente,,"Srs. Deputados, penso que não há objeções, pel...",darl13sl01n001.txt,1,1,13


In [9]:
partidos_raw['number']=pd.to_numeric(partidos_raw['number'])
partidos_raw.columns=['speaker', 'partido', 'speech', 'filename', 'number', 'session',
       'term']

In [7]:
dates_term=pd.read_csv('discursos/dates_terms.csv')

In [20]:

dates_term['number']=pd.to_numeric(dates_term['number'],errors='coerce')
dates_term=dates_term.dropna()
dates_term.dtypes

Date        object
pages        int64
term         int64
session      int64
number     float64
dtype: object

In [28]:


partidos_raw['number']=pd.to_numeric(partidos_raw['number'],errors='coerce')
partidos_raw['session']=pd.to_numeric(partidos_raw['session'],errors='coerce')
partidos_raw['term']=pd.to_numeric(partidos_raw['term'],errors='coerce')

partidos_raw1=partidos_raw[partidos_raw['number'].notna()]
partidos_raw1=partidos_raw[partidos_raw['session'].notna()]
partidos_raw1=partidos_raw[partidos_raw['term'].notna()]



partidos_raw1.dtypes

speaker     object
partido     object
speech      object
filename    object
number       int64
session      int64
term         int64
dtype: object

In [35]:
speeches_raw=partidos_raw1.merge(dates_term, on=['term','session','number'], how='left')

In [40]:
speeches_raw.columns

Index(['speaker', 'partido', 'speech', 'filename', 'number', 'session', 'term',
       'Date', 'pages'],
      dtype='object')

In [60]:
#speeches_raw['clean_speaker']=speeches_raw['speaker'].str.replace(r'[^\w\s]+', '')
speeches_raw['clean_speaker']=speeches_raw['speaker'].str.lower()
speeches_raw['clean_speaker']=speeches_raw['clean_speaker'].str.replace('ç','c')
speeches_raw['clean_speaker']=speeches_raw['clean_speaker'].str.replace('[^\w\s]', '')
import unidecode
def remove_accents(a):
    return unidecode.unidecode(a)

speeches_raw['clean_speaker'] = speeches_raw['clean_speaker'].apply(remove_accents)

In [81]:
speeches_raw['speaker_len']=speeches_raw['clean_speaker'].str.len()

In [92]:
speeches_raw=speeches_raw[speeches_raw.speaker_len<=25]
speeches_raw=speeches_raw[speeches_raw.clean_speaker!='presidente']
speeches_raw = speeches_raw[~speeches_raw['clean_speaker'].str.contains('secretari', na=False)]
speeches_raw = speeches_raw[~speeches_raw['clean_speaker'].str.contains('ministr', na=False)]
speeches_raw = speeches_raw[~speeches_raw['clean_speaker'].str.contains('deputad', na=False)]
speeches_raw = speeches_raw[~speeches_raw['clean_speaker'].str.contains('presidente', na=False)]


In [100]:
tirar deputados com cargos ('secretario'): pesquisar normes

In [103]:
speeches_raw.clean_speaker.unique()

array(['antoniocosta', 'nunomagalhaes', 'telmocorreia',
       'teresalealcoelho', 'luismontenegro', 'heloisaapolonia',
       'josemanuelpureza', 'joaooliveira', 'pedrofilipesoares',
       'ferrorodrigues', 'miguelsantos', 'josemourasoeiro',
       'teresacaeiro', 'vaniabarros', 'isabelgalricaneto',
       'joseluisferreira', 'jorgemachado', 'adaosilva',
       'paulotrigopereira', 'hugolopessoares', 'catarinamartins',
       'joanamortagua', 'brunodias', 'helenaroseta', 'carloscesar',
       'joaogalamba', 'fernandorochaandrade', 'jeronimodesousa',
       'paulateixeiradacruz', 'ceciliameireles', 'andresilva',
       'paulasantos', 'anapaulavitorino', 'pedrodelgadoalves',
       'carlosabreuamorim', 'jorgeduartecosta', 'isabelpires', 'ritarato',
       'pedronunosantos', 'helderamaral', 'jorgefalcatosimoes',
       'alvarocastelobranco', 'antoniofilipe', 'paulinoascencao',
       'franciscomendesdasilva', 'duartepacheco', 'conceicaobessaruao',
       'heitorsousa', 'duartefilipemarq

In [None]:
df['Great'] = (df["Summary"].str.lower()
                            .str.contains("great", regex=False, na=False)
                            .astype(int))

In [80]:
list_speakers=speeches_raw[speeches_raw.term==13].clean_speaker.unique()

list_speakers_len=.hist()

AttributeError: 'numpy.ndarray' object has no attribute 'hist'

In [6]:
deputies=pd.read_csv('deputies_2015_now.csv')

In [7]:
deputies=deputies[['nome', 'circulo_eleitoral', 'partido', 'legislatura']]
deputies['nome'] = deputies['nome'].str.lower()
deputies.columns=['speaker', 'circulo_eleitoral', 'partido', 'legislatura']
deputies

Unnamed: 0,speaker,circulo_eleitoral,partido,legislatura
0,carlos matias,Santarém,BE,13
1,catarina martins,Porto,BE,13
2,domicilia costa,Porto,BE,13
3,ernesto ferraz,Madeira,BE,13
4,fernando manuel barbosa,Porto,BE,13
...,...,...,...,...
570,rui silva,Braga,PSD,14
571,sandra pereira,Lisboa,PSD,14
572,sara madruga da costa,Madeira,PSD,14
573,sérgio marques,Madeira,PSD,14


In [8]:
dd=partidos_raw.merge(deputies[['speaker', 'partido']], on='speaker', how='left')
speech_selected=dd[~dd.partido_y.isna()]

In [9]:
speech_selected=speech_selected[['speaker', 'partido_x', 'speech', 'filename', 'number', 'session',
       'legislatura']]
speech_selected.columns=['speaker', 'party', 'speech', 'filename', 'number', 'session',
       'legislatura']

In [10]:
speech_selected.head()

Unnamed: 0,speaker,party,speech,filename,number,session,legislatura
1,teresa leal coelho,PSD,Muito bem!,darl13sl01n001.txt,1,1,13
2,nuno magalhães,CDS-PP,"Peço a palavra, Sr. Presidente.",darl13sl01n001.txt,1,1,13
3,nuno magalhães,CDS-PP,"Sr. Presidente, queria apenas esclarecer que n...",darl13sl01n001.txt,1,1,13
4,nuno magalhães,CDS-PP,"Sr. Presidente, Sr.as e Srs. Deputados: Não po...",darl13sl01n001.txt,1,1,13
5,nuno magalhães,CDS-PP,O Sr. Presidente falou do ex-Presidente Jaime ...,darl13sl01n001.txt,1,1,13


In [11]:
speech_selected.to_csv('speech_selected.csv')