In [1]:
#import packages
import os
import glob
import pandas as pd
import re

In [2]:
#Create dataframe from parliament-txt transcripts
# Run 1 time only
#dialogsDF
dialogsDF = pd.DataFrame()

#regex expressions to extract speakers
SPEAKER_REGEX = re.compile(r'[A|O] Sr\.ª?( [\w\-,]+)+([\-\w]+)?( \([\w\s\.,\-]+\))*(?=:)')
MULTILINE_SPEAKER_REGEX = re.compile("^" + SPEAKER_REGEX.pattern, flags=re.M) 
c=0
for file in os.scandir("./discursos"):
    speech_by_speaker = []

    with open(file, encoding="utf8") as fp:
        content = fp.read()
    
        #1)extract dates and numbers

        dates_txt = re.compile("\d{1,2} [A-Z]+ [A-Z]+ [A-Z]+ \d{4} (\n\n\d+)?")
        dates_txt.sub(" ", content)

        series_numb = re.compile("[A-Z]+ SÉRIE \W NÚMERO \d+ (\n+\d+)?")
        series_numb.sub(" ", content)

        deputies = MULTILINE_SPEAKER_REGEX.finditer(content) 
        deputies = [match.group() for match in deputies]
        deputies = list(set(deputies))

        content_on_one_line = ''.join(content.splitlines())

        for speaker in deputies:
            content_split_by_speaker = content_on_one_line.split(speaker + ':')
            # Get rid of everything before they speak
            content_split_by_speaker.pop(0)

            # what_they_say = [speech for speech in speaker_regex]
            # for each line in content_split_by_speaker
            # use the speaker regex to find the next speaker and get everything before that
            for line in content_split_by_speaker:
                start_of_next_speaker = SPEAKER_REGEX.finditer(line)

                # either an index or None
                # if it's None, either:
                # - the regex isn't accurate enough; or
                # - the speaker is the only person to speak]
                next_speaker_indexes = [match.start() for match in start_of_next_speaker]
                next_speaker_index = next_speaker_indexes[0] if len(next_speaker_indexes) > 0 else None

                speech = line[0:next_speaker_index] if next_speaker_index is not None else line
                speech_by_speaker.append((speaker, speech))

                # if "):" in speech:
                #     print(speech)
        
        df = pd.DataFrame.from_records(speech_by_speaker, columns=["speaker", "speech"])#.reset_index()
        df['filename'] = os.path.basename(file)
        #print(df)
                
        dialogsDF = pd.concat([dialogsDF, df])
dialogsDF.to_pickle('dialogsDF.pkl')  
        
#conjunto_dados#.to_csv("deputados.csv")#, index=False)
    

In [37]:
# load dataframe with speaches
dialogsDF = pd.read_pickle('dialogsDF.pkl')  
#see it
print(dialogsDF.shape)
dialogsDF.tail(3)

(151340, 3)


Unnamed: 0,speaker,speech,filename
114,O Sr. Pedro Filipe Soares (BE),— Muito bem!,darl14sl03n031.txt
115,O Sr. Pedro Filipe Soares (BE),— Exatamente!,darl14sl03n031.txt
116,O Sr. Afonso Oliveira (PSD),— Isso é conversa!,darl14sl03n031.txt


In [4]:
def remove_punctuation(col_name, df):
    df[col_name]=df[col_name].str.replace('[^\w\s]', '')
    print('Punctuation removed')
    return df

def remove_special_char(col_name, df, char1,char2):
    df[col_name]=df[col_name].str.replace(char1,char2)
    print('Character '+char1 + 'replaced by '+ char2)
    return df

def aux_remove_accents(a):
    import unidecode
    return unidecode.unidecode(a)

def remove_accents(col_name, df ):
    df[col_name] = df[col_name].apply(aux_remove_accents)
    print('Accents removed')
    return df

def date_format(df, date_col, old_format="%d/%m/%Y", new_format="%Y-%m-%Y"):
    import datetime
    df[date_col] = pd.to_datetime(df[date_col], format =old_format)
    df[date_col] = pd.to_datetime(df[date_col], format =new_format)
    df[date_col]= df['Date'].dt.date
    return df

In [38]:
# 1 Speakers selection
#remove ministers, secretaries, presidents' dialogs
dialogsDF['clean_speaker']=dialogsDF['speaker'].str.lower()
dialogsDF= remove_special_char('clean_speaker', dialogsDF, 'ç','c') #dialogsDF['clean_speaker'].str.replace('ç','c')
dialogsDF= remove_punctuation('clean_speaker', dialogsDF) #dialogsDF['clean_speaker'].str.replace('[^\w\s]', '')
dialogsDF= remove_accents('clean_speaker', dialogsDF)
dialogsDF = dialogsDF[~dialogsDF['clean_speaker'].str.contains('secretari*', na=False)]
dialogsDF = dialogsDF[~dialogsDF['clean_speaker'].str.contains('ministr*', na=False)]
dialogsDF = dialogsDF[~dialogsDF['clean_speaker'].str.contains('presidente*', na=False)]
dialogsDF.drop('clean_speaker',  axis=1, inplace=True)
# 1.1 Speakers cleaning name
aux = dialogsDF["speaker"].str.split("(", n=1, expand=True)
aux.columns = ["speaker", "party"]
dialogsDF["speaker"] = aux["speaker"].str.replace(r"[A|O] Sr\.ª? ", "")
dialogsDF["speaker"] = dialogsDF["speaker"].str.rstrip(" ")
dialogsDF['speaker'] = dialogsDF['speaker'].str.lower()
dialogsDF= remove_special_char('speaker', dialogsDF, 'ç','c') #dialogsDF['clean_speaker'].str.replace('ç','c')
dialogsDF= remove_punctuation('speaker', dialogsDF) #dialogsDF['clean_speaker'].str.replace('[^\w\s]', '')
dialogsDF= remove_accents('speaker', dialogsDF)

#2 speach data cleaning
dialogsDF["speech"] = [string.lstrip("— ") for string in dialogsDF["speech"]]
dialogsDF['speech'] = dialogsDF['speech'].str.lower()
dialogsDF= remove_special_char('speech', dialogsDF, 'ç','c') #dialogsDF['clean_speaker'].str.replace('ç','c')
dialogsDF= remove_punctuation('speech', dialogsDF) #dialogsDF['clean_speaker'].str.replace('[^\w\s]', '')
dialogsDF= remove_accents('speech', dialogsDF)

# 3 creation of number columns
dialogsDF['number']=[ii[11:14] for ii in dialogsDF.filename]
dialogsDF['number']=pd.to_numeric(dialogsDF['number'],errors='coerce')
dialogsDF['session']=[ii[8:10] for ii in dialogsDF.filename]
dialogsDF['session']=pd.to_numeric(dialogsDF['session'],errors='coerce')
dialogsDF['term']=[ii[4:6] for ii in dialogsDF.filename]
dialogsDF['term']=pd.to_numeric(dialogsDF['term'],errors='coerce')

# 4 creation of party column
dialogsDF["party"] = aux["party"].str.rstrip(")")
# 5 creation of dates column for each parliament session 
dates_term=pd.read_csv('dates_terms.csv')
dates_term['number']=pd.to_numeric(dates_term['number'],errors='coerce')
dates_term=dates_term.dropna()
dialogsDF=dialogsDF.merge(dates_term, on=['term','session','number'], how='left')
# 6 date format
dialogsDF=date_format(dialogsDF, 'Date')
# 7 creation of link 
strnumb = lambda x : '00'+str(x) if(x <10) else ('0'+str(x) if x < 100 else x) 
dialogsDF['link_numb']=dialogsDF.number.apply( strnumb)
dialogsDF['link']='https://debates.parlamento.pt/catalogo/r3/dar/01/'+dialogsDF.term.map(str) +'/0'+ dialogsDF.session.map(str)+'/'+ dialogsDF.link_numb.map(str) +'/'+ dialogsDF.Date.map(str)
dialogsDF.drop('link_numb',  axis=1, inplace=True)

Character çreplaced by c


  df[col_name]=df[col_name].str.replace('[^\w\s]', '')


Punctuation removed
Accents removed


  dialogsDF["speaker"] = aux["speaker"].str.replace(r"[A|O] Sr\.ª? ", "")


Character çreplaced by c


  df[col_name]=df[col_name].str.replace('[^\w\s]', '')


Punctuation removed
Accents removed
Character çreplaced by c
Punctuation removed


In [25]:
dialogsDF.head()

Unnamed: 0,speaker,speech,filename,number,session,term,party,Date,link
0,teresa leal coelho,muito bem,darl13sl01n001.txt,1,1,13,PSD,2015-10-23,https://debates.parlamento.pt/catalogo/r3/dar/...
1,telmo correia,era uma coligacao,darl13sl01n001.txt,1,1,13,CDS-PP,2015-10-23,https://debates.parlamento.pt/catalogo/r3/dar/...
2,nuno magalhaes,peco a palavra sr presidente,darl13sl01n001.txt,1,1,13,CDS-PP,2015-10-23,https://debates.parlamento.pt/catalogo/r3/dar/...
3,nuno magalhaes,sr presidente queria apenas esclarecer que nao...,darl13sl01n001.txt,1,1,13,CDS-PP,2015-10-23,https://debates.parlamento.pt/catalogo/r3/dar/...
4,nuno magalhaes,sr presidente sras e srs deputados nao posso n...,darl13sl01n001.txt,1,1,13,CDS-PP,2015-10-23,https://debates.parlamento.pt/catalogo/r3/dar/...


In [None]:
# 8 selection of parties

dialogsDF.loc[dialogsDF["party"] == "PEN", "party"] = 'PAN'

dialogsDF.loc[dialogsDF["party"] == "OS Verdes", "party"] = 'PEV'
dialogsDF.loc[dialogsDF["party"] == "Os verdes", "party"] = 'PEV'
dialogsDF.loc[dialogsDF["party"] == "Os Vedes", "party"] = 'PEV'
dialogsDF.loc[dialogsDF["party"] == "Os Verdes", "party"] = 'PEV'
dialogsDF.loc[dialogsDF["party"] == "s Verdes", "party"] = 'PEV'

dialogsDF.loc[dialogsDF["party"] == "SD", "party"] = 'PSD'
dialogsDF.loc[dialogsDF["party"] == "PD", "party"] = 'PSD'

dialogsDF.loc[dialogsDF["party"] == "CDS", "party"] = 'CDS-PP'
dialogsDF.loc[dialogsDF["party"] == "CSD-PP", "party"] = 'CDS-PP'
dialogsDF.loc[dialogsDF["party"] == "CDS-P", "party"] = 'CDS-PP'

dialogsDF.loc[dialogsDF["party"] == "PC", "party"] = 'PCP'

dialogsDF.loc[dialogsDF["party"] == "B E", "party"] = 'BE'
dialogsDF.loc[dialogsDF["party"] == "Bloco de Esquerda", "party"] = 'BE'

dialogsDF.loc[dialogsDF["party"] == "Partido Socialista", "party"] = 'PS'

dialogsDF.loc[dialogsDF["party"] == "N insc.", "party"] = "N insc"
dialogsDF.loc[dialogsDF["party"] == "N Insc.", "party"] = "N insc"
dialogsDF.loc[dialogsDF["party"] == "Ninsc.", "party"] = "N insc"
dialogsDF.loc[dialogsDF["party"] == "N. insc.", "party"] = "N insc"
dialogsDF.loc[dialogsDF["party"] == "Não insc.", "party"] = "N insc"

In [None]:
dialogsDF['speaker'] = dialogsDF['speaker'].str.replace('^a ', '')
dialogsDF['speaker'] = dialogsDF['speaker'].str.replace(' e ', '')
dialogsDF['speaker'] = dialogsDF['speaker'].str.replace(' ', '')

In [27]:
dialogsDF.party.unique()
dialogsDF = dialogsDF[dialogsDF.party!='N insc']

In [28]:
dialogsDF.party.unique()

array(['PSD', 'CDS-PP', 'PS', 'PCP', 'BE', 'PEV', 'PAN', None, 'L', 'IL',
       'CH'], dtype=object)

In [36]:
#list unica de nomes e parties bem, merge e criar coluna de party final (paulo portas cds e paulo portas none)
dialogsDF.groupby(['speaker','party']).size() #.to_csv('fix_deputies.csv')

speaker         party 
abelbaptista    CDS-PP      95
adaosilva       PSD       1605
adaosiva        PSD          2
afonsooliveira  PS           1
                PSD        325
                          ... 
verbraz         PS          44
verprata        PCP          4
vitalinocanas   PS          67
wandaguimaraes  PS          22
wandguimaraes   PS         193
Length: 649, dtype: int64

In [20]:
dialogsDF[dialogsDF.party.isnull()]

Unnamed: 0,speaker,speech,filename,number,session,term,party,Date,link
1605,paulo portas,Já é um coletivo!,darl13sl01n012.txt,12,1,13,,2015-12-02,https://debates.parlamento.pt/catalogo/r3/dar/...
1606,paulo portas,Mas isso o PCP não acha!,darl13sl01n012.txt,12,1,13,,2015-12-02,https://debates.parlamento.pt/catalogo/r3/dar/...
1660,carlos césar,"Bastará, parafraseando o líder parlamentar do ...",darl13sl01n012.txt,12,1,13,,2015-12-02,https://debates.parlamento.pt/catalogo/r3/dar/...
2072,deputado está a fazer uma única tentativa,manter o poder dos sindicatos. Isso é muito po...,darl13sl01n014.txt,14,1,13,,2015-12-09,https://debates.parlamento.pt/catalogo/r3/dar/...
2139,deputado diz assim,«O que está aqui em causa é uma determinada in...,darl13sl01n015.txt,15,1,13,,2015-12-10,https://debates.parlamento.pt/catalogo/r3/dar/...
...,...,...,...,...,...,...,...,...,...
90037,"deputado nessa altura disse, e cito",«Encerrar consulados por vezes é um caminho de...,darl14sl03n011.txt,11,3,14,,2021-10-14,https://debates.parlamento.pt/catalogo/r3/dar/...
90479,deputado pergunta,«Porquê agora?». Sr. Deputado Cotrim de Figuei...,darl14sl03n015.txt,15,3,14,,2021-10-22,https://debates.parlamento.pt/catalogo/r3/dar/...
91254,"deputado, e bem, diz",«Nós não cedemos à chantagem!» Muito bem! Mas ...,darl14sl03n018.txt,18,3,14,,2021-11-02,https://debates.parlamento.pt/catalogo/r3/dar/...
91624,deputado diz,«O Iniciativa Liberal parece presciente; a 5 d...,darl14sl03n021.txt,21,3,14,,2021-11-10,https://debates.parlamento.pt/catalogo/r3/dar/...
