In [1]:
#imports
import pandas as pd
from lxml import etree
import os
from bs4 import BeautifulSoup
import requests
import xml.etree.ElementTree as ET
import validators
import datetime
import re
import string
import nltk
from nltk.corpus import stopwords

In [2]:
#downloads
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('rslp')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\caperei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\caperei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\caperei\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\caperei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#parameters
characters_for_name=60
legislature='dar/01'
cycle='14'

stop_words_nltk = nltk.corpus.stopwords.words('portuguese')
#stop_words = ["the","it","she","he", "a"] #Uncomment this line if you want to use your own list of stopwords.
#The stemmers and lemmers need to be initialized before bing run
#porter = nltk.stem.porter.PorterStemmer()
snowball = nltk.stem.snowball.SnowballStemmer('portuguese')
#wordnet = nltk.stem.WordNetLemmatizer()
#RSLP=nltk.stem.RSLPStemmer()

In [4]:
#utils

def add_zeros(int_):
    if len(str(int_))<2:
        return '00'+str(int_)
    if len(str(int_))<3:
        return '0'+str(int_)
    if len(str(int_))==3:
        return str(int_)

def get_text_from_html(url__):
    from bs4 import BeautifulSoup
    res = requests.get(url__)
    html_page = res.content
    soup = BeautifulSoup(html_page, 'html.parser')
    return str(soup.noscript)

def read_input_table(current_directory="C:\\Users\caperei",cycle='14',session='1' ):

    data_directory=f"\portuguese_open_data\data\cycle\{cycle}\session\{session}\\numbers_dates_pages.csv"
    input_data=pd.read_csv(r"C:\Users\caperei\portuguese_open_data\data\cyle\14\session\1\numbers_dates_pages.csv")
    return input_data

In [5]:
#regex
def check_dialog(text_):
    pat="(?:</p><p>.*?\(.*?\): —)"
    match=re.findall(pat, text_)
    if match is None :
        return 'no_dialog'
    else:
        for m in match:
            
            return m

def extract_dialog(full_text,characters_for_name):    
    if full_text is None:
        return None
    else:
        dialogs=[(m.start(0), m.end(0)) for m in re.finditer('</p><p></p><p>(.+?): —', full_text)]
        out=[]
        if len(dialogs)==1:
            out.append(full_text[dialogs[0][1]-characters_for_name:])
        if len(dialogs)>1:
            for i in range(0, len(dialogs)-1):
                out.append(full_text[dialogs[i][1]-characters_for_name:dialogs[i+1][0]])
            out.append(full_text[dialogs[len(dialogs)-1][1]-50:])
        return out

    
def extract_party_name(dialogs):
    if dialogs is None:
        return None
    else:
        res=[]
        for i in dialogs:
                    positions=[(m.start(0), m.end(0)) for m in re.finditer('\((.+?)\): —', i)]
                    if len(positions)==0:
                        res.append(['No','No'])
                    else:
                        party=re.findall('\((.+?)\)',i)
                        name_aux=i[positions[0][1]-50:positions[0][1]]
                        name=re.sub(r'\b\w{1,2}\b', '', name_aux).replace('.','').replace ('()','').replace('  ','').replace('<','').replace('>','').replace('/','').replace(': —','')
                        if len(party)>0:
                            party=party[0]
                        res.append([party,name])
        return res

def add_speech_next_page(df):
    for pi in range(1,df.page.max()+1):
        if len(df[df.page==pi].speech.values[0])>0:
            speeches=df[df.page==pi].speech.values[0]
            if '</noscript>' in speeches[-1]:
                
                for n in range(1,df.page.max()-pi):
                    if ': —' in df[df.page==pi+n].text_1.values[0]:
                        
                        in_=df[df.page==pi+n].text_1.values[0].find(': —')
                        df[df.page==pi].speech.values[0][-1]=df[df.page==pi].speech.values[0][-1]+' '+str(df[df.page==pi+n].text_1.values[0][0:in_])
                        
                        break
                    else:
                        
                        df[df.page==pi].speech.values[0][-1]=df[df.page==pi].speech.values[0][-1]+' '+str(df[df.page==pi+n].text_1.values[0])
    return df


#defining the function to remove punctuation
def remove_punctuation(text):
  if(type(text)==float):
    return text
  ans=""  
  for i in text:     
    if i not in string.punctuation+'<p></p>ºª':
      ans+=i    
  return ans

def normlizeTokens(tokenLst, stopwordLst = None, stemmer = None):
    #We can use a generator here as we just need to iterate over it

    #Lowering the case and removing non-words
    workingIter = (w.lower() for w in tokenLst if w.isalpha())

    #Now we can use the semmer, if provided
    if stemmer is not None:
        workingIter = (stemmer.stem(w) for w in workingIter)
         
    #And remove the stopwords
    if stopwordLst is not None:
        workingIter = (w for w in workingIter if w not in stopwordLst)
    #We will return a list with the stopwords removed
    return list(workingIter)

def generate_N_grams(text,ngram=1):
  words=[word for word in text]  
  temp=zip(*[words[i:] for i in range(0,ngram)])
  ans=[' '.join(ngram) for ngram in temp]
  return ans

In [6]:
#functions scrapping
def build_transcripts_table_from_html(session=['01'],legislature='dar/01', cycle='14'):
    df = pd.DataFrame(columns=['legislature','cycle','session','number','date','page', 'text_1','url'])        
    #session=['01'] #['01','02','03']
    numbers_dates=list(zip(input_data.number, input_data.date,input_data.pages )) #[0:1]
    pages=input_data.pages.values #[0:1]
    s_c=[]
    n_c=[]
    d_c=[]
    p_c=[]
    u_c=[]
    t_c=[]
    for s in session:
        for number, date, page in numbers_dates:
            number=add_zeros(number)
            date=datetime.datetime.strptime(date, '%d/%m/%Y').strftime('%Y-%m-%d')
            for page_i in range(1,page+1):
                    url_=f'https://debates.parlamento.pt/catalogo/r3/{legislature}/{cycle}/{s}/{number}/{date}/{page_i}'
                    if validators.url(url_):
                        #print (url_)
                        u_c.append(url_)
                        p_c.append(page_i)
                        t_c.append(get_text_from_html(url_))
                        s_c.append(s)
                        n_c.append(number)
                        d_c.append(date)
    df['page']=p_c
    df['session']=s_c
    df['number']=n_c
    df['date']=d_c

    df['text_1']=t_c
    df['url']=u_c
    df['legislature']=legislature
    df['cycle']=cycle    
    return df

def add_party_speaker(df,characters_for_name ):
    #add columns
    df['speech']=[extract_dialog(t,characters_for_name) for t in df.text_1 ]
    df['party']=[extract_party_name(t) for t in df.speech ]
    df=add_speech_next_page(df)
    #re arrange by speaker and party
    df1=df.explode(['speech', 'party']).reset_index(drop=True)
    df2=df1.dropna(subset=['party'])
    df2['party_s']=[t[0] for t in df2.party]
    df2['speaker']=[t[1] for t in df2.party]
    df2=df2[df2.party_s!='No']
    return df

In [7]:
input_data=read_input_table()
input_data.head(3)

Unnamed: 0,number,date,pages
0,1,25/10/2019,13
1,2,30/10/2019,124
2,3,31/10/2019,26


In [8]:
#=build_transcripts_table_from_html()
#df.to_pickle('portuguese_transcripts_s1.pkl') 
df = pd.read_pickle('portuguese_transcripts_s1.pkl')

In [9]:
df.head()

Unnamed: 0,legislature,cycle,session,number,date,page,text_1,url,speech,party
0,dar/01,14,1,1,2019-10-25,1,"<noscript>\n<p>Sábado, 26 de outubro de 2019 ...",https://debates.parlamento.pt/catalogo/r3/dar/...,[],[]
1,dar/01,14,1,1,2019-10-25,2,<noscript>\n<p>I SÉRIE — NÚMERO 1 </p><p></p><...,https://debates.parlamento.pt/catalogo/r3/dar/...,[es.</p><p></p><p>A Sr.ª Ana Catarina Mendonça...,"[[PS, Ana Catarina Mendonça Mendes ], [No, No]..."
2,dar/01,14,1,1,2019-10-25,3,<noscript>\n<p>26 DE OUTUBRO DE 2019 </p><p>3 ...,https://debates.parlamento.pt/catalogo/r3/dar/...,"[er votado, Sr. Presidente. </p><p></p><p>O Sr...","[[No, No], [PS, Pedro Delgado Alves ]]"
3,dar/01,14,1,1,2019-10-25,4,<noscript>\n<p>I SÉRIE — NÚMERO 1 </p><p></p><...,https://debates.parlamento.pt/catalogo/r3/dar/...,[],[]
4,dar/01,14,1,1,2019-10-25,5,<noscript>\n<p>26 DE OUTUBRO DE 2019 </p><p>5 ...,https://debates.parlamento.pt/catalogo/r3/dar/...,"[residente, muito obrigado. </p><p></p><p>O Sr...","[[No, No]]"


In [10]:
df2=add_party_speaker(df,characters_for_name )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['party_s']=[t[0] for t in df2.party]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['speaker']=[t[1] for t in df2.party]


In [11]:
df2['speech1']= df2['speech'].apply(lambda x:remove_punctuation(x))
df2['tokenized_text'] = df2['speech1'].apply(lambda x: nltk.word_tokenize(x))
df2['normalized_tokens'] = df2['tokenized_text'].apply(lambda x: normlizeTokens(x, stopwordLst = stop_words_nltk, stemmer = snowball))
df2['normalized_tokens_count'] = df2['normalized_tokens'].apply(lambda x: len(x))
df2['uni_grams'] = df2['normalized_tokens'].apply(lambda x: generate_N_grams(x,1))
df2['bi_grams'] = df2['normalized_tokens'].apply(lambda x: generate_N_grams(x,2))
df2['tri_grams'] = df2['normalized_tokens'].apply(lambda x: generate_N_grams(x,3))

In [15]:
df2.head()

Unnamed: 0,legislature,cycle,session,number,date,page,text_1,url,speech,party,speech1,tokenized_text,normalized_tokens,normalized_tokens_count,uni_grams,bi_grams,tri_grams
0,dar/01,14,1,1,2019-10-25,1,"<noscript>\n<p>Sábado, 26 de outubro de 2019 ...",https://debates.parlamento.pt/catalogo/r3/dar/...,[],[],,[],[],0,[],[],[]
1,dar/01,14,1,1,2019-10-25,2,<noscript>\n<p>I SÉRIE — NÚMERO 1 </p><p></p><...,https://debates.parlamento.pt/catalogo/r3/dar/...,[es.</p><p></p><p>A Sr.ª Ana Catarina Mendonça...,"[[PS, Ana Catarina Mendonça Mendes ], [No, No]...",es.</p><p></p><p>A Sr.ª Ana Catarina Mendonça ...,"[es., <, /p, >, <, p, >, <, /p, >, <, p, >, A,...","[p, p, ana, catarin, mendonc, mend, ps, srs, d...",412,"[p, p, ana, catarin, mendonc, mend, ps, srs, d...","[p p, p ana, ana catarin, catarin mendonc, men...","[p p ana, p ana catarin, ana catarin mendonc, ..."
2,dar/01,14,1,1,2019-10-25,3,<noscript>\n<p>26 DE OUTUBRO DE 2019 </p><p>3 ...,https://debates.parlamento.pt/catalogo/r3/dar/...,"[er votado, Sr. Presidente. </p><p></p><p>O Sr...","[[No, No], [PS, Pedro Delgado Alves ]]","er votado, Sr. Presidente. </p><p></p><p>O Sr....","[er, votado, ,, Sr., Presidente, ., <, /p, >, ...","[er, vot, president, p, p, president, muit, ob...",755,"[er, vot, president, p, p, president, muit, ob...","[er vot, vot president, president p, p p, p pr...","[er vot president, vot president p, president ..."
3,dar/01,14,1,1,2019-10-25,4,<noscript>\n<p>I SÉRIE — NÚMERO 1 </p><p></p><...,https://debates.parlamento.pt/catalogo/r3/dar/...,[],[],,[],[],0,[],[],[]
4,dar/01,14,1,1,2019-10-25,5,<noscript>\n<p>26 DE OUTUBRO DE 2019 </p><p>5 ...,https://debates.parlamento.pt/catalogo/r3/dar/...,"[residente, muito obrigado. </p><p></p><p>O Sr...","[[No, No]]","residente, muito obrigado. </p><p></p><p>O Sr....","[residente, ,, muito, obrigado, ., <, /p, >, <...","[resident, muit, obrig, p, p, president, muit,...",273,"[resident, muit, obrig, p, p, president, muit,...","[resident muit, muit obrig, obrig p, p p, p pr...","[resident muit obrig, muit obrig p, obrig p p,..."
