In [1]:
create_tokenizedDF= True
import pandas as pd
from collections import Counter
import nltk

stop_words_nltk = nltk.corpus.stopwords.words('portuguese')
snowball = nltk.stem.snowball.SnowballStemmer('portuguese')

In [2]:
def normlizeTokens(tokenLst, stopwordLst = None, stemmer = None):
    #We can use a generator here as we just need to iterate over it

    #Lowering the case and removing non-words
    workingIter = (w.lower() for w in tokenLst if w.isalpha())

    #Now we can use the semmer, if provided
    if stemmer is not None:
        workingIter = (stemmer.stem(w) for w in workingIter)
         
    #And remove the stopwords
    if stopwordLst is not None:
        workingIter = (w for w in workingIter if w not in stopwordLst)
    #We will return a list with the stopwords removed
    return list(workingIter)

def create_tokenized_dataset(df_input):
    # 1 create token column: tokens
    df_input['tokens']=df_input['speech'].apply(lambda x: nltk.word_tokenize(x))
    # 2 create token without stopwords and stemmer: tokens_stemer_stop. 16min
    df_input['tokens_stemer_stop'] = df_input['tokens'].apply(lambda x: normlizeTokens(x, stopwordLst = stop_words_nltk, stemmer = snowball))
    
    # 3 extract very frequent or rare words: token_cleaned
    flat_tokens=[item for sublist in df_input['tokens_stemer_stop'] for item in sublist]
    aux_c=Counter(flat_tokens)
        
    extrat_common=['par', 'nao', 'sr', 'deput', 'govern', 'muit', 'pel', 'president','tod','tamb','srs','sras','pod','part','psd','sao','aplaus','ja','porqu','faz','ha','diz','quer','pais','sobr','bem','nest']
    extract_rare=[x[0] for x in aux_c.most_common()[-30:]]

    df_input['tokens_cleaned'] = df_input['tokens_stemer_stop'].apply(lambda x : [i for i in x if not i in extrat_common])
    df_input['tokens_cleaned'] = df_input['tokens_stemer_stop'].apply(lambda x : [i for i in x if not i in extract_rare])
    df_input.to_pickel('df_input.pkl')
    return df_input

def generate_N_grams(text,ngram=1):
  words=[word for word in text]  
  temp=zip(*[words[i:] for i in range(0,ngram)])
  ans=[' '.join(ngram) for ngram in temp]
  return ans

def create_grams(df_frame, n):
    grams_d={1: 'uni_grams', 2: 'bi_grams', 3: 'tri_grams'}
    df_frame[grams_d[n]] = df_frame['tokens_cleaned'].apply(lambda x: generate_N_grams(x,n))
    return df_frame 

In [3]:
if create_tokenizedDF==True:
    df_input = pd.read_pickle('parliament_fdf.pkl')  
    df_tok=create_tokenized_dataset(df_input)
else:
    df_tok = pd.read_pickle('df_input.pkl')  

In [7]:
df_tok=create_grams(df_tok, 2)
df_tok=create_grams(df_tok, 3)

In [8]:
df_tok.head(3)

Unnamed: 0,speaker,speech,filename,number,session,term,Date,link,party,speaker_ntime,genre,len_speech,year,tokens,tokens_stemer_stop,tokens_cleaned,bi_grams,tri_grams
0,josemanuelpureza,muito bem,darl13sl01n001.txt,1,1,13,2015-10-23,https://debates.parlamento.pt/catalogo/r3/dar/...,BE,891.0,M,2,2015,"[muito, bem]",[],[],[],[]
1,teresalealcoelho,muito bem,darl13sl01n001.txt,1,1,13,2015-10-23,https://debates.parlamento.pt/catalogo/r3/dar/...,PSD,87.0,F,2,2015,"[muito, bem]",[],[],[],[]
2,luismontenegro,muito bom dia a todos sras e srs deputados cum...,darl13sl01n001.txt,1,1,13,2015-10-23,https://debates.parlamento.pt/catalogo/r3/dar/...,PSD,694.0,M,204,2015,"[muito, bom, dia, a, todos, sras, e, srs, depu...","[bom, dia, sras, cumpr, prax, parlament, cabem...","[bom, dia, sras, cumpr, prax, parlament, cabem...","[bom dia, dia sras, sras cumpr, cumpr prax, pr...","[bom dia sras, dia sras cumpr, sras cumpr prax..."


In [None]:
#political polarization functions

right=[ 'PSD',  'CDS-PP', 'CH','IL','CDS']
left=[ 'PS', 'BE', 'PCP', 'PAN', 'PEV','L']


def create_frequency_table_grams(n_gram=1, indf=None, right_parties=right, left_parties=left):
    grams_d={1: 'uni_grams', 2: 'bi_grams', 3: 'tri_grams'}
   

    r_grams=[item for sublist in indf[indf.party.isin(right)][grams_d[n_gram]] for item in sublist]
    l_grams=[item for sublist in indf[indf.party.isin(left)][grams_d[n_gram]] for item in sublist]

    total_counter = Counter([item for sublist in indf[grams_d[n_gram]] for item in sublist])
    right_counter = Counter(r_grams)
    left_counter = Counter(l_grams)

    df_all = pd.DataFrame.from_dict(total_counter, orient='index').reset_index()
    df_all.columns=['phrase','count']
    df_all['count_right']=[right_counter[x] for x in df_all.phrase]
    df_all['count_left']=[left_counter[x] for x in df_all.phrase]


    df_all['count_left_total']=sum(left_counter.values())
    df_all['count_right_total']=sum(right_counter.values())
    # frequency of every words except the given one (by row)
    #Jensen et al. (2012),p.10 
    #f-pck is the frequency of all phrases used in Con­gress c by party k excluding phrase p
    df_all['f_right_minus']=( df_all['count_right_total']-df_all['count_right'])/df_all['count_right_total']
    df_all['f_left_minus']=( df_all['count_left_total']-df_all['count_left'])/df_all['count_left_total']

    df_all['f_right']=df_all['count_right']/df_all['count_right_total']
    df_all['f_right_norm'] = (df_all['f_right'] - df_all['f_right'].min()) / (df_all['f_right'].max() - df_all['f_right'].min())  
    df_all['f_left']=df_all['count_left']/df_all['count_left_total']
    df_all['f_left_norm'] = (df_all['f_left'] - df_all['f_left'].min()) / (df_all['f_left'].max() - df_all['f_left'].min())  

    df_all['f_right_minus_norm']=(df_all['f_right_minus'] - df_all['f_right_minus'].min()) / (df_all['f_right_minus'].max() - df_all['f_right_minus'].min()) 
    df_all['f_left_minus_norm']=(df_all['f_left_minus'] - df_all['f_left_minus'].min()) / (df_all['f_left_minus'].max() - df_all['f_left_minus'].min()) 

    return df_all


def calculate_pearson(df_all):
    aa=df_all['f_right_norm']*df_all['f_left_minus_norm'] 
    bb=df_all['f_left_norm']*df_all['f_right_minus_norm']
    cc=aa-bb
    dd=cc*cc
    d11=df_all['f_right_norm']+df_all['f_left_norm']
    d22=df_all['f_right_norm']+df_all['f_right_minus_norm']
    d33=df_all['f_left_norm']+df_all['f_left_minus_norm']
    d44=df_all['f_right_minus_norm']+df_all['f_left_minus_norm']
    denom=d11*d22*d33*d44
    pp=dd/denom
    return pp

def create_phrase_partisanship(df):
    aa=df['f_right_norm']+df['f_left_norm']
    df['rho']=df['f_right_norm']/aa
    bb=1-df['f_right_norm']
    df['f_left_norm_scaled']=df['f_right_norm']/bb
    cc=1-df['f_left_norm']
    df['f_right_norm_scaled']=df['f_left_norm']/cc
    df['pi']=df['f_right_norm']*df['rho']
    df['pi_scaled']=(df['pi']/(1-df['f_right_norm']))+((1-df['pi'])/(1-df['f_left_norm']))

    df['gram_partisanship']= 0.5 * (1 - df['pi_scaled'] + 
                           (df['f_right_norm_scaled'] +  df['f_left_norm_scaled']) * df['rho'])
    return df

def create_polarization_correlation(df):
    
    aa=df['f_left_norm']*-1
    bb= df['f_right_norm']*1
    df['beta_polarization']=aa+bb

    return df

In [None]:
def normlizeTokens(tokenLst, stopwordLst = None, stemmer = None):
    #We can use a generator here as we just need to iterate over it

    #Lowering the case and removing non-words
    workingIter = (w.lower() for w in tokenLst if w.isalpha())

    #Now we can use the semmer, if provided
    if stemmer is not None:
        workingIter = (stemmer.stem(w) for w in workingIter)
         
    #And remove the stopwords
    if stopwordLst is not None:
        workingIter = (w for w in workingIter if w not in stopwordLst)
    #We will return a list with the stopwords removed
    return list(workingIter)

def generate_N_grams(text,ngram=1):
  words=[word for word in text]  
  temp=zip(*[words[i:] for i in range(0,ngram)])
  ans=[' '.join(ngram) for ngram in temp]
  return ans

def create_tokens(df_frame):
    nltk.download('stopwords')
    stop_words_nltk = nltk.corpus.stopwords.words('portuguese')

    df_frame['tokenized_text'] = df_frame['speech1'].apply(lambda x: nltk.word_tokenize(x))
    df_frame['normalized_tokens'] = df_frame['tokenized_text'].apply(lambda x: normlizeTokens(x, stopwordLst = stop_words_nltk, stemmer = None))
    df_frame['normalized_tokens_count'] = df_frame['normalized_tokens'].apply(lambda x: len(x))
    return df_frame

def create_grams(df_frame, n):
    df_frame['tri_grams'] = df_frame['normalized_tokens'].apply(lambda x: generate_N_grams(x,n))
    return df_frame

def apply_polarization_model(declaracoes2,right,left):
    dfg3=create_frequency_table_grams(n_gram=3, indf=declaracoes2, right_parties=right, left_parties=left)
    dfg3['pearson_quad']=calculate_pearson(dfg3)
    trigrams_table=dfg3[dfg3.pearson_quad>0]
    trigrams_table=create_phrase_partisanship(trigrams_table)
    trigrams_table=create_polarization_correlation(trigrams_table)
    final_df = trigrams_table.sort_values(by=['gram_partisanship'], ascending=False)
    return final_df

def create_200r(df_mainf):
    indexes_no_applause=[n for n,x in enumerate(df_mainf.phrase) if 'aplausos' not in x]
    df_mainf=df_mainf.iloc[indexes_no_applause]

    tri_final = df_mainf[0:200].append(df_mainf[-200:], ignore_index=True)
    return tri_final