In [1]:
import pandas as pd
import numpy as np
import datetime
import re
import os
from gensim.models.word2vec import Word2Vec
import random as rn

In [2]:
# Load the cleaned speech table and keep only rows that actually contain a speech.
df = pd.read_csv('../out_files/tell_all_cleaned.csv')
print(df.shape)  # shape before filtering

df = df[df['speech'].notna()]
print(df.shape)  # shape after dropping rows whose speech is missing

# Parse the day/month/year strings into datetime values. Bracket assignment is
# used so the write always targets the column rather than an instance attribute.
df['sitting_date'] = pd.to_datetime(df['sitting_date'], format="%d/%m/%Y")

# Preview the first few speeches (last expression of the cell, so it is displayed).
df['speech'].head(4)

(1280918, 12)
(1274505, 12)


0    παρακαλειται @sw γραμματεας βουλγαρακης @sw συ...
1    παρακαλειται @sw κυριος γραμματεας @sw συνοδευ...
2    κυριοι συναδελφοι παρακαλω @sw βουλη @sw εξουσ...
3                                              @sw @sw
Name: speech, dtype: object

In [3]:
print('Preparing data...')

# Author's note on the target format: the cade tool uses
# gensim.models.word2vec.LineSentence() to iterate over the training corpus.
# LineSentence() takes as input a file that contains sentences, one line = one
# sentence; words must already be preprocessed and separated by whitespace.

# Raw string: '\s' is not a valid escape in a normal string literal and triggers
# an invalid-escape warning on recent Python versions.
_MULTI_WHITESPACE = re.compile(r'\s\s+')


def _speech_to_sentence_lines(text):
    """Rewrite one speech so that every dot-delimited sentence sits on its own line.

    Steps (same order as the original per-column passes):
      1. put spaces around every dot so the dot is a separate token;
      2. append a closing ' . ' because the last sentence of a speech has no dot;
      3. turn existing newlines into spaces;
      4. collapse every run of two or more whitespace characters into one space;
      5. turn every '. ' into '.' followed by a newline.

    Example: 'a.b' -> 'a .\\nb .\\n'

    Parameters
    ----------
    text : str
        The speech text.

    Returns
    -------
    str
        The speech with one sentence per line; it always ends with '.\\n'.
    """
    text = text.replace('.', ' . ') + ' . '
    text = text.replace('\n', ' ')
    text = _MULTI_WHITESPACE.sub(' ', text)
    return text.replace('. ', '.\n')


def _strip_period_label(label):
    """Remove the ' review 9' and 'period ' fragments from a period label."""
    return label.replace(' review 9', '').replace('period ', '')


# Single pass over the speeches instead of one pass per string operation.
df['speech'] = df['speech'].apply(_speech_to_sentence_lines)

df = df.rename(columns={'parliamentary_period': 'period'})

# Adjust period names: strip the words so that only the number remains (easy to
# sort later on), then merge small periods into larger ones.
df['period'] = df['period'].apply(_strip_period_label).astype(int)

# small period -> period it is merged into
_MERGED_PERIODS = {
    5: 7,
    6: 7,
    14: 15,  # 2012-2014
    16: 17,  # 2015-2019
}
for _small_period, _merged_period in _MERGED_PERIODS.items():
    df.loc[df['period'] == _small_period, 'period'] = _merged_period

print('Group by periods...')
# Every speech now ends with '.\n', so joining with the empty string keeps the
# one-sentence-per-line layout across speech boundaries.
PERperiod_df = df.groupby('period')['speech'].apply(''.join).reset_index()


Preparing data...
Group by periods...


In [4]:
# Persist the per-period table (one row per period, speeches concatenated) as CSV,
# without writing the DataFrame index as a column.
per_period_csv_path = '../out_files/PERperiod_df.csv'
PERperiod_df.to_csv(per_period_csv_path, index=False)

In [5]:
print('Creating training corpora for each period...')

training_texts_dir = 'training_texts/PERperiod/'
# exist_ok=True creates the directory tree when missing and is a no-op otherwise,
# replacing the separate existence check.
os.makedirs(training_texts_dir, exist_ok=True)

# Write one plain-text corpus file per period, named "<period>.txt".
for period, speech in zip(PERperiod_df.period, PERperiod_df.speech):
    corpus_path = os.path.join(training_texts_dir, str(period) + '.txt')
    # Explicit UTF-8: without it open() falls back to the locale's default
    # encoding, which may be unable to encode the Greek text or may produce
    # files that differ from machine to machine.
    with open(corpus_path, 'w', encoding='utf-8') as corpus_file:
        corpus_file.write(speech)

print('Done!')

Creating training corpora for each period...
Done!
