In [1]:
import unicodedata
from collections import defaultdict
import re
from collections import Counter
import operator
import pandas as pd
import time
import datetime
from tqdm import tqdm
import tqdm.notebook as tq

print(datetime.datetime.now().time())

# \u00b7 is middle dot
# \u0387 is Greek ano teleia
punct_regex = re.compile(r"([?.!,;\u00b7\u0387])")

def str_clean(s):
   normalized = ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')
   separate_punct = re.sub(punct_regex, r" \1 ", normalized)
   collapse_spaces  = re.sub(r'\s+', " ", separate_punct)
   lc = collapse_spaces.lower()
   return lc

word_freq = defaultdict(int)

df = pd.read_csv('../out_files/tell_all_cleaned.csv') #, nrows=10
print('read input file')
df = df[df['speech'].notna()]
print('cleaning speeches...')
df.speech = df.speech.apply(lambda x: str_clean(x))
print('speeches cleaned')

df.speech = df.speech.apply(lambda x: x.replace(".", " "))
df.sitting_date = pd.to_datetime(df.sitting_date, format="%d/%m/%Y")

#New column year
df['year'] = df['sitting_date'].dt.year


19:52:53.303430
read input file
cleaning speeches...
speeches cleaned


In [3]:

PERyear_df = df.copy().groupby(df.year)['speech'].apply(''.join).reset_index() #concat sentences, each last sentence for each speech did not have dot so add one.

mask1 = (PERyear_df['year'] >= 1997) & (PERyear_df['year'] <= 2007)
corpus_before = '\n'.join([text for text in PERyear_df.loc[mask1].speech])

mask2 = (PERyear_df['year'] >= 2008) & (PERyear_df['year'] <= 2018)
corpus_after = '\n'.join([text for text in PERyear_df.loc[mask2].speech])

crisis_dichotomy_df = pd.DataFrame(data=[['1997_2007', corpus_before],
                                         ['2008_2018', corpus_after]],
                                   columns = ['period', 'speech'])


In [4]:

periods = crisis_dichotomy_df.period.to_list()


for period in periods:
    print(period)
    subdf = crisis_dichotomy_df.loc[(crisis_dichotomy_df.period==period)]

    tell_all = subdf.speech.iloc[0].lower()

    # tell_all = re.sub("\d+", "", tell_all)
    tell_all = re.sub("\s\s+" , ' ', tell_all)

    freqs = Counter()
    subdf.speech.apply(lambda x: freqs.update(x.split()))
    print('finished counting')
    total_number = sum(freqs.values())
    print('total number of tokens:', total_number)

    freqs_df = pd.DataFrame.from_dict(freqs, orient='index',
                                      columns=['frequency'])
    freqs_df = freqs_df.reset_index()

    freqs_df = freqs_df.rename(columns={'index': 'word'})
    mask = (freqs_df['word'].str.len() > 1)
    freqs_df = freqs_df.loc[mask]
    print('Removed entries with one character.')

    freqs_df = freqs_df.sort_values('frequency').reset_index(drop=True)

    freqs_df['percentage'] = freqs_df['frequency'] / total_number

    freqs_df.to_csv('../out_files/freqs_for_semantic_shift_cleaned_data_period'+str(period)+'.csv', index=False)

print(datetime.datetime.now().time())


1997_2007
finished counting
total number of tokens: 59484378
Removed entries with one character.
2008_2018
finished counting
total number of tokens: 72465035
Removed entries with one character.
19:58:09.130683
