In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc='Cleaning')

from scorers.cleaning import *
from scorers.specificity_vs_vagueness import *
from scorers.objectivity_vs_subjectivity import *
from scorers.rationality_vs_emotionality import *

In [2]:
## Loading and cleaning datasets

# Only documents dated on or after this day are kept.
START_DATE = '1945-01-01'


def first_mode(values):
    """Return a single most-frequent value of `values`, or NaN if there is none.

    `Series.mode()` returns a Series (several entries on ties, empty when every
    value is missing), which is not a valid scalar for a groupby aggregation.
    Taking the first entry gives a deterministic scalar: modes come back sorted.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) > 0 else np.nan


# Manifestos
manifestos = pd.read_csv('../manifesto-forewords/manifestos.csv')
# NOTE(review): if `year` is stored as a bare integer, pd.to_datetime reads it as
# nanoseconds since the epoch rather than as a calendar year -- confirm the CSV
# holds date strings.
manifestos['year'] = pd.to_datetime(manifestos['year'])
# reset_index gives an independent frame whose labels equal row positions, so the
# positional "*_nans" lists built later can safely be used with .drop().
manifestos = manifestos[manifestos['year'] >= START_DATE].reset_index(drop=True)

# PMQs answers: one row per sitting date
pmqs_raw = pd.read_csv('../hansard-pmqs/hansard_pmqs.csv')
pmqs_raw['date'] = pd.to_datetime(pmqs_raw['date'])
pmqs_raw = pmqs_raw[pmqs_raw['date'] >= START_DATE]
# NOTE(review): answers are concatenated with no separator, so the last word of one
# answer abuts the first word of the next unless the text carries its own trailing
# whitespace -- confirm this is intended.
pmqs = (pmqs_raw
        .groupby(by='date', as_index=False)
        .agg({'answer_text': lambda texts: ''.join(texts),
              'answerer_party': first_mode,
              'answerer_name': first_mode}))

# Conference speeches
conferences = pd.read_csv('../conference-speeches/conference.csv')
# Leftover index column from an earlier to_csv; tolerate it being absent.
conferences = conferences.drop(columns='Unnamed: 0', errors='ignore')
conferences['year'] = pd.to_datetime(conferences['year'].astype(str) + '-01-01')
conferences = conferences[conferences['year'] >= START_DATE].reset_index(drop=True)

In [3]:
## Scoring for specificity vs. vagueness

def add_vagueness_scores(frame, text_col, score_desc, save_desc):
    """Clean `frame[text_col]`, score every non-empty text and attach the scores.

    Each key of the dicts returned by `measure_vagueness` becomes a
    "vague_<key>" column on `frame` (added in place). Rows whose cleaned text
    is empty are not scored and receive NaN in those columns.

    Parameters
    ----------
    frame : pandas.DataFrame holding the raw texts.
    text_col : name of the column with the raw text.
    score_desc, save_desc : progress-bar labels.

    Returns
    -------
    (cleaned, nan_positions, scores, vecs): the cleaned texts, the row
    positions whose cleaned text is empty, the list of score dicts for the
    remaining rows, and the same scores as lists of values.
    """
    cleaned = frame[text_col].progress_apply(clean)
    lengths = cleaned.map(len)
    nan_positions = [pos for pos, size in enumerate(lengths) if size == 0]
    scorable = cleaned[lengths > 0]
    scores = [measure_vagueness(text) for text in tqdm(scorable, desc=score_desc)]
    vecs = [list(score.values()) for score in tqdm(scores, desc=save_desc)]
    # Assign through an index-aligned Series: a plain list would have to match the
    # full frame length and raises as soon as one text cleans down to nothing.
    for key in (scores[0] if scores else ()):
        frame["vague_" + key] = pd.Series([score[key] for score in scores],
                                          index=scorable.index)
    return cleaned, nan_positions, scores, vecs


# Manifestos
manifestos_clean, manifestos_nans, manifestos_scores, manifestos_vecs = add_vagueness_scores(
    manifestos, 'foreword', 'Scoring manifestos', 'Saving manifesto scores')
# PMQs answers
pmqs_clean, pmqs_nans, pmqs_scores, pmqs_vecs = add_vagueness_scores(
    pmqs, 'answer_text', 'Scoring PMQs answers', 'Saving PMQs answer scores')
# Conference speeches
conferences_clean, conferences_nans, conferences_scores, conferences_vecs = add_vagueness_scores(
    conferences, 'content', 'Scoring conference speeches', 'Saving conference speech scores')

Cleaning: 100%|██████████████████████████████████████████████████████████████████████| 76/76 [00:00<00:00, 6870.87it/s]
Scoring manifestos: 100%|██████████████████████████████████████████████████████████████| 76/76 [00:01<00:00, 43.94it/s]
Saving manifesto scores: 100%|█████████████████████████████████████████████████████████████████| 76/76 [00:00<?, ?it/s]
Cleaning: 100%|██████████████████████████████████████████████████████████████████| 1806/1806 [00:00<00:00, 2344.29it/s]
Scoring PMQs answers: 100%|████████████████████████████████████████████████████████| 1806/1806 [00:18<00:00, 97.33it/s]
Saving PMQs answer scores: 100%|████████████████████████████████████████████████| 1806/1806 [00:00<00:00, 51395.06it/s]
Cleaning: 100%|█████████████████████████████████████████████████████████████████████| 183/183 [00:00<00:00, 809.74it/s]
Scoring conference speeches: 100%|███████████████████████████████████████████████████| 183/183 [00:05<00:00, 31.85it/s]
Saving conference speech scores: 100%|██

In [4]:
## Scoring for objectivity vs. subjectivity

def add_subjectivity_scores(frame, text_col, score_desc, save_desc):
    """Clean `frame[text_col]`, score every non-empty text and attach the scores.

    The cleaned text is stored in `frame['clean']`. Each key of the dicts
    returned by `measure_subjectivity` becomes a "subj_<key>" column on
    `frame` (added in place). Rows whose cleaned text is empty are not scored
    and receive NaN in those columns.

    Parameters
    ----------
    frame : pandas.DataFrame holding the raw texts.
    text_col : name of the column with the raw text.
    score_desc, save_desc : progress-bar labels.

    Returns
    -------
    (nan_positions, scores, vecs): the row positions whose cleaned text is
    empty, the list of score dicts for the remaining rows, and the same scores
    as lists of values.
    """
    frame['clean'] = frame[text_col].progress_apply(clean)
    has_text = frame['clean'].map(len) > 0
    nan_positions = [pos for pos, present in enumerate(has_text) if not present]
    scorable = frame.loc[has_text, ['clean', text_col]]
    pairs = zip(scorable['clean'], scorable[text_col])
    scores = [measure_subjectivity(text_list=cleaned_text, raw_text=raw_text)
              for cleaned_text, raw_text in tqdm(pairs, total=len(scorable), desc=score_desc)]
    vecs = [list(score.values()) for score in tqdm(scores, desc=save_desc)]
    # Assign through an index-aligned Series: a plain list would have to match the
    # full frame length and raises as soon as one text cleans down to nothing.
    for key in (scores[0] if scores else ()):
        frame["subj_" + key] = pd.Series([score[key] for score in scores],
                                         index=scorable.index)
    return nan_positions, scores, vecs


# Manifestos
manifestos_nans, manifestos_scores, manifestos_vecs = add_subjectivity_scores(
    manifestos, 'foreword', 'Scoring manifesto forewords', 'Saving manifesto scores')
# PMQs answers
pmqs_nans, pmqs_scores, pmqs_vecs = add_subjectivity_scores(
    pmqs, 'answer_text', 'Scoring PMQs answers', 'Saving PMQs answer scores')
# Conference speeches
conferences_nans, conferences_scores, conferences_vecs = add_subjectivity_scores(
    conferences, 'content', 'Scoring conference speeches', 'Saving conference speech scores')

Cleaning: 100%|██████████████████████████████████████████████████████████████████████| 76/76 [00:00<00:00, 4450.00it/s]
Scoring manifesto forewords: 76it [00:04, 16.68it/s]
Saving manifesto scores: 100%|██████████████████████████████████████████████████████| 76/76 [00:00<00:00, 75662.74it/s]
Cleaning: 100%|██████████████████████████████████████████████████████████████████| 1806/1806 [00:00<00:00, 2415.66it/s]
Scoring PMQs answers: 1806it [05:30,  5.47it/s]
Saving PMQs answer scores: 100%|███████████████████████████████████████████████| 1806/1806 [00:00<00:00, 600325.96it/s]
Cleaning: 100%|█████████████████████████████████████████████████████████████████████| 183/183 [00:00<00:00, 809.05it/s]
Scoring conference speeches: 183it [01:34,  1.93it/s]
Saving conference speech scores: 100%|███████████████████████████████████████████████████████| 183/183 [00:00<?, ?it/s]


In [5]:
## Scoring for rationality vs. emotionality

def add_emotionality_scores(frame, text_col, score_desc, save_desc):
    """Clean `frame[text_col]`, score every non-empty text and attach the scores.

    The cleaned text is stored in `frame['clean']`. Each key of the dicts
    returned by `measure_emotionality` becomes an "emot_<key>" column on
    `frame` (added in place). Rows whose cleaned text is empty are not scored
    and receive NaN in those columns.

    Parameters
    ----------
    frame : pandas.DataFrame holding the raw texts.
    text_col : name of the column with the raw text.
    score_desc, save_desc : progress-bar labels.

    Returns
    -------
    (nan_positions, scores, vecs): the row positions whose cleaned text is
    empty, the list of score dicts for the remaining rows, and the same scores
    as lists of values.
    """
    frame['clean'] = frame[text_col].progress_apply(clean)
    has_text = frame['clean'].map(len) > 0
    nan_positions = [pos for pos, present in enumerate(has_text) if not present]
    scorable = frame.loc[has_text, ['clean', text_col]]
    pairs = zip(scorable['clean'], scorable[text_col])
    scores = [measure_emotionality(text_list=cleaned_text, raw_text=raw_text)
              for cleaned_text, raw_text in tqdm(pairs, total=len(scorable), desc=score_desc)]
    vecs = [list(score.values()) for score in tqdm(scores, desc=save_desc)]
    # Assign through an index-aligned Series: a plain list would have to match the
    # full frame length and raises as soon as one text cleans down to nothing.
    for key in (scores[0] if scores else ()):
        frame["emot_" + key] = pd.Series([score[key] for score in scores],
                                         index=scorable.index)
    return nan_positions, scores, vecs


# Manifestos
manifestos_nans, manifestos_scores, manifestos_vecs = add_emotionality_scores(
    manifestos, 'foreword', 'Scoring manifesto forewords', 'Saving manifesto scores')
# PMQs answers
pmqs_nans, pmqs_scores, pmqs_vecs = add_emotionality_scores(
    pmqs, 'answer_text', 'Scoring PMQs answers', 'Saving PMQs answer scores')
# Conference speeches
conferences_nans, conferences_scores, conferences_vecs = add_emotionality_scores(
    conferences, 'content', 'Scoring conference speeches', 'Saving conference speech scores')

Cleaning: 100%|██████████████████████████████████████████████████████████████████████| 76/76 [00:00<00:00, 4470.85it/s]
Scoring manifesto forewords: 76it [00:07, 10.73it/s]
Saving manifesto scores: 100%|██████████████████████████████████████████████████████| 76/76 [00:00<00:00, 76296.58it/s]
Cleaning: 100%|██████████████████████████████████████████████████████████████████| 1806/1806 [00:00<00:00, 2297.89it/s]
Scoring PMQs answers: 1806it [08:33,  3.51it/s]
Saving PMQs answer scores: 100%|███████████████████████████████████████████████| 1806/1806 [00:00<00:00, 602282.98it/s]
Cleaning: 100%|█████████████████████████████████████████████████████████████████████| 183/183 [00:00<00:00, 793.26it/s]
Scoring conference speeches: 183it [02:32,  1.20it/s]
Saving conference speech scores: 100%|███████████████████████████████████████████████████████| 183/183 [00:00<?, ?it/s]


In [6]:
## Save datasets

# Individually...
manifestos.to_csv('scored_datasets/manifestos.csv', index=False)
pmqs.to_csv('scored_datasets/pmqs.csv', index=False)
conferences.to_csv('scored_datasets/conferences.csv', index=False)

# And merged...
party_dict = {'Labour':'LAB',
              'Conservative':'CON',
              'Liberal':'LIB',
              'SDP-Liberal Alliance':'LIB',
              'Liberal Democrat':'LIB',
              np.nan:'NAN'}

# Score columns are the ones created by the scoring cells; match on the full
# prefix so unrelated columns that merely contain "subj"/"emot" are not picked up.
SCORE_PREFIXES = ('vague_', 'subj_', 'emot_')
score_vars = [colname for colname in manifestos.columns if str(colname).startswith(SCORE_PREFIXES)]


def build_score_frame(frame, nan_positions, doc_type, date_col, party_col, recode_party):
    """Return the scored rows of `frame` in the layout of the combined dataset.

    Parameters
    ----------
    frame : scored DataFrame (manifestos, pmqs or conferences).
    nan_positions : row positions (not index labels) of texts that were not scored.
    doc_type : value written to the 'type' column.
    date_col : column copied into 'year'.
    party_col : column copied into 'party'.
    recode_party : when True, translate party names through `party_dict`;
        names missing from it (including NaN) become 'NAN'.

    Returns
    -------
    DataFrame with columns type, year, party followed by `score_vars`.
    """
    # nan_positions are positional, so drop them with a positional mask instead of
    # passing them to .drop(), which would interpret them as index labels.
    keep = np.ones(len(frame), dtype=bool)
    keep[nan_positions] = False
    kept = frame[keep]

    party = kept[party_col]
    if recode_party:
        party = party.map(lambda name: party_dict.get(str(name), 'NAN'))

    merged = kept[score_vars].copy()
    merged.insert(0, 'party', party)
    merged.insert(0, 'year', kept[date_col])
    merged.insert(0, 'type', doc_type)
    return merged


manifesto_df = build_score_frame(manifestos, manifestos_nans, 'manifesto', 'year', 'party', recode_party=False)
pmqs_df = build_score_frame(pmqs, pmqs_nans, 'PMQs', 'date', 'answerer_party', recode_party=True)
conferences_df = build_score_frame(conferences, conferences_nans, 'conference', 'year', 'party', recode_party=True)

# DataFrame.append was removed in pandas 2.0; concatenate once instead of growing
# the frame inside a loop.
score_df = pd.concat([manifesto_df, pmqs_df, conferences_df], ignore_index=True)

score_df.to_csv('scored_datasets/combined.csv', index=False)