In [3]:
import sys
sys.path.append("..")
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd

## Basic Set-up

Load the raw data and transform it into a format suitable for training.


In [1]:
# Root folder for the raw input files; interpolated into the read paths below.
PREFIX = '../data'

In [90]:
# Annotator personality-trait table that ships with the Webis editorial corpus.
traits_csv = f'{PREFIX}/corpus-webis-editorial-quality-18/corpus-webis-editorial-quality-18_annotators-personality-traits.csv'
users = pd.read_csv(traits_csv)

In [91]:
# Expose the row position as an 'index' column; it is used below as the numeric annotator ID.
users = users.reset_index()

In [95]:
# Lookup from the corpus annotator ID ('id') to its row position in `users` ('index').
ann_id_map = dict(zip(users['id'], users['index']))

In [7]:
vote_data = pd.read_json(f'{PREFIX}/corpus-webis-editorial-quality-18.json')

In [129]:
# Embeddings are easier to drive with integer IDs, so translate every annotator ID
# into its numeric counterpart.
vote_data['user_id'] = vote_data['annotator_id'].map(ann_id_map)

In [130]:
# Article IDs are stored as file names; strip the '.txt' suffix and keep the integer part.
vote_data['doc_id'] = (
    vote_data['article_id']
    .str.replace('.txt', '', regex=False)
    .astype('int64')
)

In [131]:
# Candidate training targets, both derived from the abstracted effect annotation:
#   y_bin_effect  - True for any value other than 2
#   y_three_class - the abstracted effect value itself
effect = vote_data['effect_abstracted']
vote_data['y_bin_effect'] = effect.ne(2)
vote_data['y_three_class'] = effect

In [132]:
# Handle duplicate articles: keep only the votes whose article appears in the
# article table. The table is loaded here because this cell sits above the cell that
# otherwise defines `raw_text_data`; without the load, a fresh top-to-bottom run
# fails with a NameError.
raw_text_data = pd.read_json(f'{PREFIX}/articles_with_majority_adus.json')
vote_data = vote_data[vote_data.doc_id.isin(raw_text_data.idx)]

In [133]:
# Persist the prepared vote table.
# NOTE(review): absolute, machine-specific output path - consider deriving it from a configurable directory.
vote_data.to_csv('/home/ec2-user/final_paper_data_v2/editorials/vote_data.csv')

In [9]:
# Article-level table; later cells read its `idx` (article ID) and `content` (text) columns.
raw_text_data = pd.read_json(f'{PREFIX}/articles_with_majority_adus.json')

In [10]:
raw_text_data.head()

Unnamed: 0,idx,ids,split_label,liberal_majority,content,adu_anecdote,adu_other,adu_statistics,adu_testimony,conservative_majority
0,1851784,1851784,test,no_effect,''The familiar and the fancy are combined to s...,8,9,1,2,no_effect
1,1845399,1845399,test,reinforcing,"A $141,000 salary seems generous, but for judg...",8,13,1,0,reinforcing
2,1677322,1677322,train,reinforcing,A State Supreme Court judge yesterday removed ...,12,11,0,1,challenging
3,1818732,1818732,train,reinforcing,A battle between Yonkers and its neighbors ove...,7,8,0,0,no_effect
4,1844646,1844646,test,reinforcing,A bill headed for a vote in the Senate would u...,9,11,1,2,challenging


# Text Features

## Style Features

First, separate out the article content and run it through the LIWC 2015 engine.

In [11]:
# Export only the article ID and text for LIWC. The DataFrame index is written as the
# first CSV column, so `idx` lands in the second column and `content` in the third.
raw_text_data[['idx', 'content']].to_csv('just_text.csv')

In [12]:
# Load the LIWC output, indexed by its second column ('Source (B)').
# Presumably that column carries the exported `idx` values - confirm against the LIWC file.
liwc_data = pd.read_csv('LIWC2015 Results (just_text).csv', index_col=1)

In [13]:
# Drop the two other 'Source' columns in place (presumably LIWC's copies of the
# exported row number and article text - confirm).
liwc_data.drop(['Source (C)', 'Source (A)'], axis=1, inplace=True)

In [14]:
# Tag every column as a LIWC feature: lower-case the name and prepend 'liwc_'.
liwc_data.columns = ['liwc_' + name.lower() for name in liwc_data.columns]
liwc_data.head()

Unnamed: 0_level_0,liwc_wc,liwc_analytic,liwc_clout,liwc_authentic,liwc_tone,liwc_wps,liwc_sixltr,liwc_dic,liwc_function,liwc_pronoun,...,liwc_comma,liwc_colon,liwc_semic,liwc_qmark,liwc_exclam,liwc_dash,liwc_quote,liwc_apostro,liwc_parenth,liwc_otherp
Source (B),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1851784,462,86.99,73.49,4.86,41.42,25.67,29.22,71.65,46.32,9.52,...,4.76,0.22,0.0,0.0,0.0,2.38,0.0,5.19,1.3,0.22
1845399,529,92.72,46.22,27.39,39.27,25.19,22.5,78.45,43.29,5.29,...,7.94,0.0,0.19,0.19,0.0,0.95,0.0,2.65,0.0,1.51
1677322,478,96.57,49.16,7.47,32.97,20.78,25.94,75.52,43.72,3.56,...,4.81,0.21,0.0,0.0,0.0,3.35,0.0,2.93,0.42,0.84
1818732,439,94.8,58.99,33.54,29.58,31.36,25.51,77.22,45.1,6.38,...,6.15,0.0,0.0,0.0,0.0,2.28,0.0,0.91,0.46,0.46
1844646,541,97.45,50.0,8.48,22.9,19.32,23.29,84.66,46.77,5.55,...,2.96,0.55,0.0,0.0,0.0,0.74,0.0,1.48,0.0,0.0


Next, we run the custom spaCy library that adds NRC, "CMV", and MPQA style features. It also produces the raw lemmas, which can be used for further analysis. Specific data sources can be found in the `smart_spacy` code file.

In [15]:
from irt_lib.smart_spacy import load_custom_spacy, get_style_features

In [31]:
nlp = load_custom_spacy()

In [40]:
# Run the style-feature extractor over every article text, keeping the results in row order.
final_features = [
    get_style_features(article_text, nlp)
    for article_text in raw_text_data.content
]
    

In [44]:
# Collect the per-article extractor results into a table; missing entries are replaced with 0.
spacy_features = pd.DataFrame(final_features).fillna(0)

In [48]:
# Move 'Source (B)' from the index back into a column, leaving a positional index
# for the column-wise concat with `spacy_features` that follows.
liwc_data = liwc_data.reset_index()

In [74]:
# Put the LIWC and spaCy features side by side and key the result by article ID.
# NOTE(review): the concat pairs rows by position, i.e. it assumes the LIWC output
# kept the same row order as `raw_text_data` - confirm.
all_style_data = (
    pd.concat([liwc_data, spacy_features], axis=1)
    .rename(columns={'Source (B)': 'doc_idx'})
    .set_index('doc_idx')
)

In [77]:
# The 'lemmas' column is consumed separately (via `spacy_features`) for the text
# features, so leave it out of the style table.
all_style_data = all_style_data.drop(columns='lemmas')

In [78]:
all_style_data.to_csv('/home/ec2-user/final_paper_data_v2/editorials/style.csv')

In [60]:
from sklearn.preprocessing import StandardScaler

In [79]:
ss = StandardScaler()

In [80]:
# Standardise every style column with the scaler created above.
vals = all_style_data.values
vals2 = ss.fit_transform(vals)

In [83]:
# Re-wrap the scaled matrix with the original article index; each column gets a
# '_scaled' suffix so it cannot be confused with its unscaled counterpart.
scaled_names = [f'{x}_scaled' for x in all_style_data.columns]
all_style_data_v2 = pd.DataFrame(vals2, index=all_style_data.index, columns=scaled_names)

In [85]:
all_style_data_v2.to_csv('/home/ec2-user/final_paper_data_v2/editorials/style_scaled.csv')

In [143]:
all_style_data_v2

Unnamed: 0_level_0,liwc_wc_scaled,liwc_analytic_scaled,liwc_clout_scaled,liwc_authentic_scaled,liwc_tone_scaled,liwc_wps_scaled,liwc_sixltr_scaled,liwc_dic_scaled,liwc_function_scaled,liwc_pronoun_scaled,...,mpqa_priority_scaled,mpqa_structure_scaled,mpqa_inconsistency_scaled,mpqa_difficulty_scaled,mpqa_possibility_scaled,mpqa_authority_scaled,mpqa_generalization_scaled,mpqa_doubt_scaled,mpqa_rhetoricalquestion_scaled,mpqa_wants_scaled
doc_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1851784,-0.381421,-1.014132,1.584051,-1.087942,0.179599,0.832142,0.965911,-2.020109,0.500657,1.402228,...,-0.845749,-0.587886,-0.838681,-0.260534,-0.443580,-0.298308,-0.231994,-0.128898,-0.090769,-0.136859
1845399,0.891529,0.117533,-1.247757,0.624055,0.091401,0.702190,-1.062379,-0.165715,-0.613263,-0.987831,...,-0.845749,-0.587886,-0.838681,-0.260534,-0.443580,-0.298308,-0.231994,-0.128898,-0.090769,-0.136859
1677322,-0.077433,0.877901,-0.942457,-0.889615,-0.167042,-0.491741,-0.024088,-0.964741,-0.455182,-1.965326,...,1.186124,-0.587886,-0.838681,-0.260534,-0.443580,-0.298308,-0.231994,-0.128898,-0.090769,-0.136859
1818732,-0.818405,0.528329,0.078323,1.091378,-0.306109,2.372610,-0.153874,-0.501142,0.052148,-0.371953,...,-0.845749,-0.587886,-0.838681,-0.260534,-0.443580,-0.298308,-0.231994,-0.128898,-0.090769,-0.136859
1844646,1.119521,1.051699,-0.855229,-0.812868,-0.580141,-0.887010,-0.823934,1.527783,0.666091,-0.840925,...,1.186124,1.140464,0.651462,3.604053,-0.443580,-0.298308,-0.231994,-0.128898,-0.090769,-0.136859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1693956,-1.255388,1.110949,-0.062904,1.459917,-0.615010,-0.486326,1.071551,-0.517505,-0.973541,-1.801468,...,-0.845749,-0.587886,-0.838681,-0.260534,-0.443580,-0.298308,4.310452,-0.128898,-0.090769,-0.136859
1652153,-0.913401,0.828526,-0.002675,-0.258919,1.320030,0.409799,1.047405,-0.132990,-0.010349,-0.592313,...,-0.845749,-0.587886,0.651462,-0.260534,-0.443580,-0.298308,-0.231994,-0.128898,-0.090769,-0.136859
1806988,-1.312386,0.469080,0.522772,1.069341,-0.118636,0.458531,0.039296,0.229707,0.118321,0.125270,...,0.170187,-0.587886,2.141605,-0.260534,6.037985,-0.298308,-0.231994,-0.128898,-0.090769,-0.136859
1820483,-0.780406,0.419705,-1.323563,2.616446,-0.738899,0.905240,1.865361,-0.719306,-1.271321,-1.801468,...,4.233933,-0.587886,-0.838681,-0.260534,-0.443580,-0.298308,-0.231994,-0.128898,-0.090769,-0.136859


### Test DataLoader

In [116]:
from lib.editorial_data_helper import create_full_data

In [138]:
test = create_full_data()

In [140]:
test.user_id.unique()

array([ 3,  2,  0,  1, 17, 11,  5, 23,  9,  6, 10,  8, 12, 18, 15, 20, 22,
       16, 19,  4, 21,  7, 14, 13])

In [144]:
test = create_full_data(feature_types=['style', 'style_scaled'])

## Quality Model

Use the IBM Quality Model to assign additional features

In [5]:
# NOTE(review): reloads `raw_text_data` from an absolute path that differs from the
# PREFIX-based path used for the earlier load - confirm both point at the same file.
raw_text_data = pd.read_json('/home/ec2-user/articles_with_majority_adus.json')

In [6]:
from lib.quality_model import QualityModelLabeler

In [7]:
import os

# The quality model is trained ahead of time: running the quality-model code as a
# script produces the model that is loaded from this directory.
quality_model_dir = os.path.expanduser('~/final_paper_data_v2/models/final_ibm_quality/')
qmodel = QualityModelLabeler(path=quality_model_dir)

In [8]:
from tqdm import tqdm

# Label every article with the quality model, keeping one result per article in row order.
all_features = [
    qmodel.label_sent_stats(article_text)
    for article_text in tqdm(raw_text_data.content, mininterval=350, total=len(raw_text_data))
]


  0%|          | 0/979 [00:00<?, ?it/s]

100%|██████████| 979/979 [01:55<00:00,  8.48it/s]


In [9]:
# Quality-model features keyed by article ID; every column is prefixed with 'ibm_'.
ibm_feats = pd.DataFrame(all_features, index=raw_text_data['idx']).add_prefix('ibm_')

In [10]:
# Reload the unscaled style features written earlier; the first CSV column becomes the index.
style_data = pd.read_csv('/home/ec2-user/final_paper_data_v2/editorials/style.csv', index_col=0)

In [11]:
# Style + quality features side by side; with axis=1, concat aligns rows on the index labels.
s2 = pd.concat([style_data, ibm_feats], axis=1)

In [12]:
s2.to_csv('/home/ec2-user/final_paper_data_v2/editorials/style_quality.csv')

In [15]:
from sklearn.preprocessing import StandardScaler

# Standardise the quality-model features with a fresh scaler.
ibm_scaler = StandardScaler()
X = ibm_scaler.fit_transform(ibm_feats.values)

In [16]:
# Wrap the scaled array back into a frame that reuses the index and column labels of `ibm_feats`.
df = pd.DataFrame(X, index=ibm_feats.index, columns=ibm_feats.columns)

In [17]:
# Mark the standardised quality columns with a '_scaled' suffix.
df.columns = [name + '_scaled' for name in df.columns]

In [215]:
# Rebinds `style_data` to the *scaled* style features (it previously held the unscaled ones).
style_data = pd.read_csv('/home/ec2-user/final_paper_data_v2/editorials/style_scaled.csv', index_col=0)

In [216]:
# Scaled style + scaled quality features side by side; with axis=1, rows are aligned on the index labels.
s3 = pd.concat([style_data, df], axis=1)

In [217]:
s3.to_csv('/home/ec2-user/final_paper_data_v2/editorials/style_quality_scaled.csv')

## Text Features

Features built from words and lemmas.

### Binary Lemma Features

In [151]:
# Article IDs (the index of the scaled style table), reused as row labels for the text-feature frames.
index = all_style_data_v2.index.values

In [153]:
# Per-article 'lemmas' entries from the spaCy feature table, as a plain Python list.
# Presumably each entry is a sequence of lemma strings - confirm against `get_style_features`.
texts = spacy_features['lemmas'].values.tolist()

In [154]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [160]:
def _keep_as_is(document):
    """Pass the document through untouched, so no string preprocessing is applied."""
    return document


def _alpha_lowercase(tokens):
    """Lower-case the items of `tokens`, keeping only those that are purely alphabetic."""
    return [tok.lower() for tok in tokens if tok.isalpha()]


# Binary presence/absence features: terms must occur in at least 5 documents, and
# the vocabulary is capped at 10,000 terms.
vec1 = CountVectorizer(
    binary=True,
    preprocessor=_keep_as_is,
    tokenizer=_alpha_lowercase,
    max_features=10000,
    min_df=5,
)

In [161]:
textX = vec1.fit_transform(texts)

In [162]:
# Dense view of the binary document-term matrix. `.toarray()` is the documented,
# version-independent spelling; the `.A` shorthand has been deprecated in recent
# SciPy releases and is not available on every sparse container.
textX.toarray()

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 1, 0]])

In [164]:
# Number of terms kept after the min_df / max_features pruning. The fitted
# `vocabulary_` mapping is used because `get_feature_names()` was removed in
# scikit-learn 1.2.
len(vec1.vocabulary_)

5167

In [166]:
# Label the dense binary lemma matrix with article IDs (rows) and vocabulary terms (columns).
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back to the old name on releases that predate it.
# `.toarray()` replaces the deprecated `.A` shorthand.
lemma_terms = getattr(vec1, 'get_feature_names_out', None) or vec1.get_feature_names
text_df = pd.DataFrame(textX.toarray(), columns=lemma_terms(), index=index)

In [168]:
text_df.to_csv('/home/ec2-user/final_paper_data_v2/editorials/text_lemma_bin.csv')

### Standard TF-IDF

In [169]:
# Raw article texts as an array.
# NOTE(review): the resulting rows are later labelled with `index` taken from the style
# table, which assumes both share the same row order - confirm.
alt_text = raw_text_data.content.values

In [187]:
# TF-IDF over the raw text with the vectorizer's default tokenisation: terms must occur in
# at least 5 documents and the vocabulary is capped at 10,000 terms.
vec2 = TfidfVectorizer(min_df=5, max_features=10000)
# Note: this rebinds `X`, which earlier held the scaled quality-model features.
X = vec2.fit_transform(alt_text)

In [None]:
# Show the TF-IDF vocabulary. `get_feature_names()` was removed in scikit-learn 1.2,
# so prefer `get_feature_names_out()` and fall back to the old name on older releases.
tfidf_terms = getattr(vec2, 'get_feature_names_out', None) or vec2.get_feature_names
list(tfidf_terms())

In [190]:
# Label the dense TF-IDF matrix with article IDs (rows) and vocabulary terms (columns).
# This rebinds `text_df`, which earlier held the binary lemma features.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back to the old name on releases that predate it.
# `.toarray()` replaces the deprecated `.A` shorthand.
tfidf_terms = getattr(vec2, 'get_feature_names_out', None) or vec2.get_feature_names
text_df = pd.DataFrame(X.toarray(), columns=tfidf_terms(), index=index)

In [191]:
text_df.to_csv('/home/ec2-user/final_paper_data_v2/editorials/text_raw_tfidf.csv')