In [None]:
from collections import Counter, defaultdict
import json

import pandas as pd

# Hack to import our models
import sys
sys.path.append("..")


%load_ext autoreload
%autoreload 2

In [None]:
# location where data will be stored
PREFIX = "../data"

In [None]:
# https://www.cs.cornell.edu/~esindurmus/ddo.html
data = pd.read_json(PREFIX + '/debates.json').T

# Prefiltering

First, the debates are filtered down to the set that meets the minimum criteria.

In [None]:
data = data[data.participant_1_status != 'Tied']

In [None]:
def _mentions_forfeit(rounds):
    """Return True if any turn in any round contains the word 'forfeit' (case-insensitive)."""
    for debate_round in rounds:
        for turn in debate_round:
            if 'forfeit' in turn['text'].lower():
                return True
    return False


# Keep only debates that received at least one vote
received_votes = data.votes.map(lambda votes: len(votes) > 0)
data = data[received_votes]

# Forfeits are detected two ways: by scanning the round texts, and by the dataset's own label
data['has_forfeit'] = data.rounds.map(_mentions_forfeit)
data = data[~data.has_forfeit]
data = data[data.forfeit_label == False]

In [None]:
# Get rid of debates with weird vote style
data = data[data.votes.map(lambda x: 'Who won the debate' not in str(x))]

In [None]:
# Some debates have weird empty rounds
data['rounds'] = data.rounds.map(lambda xs: [x for x in xs if len(x) == 2])

In [None]:
# Only keep debates with three or more rounds
data = data[data.rounds.map(lambda x: len(x) >= 3)]

In [None]:
def _slot_word_count(rounds, slot):
    """Count whitespace-separated words over all texts found at position `slot` of each round."""
    combined = ' '.join(turn_pair[slot]['text'] for turn_pair in rounds)
    return len(combined.split())


# Each of the two speaker slots must contribute strictly more than 100 words overall
s1 = data.rounds.map(lambda rounds: _slot_word_count(rounds, 0))
data = data[s1 > 100]
s2 = data.rounds.map(lambda rounds: _slot_word_count(rounds, 1))
data = data[s2 > 100]

In [None]:
GOOD_CATEGORIES = ['Politics', 'Religion', 'Society', 'Philosophy', 'Education', 'Economics']
data = data[data.category.isin(GOOD_CATEGORIES)]

In [None]:
# Pick debates that appear to have at least moderate engagement
data = data[(data.participant_1_points + data.participant_2_points) >= 7]

In [None]:
data.url = data.url.map(lambda x: x.replace('http://www.debate.org/debates/', ''))

In [None]:
data.to_json(PREFIX + '/debates_filtered.json')

Next, transform into separate rows by "side". Each one will be a separate datapoint, so each user gets two per debate.

In [None]:
def quick_clean(t):
    """Lightly normalise a debate text.

    Newlines, carriage returns and tabs become single spaces, the literal
    substring '&gt' is removed, and double spaces are collapsed in two passes
    (so very long runs of spaces are shortened but not necessarily to one).

    Args:
        t: the raw text.

    Returns:
        The cleaned text.
    """
    # Applied strictly in this order; the final pair is intentionally repeated
    substitutions = (
        ('\n', ' '),
        ('&gt', ''),
        ('\r', ' '),
        ('\t', ' '),
        ('  ', ' '),
        ('  ', ' '),
    )
    for old, new in substitutions:
        t = t.replace(old, new)
    return t

def _count_opinion_shifts(votes, participant_name):
    """Tally how voters' agreement with one participant changed over a debate.

    Args:
        votes: the debate's list of vote records; each must hold a
            ``votes_map`` dict keyed by participant name.
        participant_name: key looked up inside every ``votes_map``.

    Returns:
        ``(now_agree, affirm, detract)``: the number of voters who agreed
        only after the debate, both before and after, and only before.

    Raises:
        KeyError: if a vote has no entry for ``participant_name`` or an
            entry lacks one of the two agreement fields.
    """
    now_agree = affirm = detract = 0
    for vote in votes:
        v = vote['votes_map'][participant_name]
        # Single-field entries are skipped; presumably they lack the
        # agreement answers -- TODO confirm against the raw data.
        if len(v) == 1:
            continue
        before = v['Agreed with before the debate']
        after = v['Agreed with after the debate']
        if not before and after:
            now_agree += 1
        if before and after:
            affirm += 1
        if before and not after:
            detract += 1
    return now_agree, affirm, detract


def _side_record(row, name, position, won, first_texts, full_texts, points, went_first):
    """Build the output record for one participant (one side) of a debate.

    Args:
        row: the debate row from ``data``.
        name: the participant's user name.
        position: the participant's side label, used as key into the text dicts.
        won: whether this participant is the debate's winner.
        first_texts: side label -> cleaned first-round text.
        full_texts: side label -> list of cleaned texts for all rounds.
        points: total points this participant received.
        went_first: whether this participant is ``participant_1``.

    Returns:
        A dict with the same keys, in the same order, for winner and loser.
    """
    now_agree, affirm, detract = _count_opinion_shifts(row.votes, name)
    return {'name': name,
            'first_text': first_texts[position],
            'text': full_texts[position],
            'position': position,
            'won': won,
            'url': row.url,
            'category': row.category,
            'title': row.title,
            'challenged': now_agree,
            'affirmed': affirm,
            'detracted': detract,
            'total_voters': row.number_of_votes,
            'total_points': points,
            'went_first': went_first}


sep_data = []

for _, row in data.iterrows():

    first_round = row.rounds[0]

    # Another weird edge case?
    if len(first_round) != 2:
        continue

    # Opening statement of each side, keyed by the side label found in the data
    sides = {first_round[0]['side']: quick_clean(first_round[0]['text']),
             first_round[1]['side']: quick_clean(first_round[1]['text'])}

    # Every cleaned argument of each side, in round order
    sides_full = {side: [quick_clean(arg['text'])
                         for debate_round in row.rounds
                         for arg in debate_round
                         if arg['side'] == side]
                  for side in ('Pro', 'Con')}

    # Any status other than 'Winning' for participant 1 is treated as a win for participant 2
    if row.participant_1_status == 'Winning':
        winning_side = row.participant_1_position
        winner_name, loser_name = row.participant_1_name, row.participant_2_name
        win_points, lose_points = row.participant_1_points, row.participant_2_points
        win_went_first = True
    else:
        winning_side = row.participant_2_position
        winner_name, loser_name = row.participant_2_name, row.participant_1_name
        win_points, lose_points = row.participant_2_points, row.participant_1_points
        win_went_first = False

    losing_side = 'Pro' if winning_side == 'Con' else 'Con'

    # Regardless of vote, count how many minds were changed -- once per side
    win_data = _side_record(row, winner_name, winning_side, True,
                            sides, sides_full, win_points, win_went_first)
    lose_data = _side_record(row, loser_name, losing_side, False,
                             sides, sides_full, lose_points, not win_went_first)

    sep_data.append(win_data)
    sep_data.append(lose_data)
    

In [None]:
data2 = pd.DataFrame(sep_data)

In [None]:
data2.head()

In [None]:
data2['debate_side_id'] = data2['url'] + data2['name']

In [None]:
data2.to_json(PREFIX + '/debates_filtered_by_side.json')

Next, we create a list of features for "usable" users, i.e. those who have voted at least 10 times and agreed with someone after the debate at least once.

In [None]:
from collections import Counter

# Number of vote entries per user name across the filtered debates
voter_counts = Counter(
    ballot['user_name']
    for ballots in data.votes
    for ballot in ballots
)

In [None]:
len(voter_counts)

In [None]:
voters = [k for k, v in voter_counts.items() if v >= 10]

len(data), len(voters)

In [None]:
# user name -> True once the user has agreed with any participant after a debate;
# unknown users default to False
had_opine = defaultdict(bool)

for ballots in data.votes:
    for ballot in ballots:
        for target, verdict in ballot['votes_map'].items():
            # The 'Tied' entry is not a participant, so it is ignored
            if target != 'Tied' and verdict['Agreed with after the debate']:
                had_opine[ballot['user_name']] = True


In [None]:
# Frequent voters (members of `voters`) who agreed with some participant after a debate at least once.
# NOTE(review): `user_with_opine` is not referenced again in the visible notebook -- the later
# filters test membership in `voters` instead. Confirm whether this second criterion was meant to apply.
user_with_opine = [user for user in voters if had_opine[user] == True]

In [None]:
# Review debate list for those containing one of these users

In [None]:
# `voters` is a list, so `in voters` scans it for every single vote;
# a set gives constant-time membership tests with identical results.
frequent_voters = set(voters)

# One boolean per debate row: did at least one frequent voter vote on it?
good_user_present = [
    any(u['user_name'] in frequent_voters for u in votes)
    for votes in data.votes
]

In [None]:
# Save updated set
data = data[good_user_present]
data.to_json(PREFIX + '/debates_filtered.json')

In [None]:
data2 = data2[data2.url.isin(data.url)]

In [None]:
data2.reset_index(inplace=True)

In [None]:
data2.drop('index', inplace=True, axis=1)

In [None]:
data2.to_json(PREFIX + '/debates_filtered_by_side.json')

In [None]:
len(data), len(data2), len(voters)

Create a table with rows for every voter+debate-side pair

In [None]:
side_data = pd.read_json(PREFIX + '/debates_filtered_by_side.json')

In [None]:
full_voter_data = []

# Index the votes by debate url once instead of boolean-masking `data` for
# every side row. setdefault keeps the first row per url, matching the
# previous `data[data.url == ...].iloc[0]` lookup.
votes_by_url = {}
for debate_url, debate_votes in zip(data.url, data.votes):
    votes_by_url.setdefault(debate_url, debate_votes)

# Set membership is constant-time; `voters` itself is a list
frequent_voters = set(voters)

for _, row in side_data.iterrows():
    # Raises KeyError (previously IndexError) if the url is absent from `data`
    all_vote_info = votes_by_url[row.url]

    for voter_data in all_vote_info:
        if voter_data['user_name'] in frequent_voters:
            # One record per (frequent voter, debate side) pair
            full_voter_data.append([voter_data['user_name'], row['debate_side_id'], row['name'], row['url'],
                                    voter_data['votes_map'][row['name']]])
            
            

In [None]:
full_voter_data = pd.DataFrame(full_voter_data, columns=['voter_name', 'debate_side_id', 'speaker_name','url', 'vote_info'])

In [None]:
full_voter_data['points'] = full_voter_data.vote_info.map(lambda x: x['Total points awarded'])

In [None]:
full_voter_data['affirmed'] = full_voter_data.vote_info.map(lambda x: x.get('Agreed with before the debate') & x.get('Agreed with after the debate'))

In [None]:
full_voter_data['agree_after'] = full_voter_data.vote_info.map(lambda x: x.get('Agreed with after the debate'))

In [None]:
full_voter_data['challenged'] = full_voter_data.vote_info.map(lambda x: (not x.get('Agreed with before the debate')) & x.get('Agreed with after the debate'))

In [None]:
# Assign every voter an integer id. Sorting the names makes the ids
# reproducible across kernel restarts (iteration order of a set of strings
# is not), and the dict replaces a linear list.index() scan per row.
vv = sorted(set(full_voter_data.voter_name))
voter_index = {name: idx for idx, name in enumerate(vv)}

full_voter_data['voter_id'] = full_voter_data.voter_name.map(voter_index)

In [None]:
len(full_voter_data)

In [None]:
# Mystery issue
full_voter_data = full_voter_data.drop_duplicates(['voter_name', 'debate_side_id', 'speaker_name'])

In [None]:
len(full_voter_data)

In [None]:
full_voter_data.to_csv(PREFIX + '/debate_voter_data.csv')

In [None]:
# For each (debate, voter) pair, record which speaker received more points.
# Keys are (url, voter_name, speaker_name); values are True, False or 'Tie'.
full_win_map = {}
for (debate_url, voter_name), pair in full_voter_data.groupby(['url', 'voter_name']):
    # Exactly one row per side is expected; anything else is reported and skipped
    if len(pair) != 2:
        print("??")
        continue

    first, second = pair.iloc[0], pair.iloc[1]
    first_points = int(first.points)
    second_points = int(second.points)

    if first_points > second_points:
        outcome = [(first, True), (second, False)]
    elif second_points > first_points:
        outcome = [(second, True), (first, False)]
    else:
        outcome = [(first, 'Tie'), (second, 'Tie')]

    for vote_row, label in outcome:
        full_win_map[(debate_url, voter_name, vote_row.speaker_name)] = label

In [None]:
full_voter_data['more_points'] = full_voter_data.apply(lambda row: full_win_map.get((row.url, row.voter_name, row.speaker_name)), axis=1)

In [None]:
full_voter_data.more_points.value_counts()

In [None]:
# Drop the ties
full_voter_data = full_voter_data[full_voter_data.more_points != 'Tie']

In [None]:
full_voter_data.to_csv(PREFIX + '/debate_voter_data.csv')

In [None]:
# Compact label table: one row per (document, user) with the binary points outcome
small = full_voter_data[['debate_side_id', 'voter_id', 'more_points']].rename(
    columns={
        'debate_side_id': 'doc_id',
        'voter_id': 'user_id',
        'more_points': 'y_bin_points',
    }
)

In [None]:
small.to_csv(PREFIX + '/debates/vote_data.csv')

# Feature Creation

## Style features

- LIWC
- Text Blob sentiment/subjectivity
- `cmv_concrete`, `cmv_valence`, `cmv_arousal`, `cmv_dominance` (labelled CMV because that's the paper that used them)
- MPQA



In [None]:
from irt_lib.smart_spacy import load_custom_spacy, get_style_features

In [None]:
nlp = load_custom_spacy()

In [None]:
# Combine all round texts for pre-processing
side_data['text'] = side_data.text.map(lambda x: ' '.join(x).strip().replace('  ', ' '))

In [None]:
from tqdm import tqdm

# Style features for every debate side, in side_data row order.
# (The unused counter `i` from the earlier version has been dropped.)
final_features = [
    get_style_features(doc, nlp)
    for doc in tqdm(side_data.text, mininterval=300, miniters=100)
]
    

In [None]:
len(final_features), len(side_data)

In [None]:
style_data = pd.DataFrame(final_features, index=side_data.debate_side_id)

In [None]:
lemmas = style_data.lemmas 

In [None]:
lemmas.to_csv(PREFIX + '/debate_by_lemmas.csv')

In [None]:
style_data = style_data.drop('lemmas', axis=1)
style_data = style_data.fillna(0)

In [None]:
liwc_data = pd.read_csv('LIWC2015 Results (debates_filtered_by_side (1).csv).csv')

In [None]:
liwc_data = liwc_data[liwc_data.columns[15:]]

In [None]:
liwc_data = liwc_data.set_index('P')

In [None]:
column_map = {k:f'liwc_{k.lower()}' for k in liwc_data.columns}


In [None]:
liwc_data = liwc_data.rename(columns=column_map)

In [None]:
style_data = pd.concat([liwc_data, style_data], axis=1)

In [None]:
style_data.to_csv(PREFIX + '/debates/style.csv')

Produce scaled copy of the features

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
style_data = pd.read_csv(PREFIX + '/debates/style.csv', index_col=0)

In [None]:
ss = StandardScaler()

In [None]:
vals = style_data.values
vals2 = ss.fit_transform(vals)

In [None]:
all_style_data_v2 = pd.DataFrame(vals2, index=style_data.index, columns=style_data.columns)
all_style_data_v2.columns = [f'{x}_scaled' for x in all_style_data_v2.columns]

In [None]:
all_style_data_v2.to_csv(PREFIX + '/debates/style_scaled.csv')

## Text Features

In [None]:
debate_data = pd.read_json(PREFIX + '/debates_filtered_by_side.json')


In [None]:
all_texts = debate_data.text.map(lambda x: ' '.join(x).strip().replace('  ', ' '))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec2 = TfidfVectorizer(min_df=5, max_features=10000)
X = vec2.fit_transform(all_texts)

In [None]:
# get_feature_names() was removed from recent scikit-learn releases; prefer
# get_feature_names_out() and fall back only on old installations.
if hasattr(vec2, 'get_feature_names_out'):
    tfidf_columns = vec2.get_feature_names_out()
else:
    tfidf_columns = vec2.get_feature_names()

# .toarray() is the portable spelling of the sparse `.A` shortcut, which newer SciPy no longer provides
text_df = pd.DataFrame(X.toarray(), columns=tfidf_columns, index=debate_data.debate_side_id)
text_df.to_csv(PREFIX + '/debates/text_raw_tfidf.csv')

Preprocess lemmatized copy of the data

In [None]:
lemmas = pd.read_csv(PREFIX + '/debate_by_lemmas.csv')

In [None]:
import ast
import json
# The lemma column comes back from CSV as strings (presumably Python list
# literals -- TODO confirm). literal_eval parses literals only, so unlike
# eval it cannot execute arbitrary code embedded in the file.
lemmas['lemmas2'] = lemmas.lemmas.map(ast.literal_eval)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [None]:
vec1 = CountVectorizer(binary=True, preprocessor=lambda x: x, tokenizer=lambda x: [y.lower() for y in x if y.isalpha()], max_features=10000, min_df=5)

In [None]:
textX = vec1.fit_transform(lemmas.lemmas2)

In [None]:
# Number of features kept by the vectorizer. vocabulary_ maps each term to its
# column index, so its length works on every scikit-learn version (unlike the
# removed get_feature_names()).
len(vec1.vocabulary_)

In [None]:
# get_feature_names() was removed from recent scikit-learn releases; prefer
# get_feature_names_out() and fall back only on old installations.
if hasattr(vec1, 'get_feature_names_out'):
    lemma_columns = vec1.get_feature_names_out()
else:
    lemma_columns = vec1.get_feature_names()

# .toarray() is the portable spelling of the sparse `.A` shortcut, which newer SciPy no longer provides
text_df = pd.DataFrame(textX.toarray(), columns=lemma_columns, index=lemmas.debate_side_id)

In [None]:
text_df.to_csv(PREFIX + '/debates/text_lemma_bin.csv')

## Argument Quality

Model trained separately - see `irt_lib/quality_model.py`

In [None]:
# NOTE(review): other project code in this notebook is imported from `irt_lib`
# (see the smart_spacy import) -- confirm that `lib` is the intended package here.
from lib.quality_model import QualityModelLabeler

In [None]:
import os
qmodel = QualityModelLabeler(path=os.path.expanduser('~/final_paper_data_v2/models/final_ibm_quality/'))

In [None]:
from tqdm import tqdm

# Quality-model statistics for each debate side, computed on the space-joined round texts
all_features = [
    qmodel.label_sent_stats(' '.join(row.text))
    for _, row in tqdm(debate_data.iterrows(), mininterval=350, total=len(debate_data))
]


In [None]:
ibm_feats = pd.DataFrame(all_features)
ibm_feats.index = debate_data.debate_side_id

In [None]:
ibm_feats.columns = [f'ibm_{c}' for c in ibm_feats.columns]

In [None]:
style_data = pd.read_csv(PREFIX + '/debates/style.csv', index_col=0)

In [None]:
s2 = pd.concat([style_data, ibm_feats], axis=1)

In [None]:
s2.to_csv(PREFIX + '/debates/style_quality.csv')

In [None]:
X = StandardScaler().fit_transform(ibm_feats.values)

In [None]:
df = pd.DataFrame(X, index=ibm_feats.index, columns=ibm_feats.columns)

In [None]:
style_data = pd.read_csv(PREFIX + '/debates/style_scaled.csv', index_col=0)

In [None]:
s3 = pd.concat([style_data, df], axis=1)

In [None]:
s3.to_csv(PREFIX + '/debates/style_quality_scaled.csv')

In [None]:
s3

## Speaker Features

In [38]:
import json
# Download from https://www.cs.cornell.edu/~esindurmus/ddo.html
# A context manager closes the file handle as soon as parsing finishes
with open(PREFIX + '/users.json') as users_file:
    user_info = json.load(users_file)

debate_data = pd.read_json(PREFIX + '/debates_filtered_by_side.json')
debate_data.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/users.json'

In [None]:
# Length of every issue vector; speakers missing from user_info get a blank
# (all-zero) vector of this size.
N_BIG_ISSUES = 48

# One (issue_vec, person_vec) tuple per debate side, in debate_data row order
user_rows = []
for _, row in debate_data.iterrows():
    if row['name'] in user_info:
        cur_info = user_info[row['name']]

        # Sorted by issue name; assumes all profiles share the same issue
        # keys so the columns line up across speakers -- TODO confirm.
        issue_pairs = sorted(cur_info['big_issues_dict'].items())
        # Stance encoding: Pro -> 1, Con -> -1, anything else -> 0
        issue_vec = [1 if pos == 'Pro' else -1 if pos == 'Con' else 0 for _, pos in issue_pairs]
        # Fail here with a clear message rather than later, when ragged
        # vectors cannot be stacked into one array.
        if len(issue_vec) != N_BIG_ISSUES:
            raise ValueError(
                f"{row['name']!r} has {len(issue_vec)} big issues, expected {N_BIG_ISSUES}"
            )
        person_vec = {'political_ideology': cur_info.get('political_ideology'),
                       'religious_ideology': cur_info.get('religious_ideology'),
                       'pol_party': cur_info.get('party')}
        user_rows.append((issue_vec, person_vec))
    else:
        # Add blanks
        person_vec = {'political_ideology': None,
                       'religious_ideology': None,
                       'pol_party': None}

        user_rows.append(([0] * N_BIG_ISSUES, person_vec))

In [None]:
# One-hot encode the categorical features
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()

vals = [v[1] for v in user_rows]

new_vals = dv.fit_transform(vals)

In [None]:
import numpy as np
# Just normally extract the Big Issue features
p1 = np.array([v[0] for v in user_rows])

In [None]:
issue_names = [x[0] for x in issue_pairs]

In [None]:
# Issue vectors side by side with the one-hot profile features.
# .toarray() is the portable spelling of the sparse `.A` shortcut, which newer SciPy no longer provides.
all_user_info = np.concatenate([p1, new_vals.toarray()], axis=1)

In [None]:
all_columns = issue_names + dv.feature_names_

In [None]:
all_user_features = pd.DataFrame(all_user_info, columns=all_columns)

In [None]:
# Remove the three bare profile columns one at a time, in the original order
for bare_column in ('pol_party', 'religious_ideology', 'political_ideology'):
    all_user_features.drop(bare_column, axis=1, inplace=True)


In [None]:
all_user_features['debate_side_id'] = debate_data.debate_side_id

In [None]:
all_user_features.set_index('debate_side_id', inplace=True)

In [None]:
all_user_features.to_csv(PREFIX + '/debates/full_speaker.csv')

In [None]:
issues_only = all_user_features[all_user_features.columns[:48]]

In [None]:
issues_only.to_csv(PREFIX + '/debates/issues_speaker.csv')

In [None]:
from sklearn.preprocessing import StandardScaler
X2 = StandardScaler().fit_transform(issues_only.values)


In [None]:
issues_only_scaled = pd.DataFrame(X2, index=issues_only.index, columns=issues_only.columns)

In [None]:
issues_only_scaled.to_csv(PREFIX + '/debates/issues_speaker_scaled.csv')