In [None]:
import sys
# Add the parent directory (relative to the working directory) to the module search path;
# presumably this is where the `irt_lib` package lives -- confirm.
sys.path.append("..")

# IPython autoreload extension: mode 2 reloads all imported modules before each cell runs,
# so edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Project-local helpers: dataset construction, evaluation/CV utilities and the model class.
from irt_lib.data_helper import create_full_data

from irt_lib.helpers import split_by_doc_id, do_metrics, run_full_cv

# Fixed: the package name was misspelled as `lirt_ib`, which made this import fail;
# every other import in this notebook uses `irt_lib`.
from irt_lib.models import IdealNet

In [None]:
# Base data directory: passed to create_full_data as `base_path` and used to locate the feature CSVs.
PREFIX = "../data/debates/"

# Style and Quality

In [None]:
# Assemble the dataset from the 'style_quality_scaled' features with the 'bin_points' label type.
style_data = create_full_data(
    base_path=PREFIX,
    label_type='bin_points',
    feature_types=['style_quality_scaled'],
)

In [None]:
# Preview the first rows of the assembled data.
style_data.head()

In [None]:
# Size the model from the data: the input width is the length of the first row's `feats`
# entry, and the user count is the number of distinct `user_id` values.
model_cls = IdealNet
num_users = style_data['user_id'].nunique()
D_in = len(style_data['feats'].iloc[0])
model_arguments = dict(D_in=D_in, num_users=num_users, use_popularity=True)

In [None]:
# Maps (C, reg_type, learning_rate) -> the result returned by run_full_cv for that setting.
final_results = {}

In [None]:
# Hyper-parameter sweep over regularisation strength (C), regularisation type and learning
# rate. Each configuration is handed to run_full_cv and its result is stored in
# `final_results` under the (C, reg_type, learning_rate) key.
#
# Fixes:
#  * `train_arguments` was used here before any cell had created it, so a fresh kernel
#    raised NameError; it is now initialised locally.
#  * a leftover `raise ValueError` aborted the sweep after the first configuration and
#    stopped "Run All"; it has been removed so the full grid is evaluated.
train_arguments = {}
for C in [1e-3, 1e-4, 1e-5, 1e-6]:
    for reg_type in ['l1', 'l2']:
        for learning_rate in [0.01, 0.005]:
            # The same dicts are reused across iterations; only the swept keys change.
            model_arguments['C'] = C
            model_arguments['reg_type'] = reg_type
            train_arguments['learning_rate'] = learning_rate
            train_arguments['num_train_epochs'] = 20
            results = run_full_cv(style_data, model_cls, model_arguments, train_arguments, averaged=True)

            final_results[(C, reg_type, learning_rate)] = results

In [None]:
# Display the collected sweep results.
final_results

# Speaker Only

In [None]:
# Rebuild the dataset using only the 'issues_speaker' features (same 'bin_points' label type).
style_data = create_full_data(
    base_path=PREFIX,
    label_type='bin_points',
    feature_types=['issues_speaker'],
)

In [None]:
# Start a fresh results dict for this feature set; keyed by (C, reg_type, learning_rate).
final_results = {}

In [None]:

# Re-derive the model dimensions for the current `style_data`: input width from the first
# row's `feats`, user count from the distinct `user_id` values.
model_cls = IdealNet
num_users = style_data['user_id'].nunique()
D_in = len(style_data['feats'].iloc[0])
model_arguments = dict(D_in=D_in, num_users=num_users, use_popularity=True)

In [None]:
# Hyper-parameter sweep for the current feature set. Progress and results are appended to
# the file 'log_file' so the run can be followed from outside the notebook.
train_arguments = {}
for C in (1e-4, 1e-5, 1e-6):
    for reg_type in ('l1', 'l2'):
        for learning_rate in (0.1, 0.01, 0.005):
            setting = (C, reg_type, learning_rate)
            with open('log_file', 'a') as log_file:
                log_file.write(f"Starting {(C, reg_type, learning_rate)}\n")

            # Only the swept keys change between iterations; the dicts themselves are reused.
            model_arguments.update(C=C, reg_type=reg_type)
            train_arguments.update(learning_rate=learning_rate, num_train_epochs=20)
            results = run_full_cv(style_data, model_cls, model_arguments, train_arguments, averaged=True)
            final_results[setting] = results

            with open('log_file', 'a') as log_file:
                log_file.write(f"Results {str(results)}\n\n")
                

In [None]:
# Best 'eval_accuracy' across all swept configurations.
max(cv_result['eval_accuracy'] for cv_result in final_results.values())

# Style and Speaker

In [None]:
# Rebuild the dataset with both feature groups: 'style_quality_scaled' and 'issues_speaker'.
style_data = create_full_data(
    base_path=PREFIX,
    label_type='bin_points',
    feature_types=['style_quality_scaled', 'issues_speaker'],
)

In [None]:
# Length of the first row's `feats` entry (the value later used as D_in).
len(style_data.iloc[0].feats)

In [None]:
# Results for the style + speaker sweep; keyed by (C, reg_type, learning_rate).
final_results_v2 = {}

In [None]:
# Model dimensions for the combined feature set, with the popularity term enabled.
num_users = style_data['user_id'].nunique()
D_in = len(style_data['feats'].iloc[0])
model_arguments = dict(D_in=D_in, num_users=num_users, use_popularity=True)


In [None]:
# Model class handed to run_full_cv in the sweeps below.
model_cls = IdealNet

In [None]:
# Hyper-parameter sweep for the style + speaker features. Each setting is logged to
# 'log_file' before and after the run, and its result is stored in `final_results_v2`.
train_arguments = {}
for C in (1e-4, 1e-5, 1e-6):
    for reg_type in ('l1', 'l2'):
        for learning_rate in (0.1, 0.01, 0.005):
            setting = (C, reg_type, learning_rate)
            with open('log_file', 'a') as log_file:
                log_file.write(f"Starting {(C, reg_type, learning_rate)}\n")

            # Only the swept keys change between iterations; the dicts themselves are reused.
            model_arguments.update(C=C, reg_type=reg_type)
            train_arguments.update(learning_rate=learning_rate, num_train_epochs=20)
            results = run_full_cv(style_data, model_cls, model_arguments, train_arguments, averaged=True)
            final_results_v2[setting] = results

            with open('log_file', 'a') as log_file:
                log_file.write(f"Results {str(results)}\n\n")
                

In [None]:
import pickle

# Persist the style + speaker sweep results. The context manager guarantees the file is
# flushed and closed (the previous inline `open(...)` left the handle dangling).
with open('debate_speaker_style.pkl', 'wb') as results_file:
    pickle.dump(final_results_v2, results_file)

In [None]:
# Display the style + speaker sweep results.
final_results_v2

### No popularity

In [None]:
# Same dimensions as before, but with the popularity term switched off.
num_users = style_data['user_id'].nunique()
D_in = len(style_data['feats'].iloc[0])
model_arguments = dict(D_in=D_in, num_users=num_users, use_popularity=False)


In [None]:
# Sweep with `use_popularity=False`. The grid is the same as the popularity sweep and
# results go into the same `final_results_v2` dict, so every existing entry is replaced.
train_arguments = {}
for C in (1e-4, 1e-5, 1e-6):
    for reg_type in ('l1', 'l2'):
        for learning_rate in (0.1, 0.01, 0.005):
            setting = (C, reg_type, learning_rate)
            model_arguments.update(C=C, reg_type=reg_type)
            train_arguments.update(learning_rate=learning_rate, num_train_epochs=20)
            results = run_full_cv(style_data, model_cls, model_arguments, train_arguments, averaged=True)
            final_results_v2[setting] = results

In [None]:
# One line per configuration: the (C, reg_type, learning_rate) key and its 'eval_accuracy'.
for setting, cv_result in final_results_v2.items():
    print(setting, cv_result['eval_accuracy'])

In [None]:
# Best 'eval_accuracy' among the configurations stored in final_results_v2.
max(cv_result.get('eval_accuracy') for cv_result in final_results_v2.values())

In [None]:
# Inspect the model arguments as left by the last sweep iteration.
model_arguments

## Text + Style

In [None]:
# Rebuild the dataset from the 'text_bin_lemma' and 'style_quality_scaled' feature groups.
style_data = create_full_data(
    base_path=PREFIX,
    label_type='bin_points',
    feature_types=['text_bin_lemma', 'style_quality_scaled'],
)

In [None]:
# Input width (length of the first row's `feats`) and number of distinct users.
num_users = style_data['user_id'].nunique()
D_in = len(style_data['feats'].iloc[0])

In [None]:
# Base arguments for the text + style runs. Both `train_arguments` entries are overwritten
# inside the sweep loop further down, so the values here only act as placeholders.
model_arguments = dict(D_in=D_in, num_users=num_users)
train_arguments = dict(num_train_epochs=200, learning_rate=0.001)

In [None]:
# `IRTNet` was never imported in this notebook, so this cell raised NameError on a fresh
# kernel. NOTE(review): assumes IRTNet is defined alongside IdealNet in irt_lib.models -- confirm.
from irt_lib.models import IRTNet

# Model class handed to run_full_cv for the text + style sweep.
model_cls = IRTNet

In [None]:
# Fresh results dict for the text + style sweep; keyed by (C, reg_type, learning_rate).
final_results = {}

In [None]:
# Hyper-parameter sweep for the text + style features; results are stored in `final_results`.
for C in (1e-4, 1e-5):
    for reg_type in ('l1', 'l2'):
        for learning_rate in (0.01, 0.005):
            setting = (C, reg_type, learning_rate)
            # Overrides the learning rate and epoch count set when train_arguments was created.
            model_arguments.update(C=C, reg_type=reg_type)
            train_arguments.update(learning_rate=learning_rate, num_train_epochs=20)
            results = run_full_cv(style_data, model_cls, model_arguments, train_arguments, averaged=True)
            final_results[setting] = results

In [None]:
# Display the text + style sweep results.
final_results

# Model Embeddings Review

Analyze embeddings from Style+Speaker Model

In [None]:
# Get feature names from the headers of the two feature CSVs.
import os

import pandas as pd

# os.path.join avoids the doubled separator produced by `PREFIX + '/...'` (PREFIX already
# ends with "/"). nrows=0 parses only the header row, which is all that is needed here.
# The first column of each file is skipped (presumably an id/index column -- confirm).
p1 = pd.read_csv(os.path.join(PREFIX, 'style_quality_scaled.csv'), nrows=0).columns[1:].tolist()
p2 = pd.read_csv(os.path.join(PREFIX, 'issues_speaker.csv'), nrows=0).columns[1:].tolist()

# NOTE(review): assumes create_full_data concatenates features in this same order
# (style/quality first, then speaker issues) -- confirm.
feat_names = p1 + p2

In [None]:
# Rebuild the style + speaker dataset used for the embedding analysis.
style_data = create_full_data(
    base_path=PREFIX,
    label_type='bin_points',
    feature_types=['style_quality_scaled', 'issues_speaker'],
)


In [None]:
from transformers import Trainer, TrainingArguments

# Fit a single IdealNet (popularity enabled, 'l1' regularisation type) on every record in
# `style_data` so its learned weights can be inspected afterwards.
num_users = style_data['user_id'].nunique()
D_in = len(style_data['feats'].iloc[0])
model_arguments = dict(D_in=D_in, num_users=num_users, use_popularity=True, reg_type='l1')
model = IdealNet(**model_arguments)

# The Trainer receives the rows as a list of per-record dicts.
train_data = style_data.to_dict(orient='records')

args = TrainingArguments(
    output_dir="../../../../tmp",
    num_train_epochs=20,
    learning_rate=0.01,
    logging_steps=1000,
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=train_data,  # evaluation reuses the training records
)
trainer.train()

In [None]:
# First row of the `weight` tensor of `model.popularity`, moved to CPU and converted to NumPy.
W = model.popularity.weight.cpu().detach().numpy()[0]

# Same extraction for the `weight` tensor of `model.polarity`.
W2 = model.polarity.weight.cpu().detach().numpy()[0]

In [None]:
# Rank features by their popularity weight, then show the six lowest and the six highest.
ranked_popularity = sorted(zip(W, feat_names))
print(ranked_popularity[:6])
print('----')
print(ranked_popularity[-6:])

In [None]:
# Rank features by their polarity weight, then show the six lowest and the six highest.
ranked_polarity = sorted(zip(W2, feat_names))
print(ranked_polarity[:6])
print('----')
print(ranked_polarity[-6:])

In [None]:
import matplotlib.pyplot as plt

# One point per feature: x is its popularity weight (W, taken from model.popularity) and
# y is its polarity weight (W2, taken from model.polarity).
newp = list(zip(W, W2))
plt.scatter(*zip(*newp), alpha=0.5)
# Fixed: the axis labels were swapped relative to the plotted data, and "Weights" was
# misspelled as "Weighs".
plt.xlabel('Weights for Popularity')
plt.ylabel('Weights for Polarity');


Construct user embeddings

In [None]:
import json

# Per-user metadata keyed by user name. The context manager closes the file handle, which
# the previous inline `open(...)` never did.
with open('users.json') as users_file:
    people = json.load(users_file)

# Voter table providing the `voter_name` / `voter_id` pairs used below.
# NOTE(review): absolute, machine-specific path -- this will not resolve on another machine;
# consider moving the file under the project data directory.
id_map_data = pd.read_csv('/home/ec2-user/final_paper_data_v2/debate_voter_data.csv')

In [None]:
# Transpose the `weight` tensor of `model.users` and keep its first row (i.e. the first
# column of the original matrix) as a NumPy array; it is indexed by voter id below (`U[idx]`).
U = model.users.weight.cpu().detach().numpy().T[0]

In [None]:
from collections import defaultdict

# Collect the learned user value for every voter with known metadata, grouped by the
# voter's 'political_ideology' entry.
# NOTE(review): assumes `voter_id` is a valid index into U -- confirm.
party_ideals = defaultdict(list)
voter_keys = id_map_data.groupby(['voter_name', 'voter_id']).first().index.tolist()
for name, idx in voter_keys:
    if name not in people:
        continue  # no metadata for this voter
    party = people[name]['political_ideology']
    party_ideals[party].append(U[idx])

In [None]:
# Box plot of the user values per ideology, restricted to ideologies with more than 50 users.
alld = sorted(party_ideals.items())
kept = [(ideology, values) for ideology, values in alld if len(values) > 50]
labels = [ideology for ideology, _values in kept]
points = [values for _ideology, values in kept]
plt.boxplot(points, labels=labels, showfliers=False)
plt.xticks(rotation = 45) # Rotates X-Axis Ticks by 45-degrees
