In [22]:
%matplotlib inline

import pickle
import pandas as pd
from pandas import Series
from tqdm import tqdm
import seaborn as sns
import numpy as np
import matplotlib
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from collections import defaultdict 
from itertools import combinations


import torch
from sentence_transformers import models, SentenceTransformer
from sentence_transformers import InputExample
from sentence_transformers import losses
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import TripletEvaluator

## Generation of belief triplets

### Read dataset 

In [4]:
# Load the pre-built belief dataframe (the file name indicates 192,307 de-duplicated rows of true beliefs).
# NOTE(review): read_pickle deserializes arbitrary Python objects; only load this file from a trusted source.
df = pd.read_pickle('../dataset/01_Final_dataframe/df_ddo_including_only_truebeliefs_nodup(N192307).p')

In [6]:
# Preview the first rows to check the available columns before splitting the data.
df.head()

Unnamed: 0,debate_key,debate_title,username,debate_date,position,is_belief,belief_statement
1,.-Audis-are-junkers-except-to-rich-kids-with-l...,". Audis are junkers, except to rich kids with ...",Max.Wallace,2014-09-04,Pro,1.0,I agree with the following: . Audis are junker...
2,....-Former-Secretary-of-State-Madeleine-Albri...,"....""Former Secretary of State Madeleine Albri...",Lookingatissues,2017-01-30,Pro,1.0,"I agree with the following: ....""Former Secret..."
3,...Words-can-t-hurt-me-any./1/,...Words can't hurt me any.,NonInDelicto,2007-12-19,Pro,1.0,I agree with the following: ...Words can't hur...
4,.9-repeated-is-equal-to-1./1/,.9 repeated is equal to 1.,cowpie1998,2011-04-07,Pro,1.0,I agree with the following: .9 repeated is equ...
5,.99-is-equal-to-one./1/,.99 is equal to one.,SweetCrackerJack,2013-12-24,Pro,1.0,I agree with the following: .99 is equal to one.


In [5]:
# Headline statistics of the loaded dataframe.
# nunique(dropna=False) counts exactly the entries that .unique() would return.
n_rows = len(df)
n_users = df['username'].nunique(dropna=False)

print('data size:', n_rows)
print('num debates:', df['debate_key'].nunique(dropna=False))
print('num debate title:', df['debate_title'].nunique(dropna=False))
print('num users:', n_users)
print('average participation:', n_rows/n_users  )
print('Unique beliefs', df['belief_statement'].nunique(dropna=False))

data size: 192307
num debates: 65861
num debate title: 59986
num users: 40280
average participation: 4.7742552135054614
Unique beliefs 119972


### Generate 5-fold dataset

In [8]:
# Unique debate titles, in order of first appearance; the K-fold split below is done over these.
debate_titles = df['debate_title'].unique()
print(f"There are {len(debate_titles)} unique debates in debate.org dataset")

There are 59986 unique debates in debate.org dataset


In [9]:
# Split the unique debate titles into 5 folds. shuffle is left at its default (off), so each
# test fold is a contiguous slice of debate_titles, as the fold indices printed further down show.
kf = KFold(n_splits=5)
# Bare expression: displays the configured number of splits as the cell output.
kf.get_n_splits(debate_titles)

5

In [10]:
# Materialise the index arrays of every fold so later cells can look them up by fold number.
train_indices, test_indices = [], []

for i, fold in enumerate(kf.split(debate_titles)):
    train_index, test_index = fold

    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")
    print(len(train_index), len(test_index))

    train_indices += [train_index]
    test_indices += [test_index]

Fold 0:
  Train: index=[11998 11999 12000 ... 59983 59984 59985]
  Test:  index=[    0     1     2 ... 11995 11996 11997]
47988 11998
Fold 1:
  Train: index=[    0     1     2 ... 59983 59984 59985]
  Test:  index=[11998 11999 12000 ... 23992 23993 23994]
47989 11997
Fold 2:
  Train: index=[    0     1     2 ... 59983 59984 59985]
  Test:  index=[23995 23996 23997 ... 35989 35990 35991]
47989 11997
Fold 3:
  Train: index=[    0     1     2 ... 59983 59984 59985]
  Test:  index=[35992 35993 35994 ... 47986 47987 47988]
47989 11997
Fold 4:
  Train: index=[    0     1     2 ... 47986 47987 47988]
  Test:  index=[47989 47990 47991 ... 59983 59984 59985]
47989 11997


In [None]:
# For each fold, keep the rows whose debate title belongs to the train / test side of the
# split and pickle the two resulting dataframes.
for i in range(5):

    in_train = df['debate_title'].isin(debate_titles[train_indices[i]])
    in_test  = df['debate_title'].isin(debate_titles[test_indices[i]])

    df_train = df[in_train]
    df_test  = df[in_test]

    df_train.to_pickle('../dataset/04_K-fold_dataset/df_train_idx%d.p'%(i))
    df_test.to_pickle('../dataset/04_K-fold_dataset/df_test_idx%d.p'%(i))

### Generate triplets

In [12]:
# Stance prefixes that are prepended to a debate title to form a belief statement.
pro_phrase = 'I agree with the following: '
con_phrase = 'I disagree with the following: '

def get_reverse_phrase(phrase):
    """Return the stance prefix opposite to `phrase`.

    Args:
        phrase: exactly `pro_phrase` or `con_phrase` (including the trailing space).

    Returns:
        `con_phrase` when given `pro_phrase`, and vice versa.

    Raises:
        ValueError: if `phrase` is neither of the two known prefixes. Previously this case
            printed 'error' and returned None, which only failed later with an unrelated
            TypeError when the caller concatenated the result.
    """
    if phrase == pro_phrase:
        return con_phrase
    if phrase == con_phrase:
        return pro_phrase
    raise ValueError('unknown stance phrase: %r' % (phrase,))

def get_opposite_belief(belief_statement):
    """Return `belief_statement` with its stance prefix flipped (agree <-> disagree).

    The debate title is kept character-for-character, so the result can be used as a lookup
    key against other raw belief statements. Re-joining the title from whitespace-split
    tokens would collapse repeated spaces, tabs and newlines inside the title.

    Args:
        belief_statement: a string of the form `pro_phrase + title` or `con_phrase + title`.

    Returns:
        The same title prefixed with the opposite stance phrase.

    Raises:
        ValueError: if the statement does not start with a known stance phrase.
    """
    # Fast path: exact prefix match keeps the title untouched.
    for phrase in (pro_phrase, con_phrase):
        if belief_statement.startswith(phrase):
            return get_reverse_phrase(phrase) + belief_statement[len(phrase):]

    # Fallback for irregular spacing around the prefix: compare the first five
    # whitespace-separated tokens against the known phrases, as the original code did.
    tokens = belief_statement.split()
    position = " ".join(tokens[:5]) + ' '
    title = " ".join(tokens[5:])
    return get_reverse_phrase(position) + title

In [13]:
# Example: flipping the stance prefix of an "agree" belief yields the matching "disagree" belief.
get_opposite_belief('I agree with the following: apples are delicious.')

'I disagree with the following: apples are delicious.'

In [14]:
#build the belief co-occurrence dictionary
def get_belief_cooccurrence_dic(df):
    """Map every belief to the beliefs held by the same users.

    Args:
        df: dataframe with 'username', 'debate_date' and 'belief_statement' columns.

    Returns:
        defaultdict(list): belief -> list of the other beliefs of each user who holds it.
        A belief appears once per user sharing it, so entries may repeat across users.
        Users with a single distinct belief contribute nothing.
    """
    # One list of distinct beliefs per user, ordered by debate date.
    per_user_beliefs = [
        list(user_rows.sort_values(by='debate_date')['belief_statement'].unique())
        for _, user_rows in df.groupby('username')
    ]

    belief2list = defaultdict(list)

    for beliefs in per_user_beliefs:
        if len(beliefs) == 1:
            continue

        for anchor in beliefs:
            belief2list[anchor].extend(other for other in beliefs if other != anchor)

    return belief2list

In [15]:
# Co-occurrence dictionary over the full dataframe (not restricted to a single fold).
dic_co = get_belief_cooccurrence_dic(df)

In [16]:
# Number of distinct beliefs that co-occur with at least one other belief of the same user.
len(dic_co)

102881

In [17]:
#Get triplets using the belief co-occurrence dictionary
def get_stance_triplet(belief2list, thres=5, seed=None):
    """Build [anchor, positive, negative] belief triplets from a co-occurrence dictionary.

    For every belief (anchor):
      * positives are beliefs that co-occur with the anchor;
      * negatives are the directly opposite belief plus, when present in the dictionary,
        the beliefs co-occurring with that opposite belief.
    Long candidate lists are randomly down-sampled without replacement to `thres` entries;
    the directly opposite belief is always kept as the first negative.

    Args:
        belief2list: mapping belief -> list of co-occurring beliefs.
        thres: maximum number of positives and of negatives per anchor (must be >= 1).
        seed: optional integer seed for reproducible sampling. When None, the global
            NumPy random state is used.

    Returns:
        list of [anchor, positive, negative] lists. Sampling is done on positions, so the
        elements are the original objects from `belief2list` rather than NumPy string scalars.

    Raises:
        ValueError: if `thres` is smaller than 1.
    """
    if thres < 1:
        raise ValueError('thres must be >= 1')

    # np.random and RandomState expose the same choice() interface.
    rng = np.random if seed is None else np.random.RandomState(seed)

    belief_triplet = []

    for anchor in tqdm(belief2list):

        positive_samples = belief2list[anchor]
        opposite_belief = get_opposite_belief(anchor)

        # Membership test rather than indexing: indexing a defaultdict with a missing key
        # would insert it and change the dictionary while it is being iterated.
        negative_samples = [opposite_belief]
        if opposite_belief in belief2list:
            negative_samples = negative_samples + belief2list[opposite_belief]

        #if vote history is too long: sample `thres` stances from history
        if len(positive_samples) > thres-1:
            picked = rng.choice(len(positive_samples), size=thres, replace=False)
            positive_samples = [positive_samples[j] for j in picked]

        if len(negative_samples) > thres-1:
            #to ensure including directly opposite stance
            others = negative_samples[1:]
            if thres > 1:
                picked = rng.choice(len(others), size=thres-1, replace=False)
            else:
                picked = []
            negative_samples = negative_samples[:1] + [others[j] for j in picked]

        #make triplet examples
        for pos in positive_samples:
            for neg in negative_samples:
                belief_triplet.append([anchor, pos, neg])

    return belief_triplet

In [None]:
# For every fold: reload the saved split, rebuild the co-occurrence dictionaries,
# generate train and test triplets (train first), and pickle both lists.
for i in tqdm(range(5)):
    df_train, df_test = (
        pd.read_pickle('../dataset/04_K-fold_dataset/df_train_idx%d.p'%(i)),
        pd.read_pickle('../dataset/04_K-fold_dataset/df_test_idx%d.p'%(i)),
    )

    belief2list_train, belief2list_test = (
        get_belief_cooccurrence_dic(df_train),
        get_belief_cooccurrence_dic(df_test),
    )

    train_triplets, test_triplets = (
        get_stance_triplet(belief2list_train),
        get_stance_triplet(belief2list_test),
    )

    outputs = (
        ('../dataset/04_K-fold_triplets/train_triplet_idx%d.p'%i, train_triplets),
        ('../dataset/04_K-fold_triplets/test_triplet_idx%d.p'%i, test_triplets),
    )
    for out_path, triplet_list in outputs:
        with open(out_path,'wb') as f:
            pickle.dump(triplet_list, f)

## Train / Test set with commonly appearing users

In [21]:
# Restrict every fold to the users that appear in BOTH its train and test split,
# and collect the filtered dataframes per fold.
train_sets = []
test_sets  = []

BASE_PATH = '../dataset/04_K-fold_dataset/'

for i in range(5):

    df_train = pd.read_pickle(BASE_PATH + 'df_train_idx%d.p'%(i))
    df_test  = pd.read_pickle(BASE_PATH + 'df_test_idx%d.p'%(i))

    train_users = df_train.username.unique()
    test_users  = df_test.username.unique()

    # Set membership is a hash lookup; `u in train_users` on the NumPy array scanned the
    # whole array once per test user. The order of common_users still follows test_users.
    train_user_set = set(train_users)
    common_users = [u for u in test_users if u in train_user_set]

    df_train_common = df_train[df_train['username'].isin(common_users)]
    df_test_common  = df_test[df_test['username'].isin(common_users)]

    train_sets.append(df_train_common)
    test_sets.append(df_test_common)

    print("Dataset idx:%d"%(i))
    print("# votes: Train, Train_common, Test, Test_common")
    print(len(df_train), len(df_train_common), len(df_test), len(df_test_common))
    print("# voters: Train, Train_common, Test, Test_common")
    print(len(df_train.username.unique()), len(df_train_common.username.unique()), len(df_test.username.unique()), len(df_test_common.username.unique()))
    print()

Dataset idx:0
# votes: Train, Train_common, Test, Test_common
153698 111724 38609 33208
# voters: Train, Train_common, Test, Test_common
35447 10173 15006 10173

Dataset idx:1
# votes: Train, Train_common, Test, Test_common
151618 110860 40689 35171
# voters: Train, Train_common, Test, Test_common
35362 10401 15319 10401

Dataset idx:2
# votes: Train, Train_common, Test, Test_common
151524 107163 40783 36060
# voters: Train, Train_common, Test, Test_common
36092 9706 13894 9706

Dataset idx:3
# votes: Train, Train_common, Test, Test_common
157602 113625 34705 27733
# voters: Train, Train_common, Test, Test_common
34205 9675 15750 9675

Dataset idx:4
# votes: Train, Train_common, Test, Test_common
154786 110227 37521 33007
# voters: Train, Train_common, Test, Test_common
36220 9744 13804 9744



In [None]:
# Write the common-user train / test dataframes of each fold to their own directory.
BASE_PATH = '../dataset/04_K-fold_dataset_commonusers/'

for fold_idx in range(5):
    train_path = BASE_PATH + 'df_commonuser_train_idx%d.p'%(fold_idx)
    test_path  = BASE_PATH + 'df_commonuser_test_idx%d.p'%(fold_idx)

    train_sets[fold_idx].to_pickle(train_path)
    test_sets[fold_idx].to_pickle(test_path)

## Fine-tuning Sentence-BERT model

### Training S-BERT

In [None]:
# Fine-tune the pretrained S-BERT checkpoint on the training triplets of each fold,
# writing one saved model directory per (fold, epoch).
for data_index in range(5): #iteration over K-fold 

    #Load dataset 
    filepath = '../dataset/04_K-fold_triplets/train_triplet_idx%d.p'%(data_index) 

    # NOTE(review): pickle.load can execute arbitrary code; only open triplet files generated by this notebook.
    with open(filepath,'rb') as f:
        triplet_data = pickle.load(f)

    #Make InputExamples to use it as input for Data loader 
    # Each entry of triplet_data is an [anchor, positive, negative] list of belief statements.
    triplets = []
    for e in tqdm(triplet_data):
        triplets.append(InputExample(texts = e))

    #Data Loader 
    batch_size = 32
    loader = DataLoader(triplets, shuffle=True, batch_size=batch_size)    
   
    num_epochs = 5
    for epoch in range(num_epochs):
        
        print("Dataidx: %d, epoch: %d"%(data_index, epoch))
        
        # Epoch 0 starts from the pretrained checkpoint; every later epoch reloads the model
        # that the previous iteration saved (its savepath used epoch+1, i.e. the current epoch).
        # NOTE(review): the reload also means fit() starts with a fresh optimizer/scheduler
        # each epoch — confirm this is intended.
        if epoch == 0:
            model = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')        
        else:
            model = SentenceTransformer('../model/roberta-base_idx%d_epoch%d'%(data_index, epoch))
            
        # Use the first GPU when available, otherwise fall back to CPU.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        model.to(device)

        # Triplet loss with the library's default settings (no distance metric or margin passed).
        loss = losses.TripletLoss(model)
        savepath = '../model/roberta-base_idx%d_epoch%d'%(data_index, epoch+1)
        
        # One pass over the loader per call; the result is written to savepath.
        model.fit(
            train_objectives=[(loader, loss)],
            epochs=1,
            output_path=savepath,
            show_progress_bar=True
        )


### Training original-BERT

In [None]:
# Fine-tune a plain BERT encoder (mean pooling on top) on the training triplets of each fold,
# writing one saved model directory per (fold, epoch).
for data_index in range(5): 

    #Load dataset 
    filepath = '../dataset/04_K-fold_triplets/train_triplet_idx%d.p'%(data_index) 

    # NOTE(review): pickle.load can execute arbitrary code; only open triplet files generated by this notebook.
    with open(filepath,'rb') as f:
        triplet_data = pickle.load(f)

    print(f"Total # tripliets : {len(triplet_data)}")

    
    #Make InputExamples to use it as input for Data loader 
    # Each entry of triplet_data is an [anchor, positive, negative] list of belief statements.
    triplets = []
    for e in tqdm(triplet_data):
        triplets.append(InputExample(texts = e))

    #Data Loader 
    batch_size = 32
    loader = DataLoader(triplets, shuffle=True, batch_size=batch_size)    


    #Model preparation - BERT 
    # A fresh 'bert-base-uncased' encoder is created for every fold, so folds do not share weights.
    bert = models.Transformer('bert-base-uncased')

    pooler = models.Pooling(
                bert.get_word_embedding_dimension(), #768
                    pooling_mode_mean_tokens=True #mean pooling
                    )
    model = SentenceTransformer(modules=[bert, pooler])

    # Use the first GPU when available, otherwise fall back to CPU.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)
    print("device: ", model.device)


    #Train model 
    # Triplet loss with the library's default settings (no distance metric or margin passed).
    loss = losses.TripletLoss(model)
    epochs = 5
    
    # The same in-memory model keeps training across epochs; each fit() call runs one epoch
    # and writes the current state to that epoch's savepath.
    for epoch in range(epochs):
        
        savepath = '../model/finetuned-BERT_idx%d_epoch%d'%(data_index, epoch+1)
        
        model.fit(
            train_objectives=[(loader, loss)],
            epochs=1,
            output_path=savepath,
            show_progress_bar=True
        )

# 02 Evaluation 