In [1]:
import pickle
import pandas as pd
from pandas import Series
from tqdm import tqdm
import seaborn as sns
import numpy as np
import matplotlib

%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import font_manager as fm
from matplotlib import rc


import os

# Render figures at a higher resolution than matplotlib's default.
matplotlib.rcParams['figure.dpi'] = 150

# Helvetica lives at this path on macOS only. When the file is absent,
# FontProperties(fname=...).get_name() fails, so fall back to matplotlib's
# default font instead of aborting the very first cell on other platforms.
f_path = "/System/Library/Fonts/Helvetica.ttc"
if os.path.exists(f_path):
    font_name = fm.FontProperties(fname=f_path).get_name()
else:
    font_name = fm.FontProperties().get_name()
rc('font', family=font_name, size=13)

# Generation of Belief triplets

### Load dataset

In [45]:
# Load the belief dataframe from a pickle file; later cells read its debate_key,
# debate_title, username, debate_date and belief_statement columns.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle('../dataset/01_Final_dataframe/df_ddo_including_only_truebeliefs_nodup(N192307).p')

In [46]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,debate_key,debate_title,username,debate_date,position,is_belief,belief_statement
1,.-Audis-are-junkers-except-to-rich-kids-with-l...,". Audis are junkers, except to rich kids with ...",Max.Wallace,2014-09-04,Pro,1.0,I agree with the following: . Audis are junker...
2,....-Former-Secretary-of-State-Madeleine-Albri...,"....""Former Secretary of State Madeleine Albri...",Lookingatissues,2017-01-30,Pro,1.0,"I agree with the following: ....""Former Secret..."
3,...Words-can-t-hurt-me-any./1/,...Words can't hurt me any.,NonInDelicto,2007-12-19,Pro,1.0,I agree with the following: ...Words can't hur...
4,.9-repeated-is-equal-to-1./1/,.9 repeated is equal to 1.,cowpie1998,2011-04-07,Pro,1.0,I agree with the following: .9 repeated is equ...
5,.99-is-equal-to-one./1/,.99 is equal to one.,SweetCrackerJack,2013-12-24,Pro,1.0,I agree with the following: .99 is equal to one.


### Basic statistics

In [4]:
# Summary counts for the loaded dataframe. The row and user counts are computed
# once because the participation average needs both of them.
n_rows = len(df)
n_users = len(df['username'].unique())

print('data size:', n_rows)
for label, column in (('num debates:', 'debate_key'),
                      ('num debate title:', 'debate_title')):
    print(label, len(df[column].unique()))
print('num users:', n_users)
print('average participation:', n_rows / n_users)
print('Unique beliefs', len(df['belief_statement'].unique()))

data size: 192307
num debates: 65861
num debate title: 59986
num users: 40280
average participation: 4.7742552135054614
Unique beliefs 119972


### 5-fold dataset 

 * From the original dataframe, we generate a 5-fold split grouped by debate title: all rows that share a title fall into the same fold.

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [18]:
# Distinct debate titles in order of first appearance in df; the folds are built over this array.
debate_titles = df['debate_title'].unique()
print(f"There are {len(debate_titles)} unique debates in debate.org dataset")

There are 59986 unique debates in debate.org dataset


In [19]:
# Split the unique titles into 5 folds.
# NOTE(review): KFold is created without shuffle, so each test fold is a contiguous
# slice of debate_titles (which follows the row order of df). If df is sorted — the
# head() preview suggests ordering by debate_key — the folds are not random samples;
# consider shuffle=True with a fixed random_state. TODO confirm contiguous folds are intended.
kf = KFold(n_splits=5)
# Display the number of splitting iterations the cross-validator performs.
kf.get_n_splits(debate_titles)

5

In [20]:
# Materialise the folds once, report them, and keep the positional index arrays
# of every fold so a later cell can map them back to debate titles.
fold_splits = list(kf.split(debate_titles))

for fold_no, (fold_train, fold_test) in enumerate(fold_splits):
    print(f"Fold {fold_no}:")
    print(f"  Train: index={fold_train}")
    print(f"  Test:  index={fold_test}")
    print(len(fold_train), len(fold_test))

train_indices = [fold_train for fold_train, _ in fold_splits]
test_indices = [fold_test for _, fold_test in fold_splits]

Fold 0:
  Train: index=[11998 11999 12000 ... 59983 59984 59985]
  Test:  index=[    0     1     2 ... 11995 11996 11997]
47988 11998
Fold 1:
  Train: index=[    0     1     2 ... 59983 59984 59985]
  Test:  index=[11998 11999 12000 ... 23992 23993 23994]
47989 11997
Fold 2:
  Train: index=[    0     1     2 ... 59983 59984 59985]
  Test:  index=[23995 23996 23997 ... 35989 35990 35991]
47989 11997
Fold 3:
  Train: index=[    0     1     2 ... 59983 59984 59985]
  Test:  index=[35992 35993 35994 ... 47986 47987 47988]
47989 11997
Fold 4:
  Train: index=[    0     1     2 ... 47986 47987 47988]
  Test:  index=[47989 47990 47991 ... 59983 59984 59985]
47989 11997


In [21]:
# Write one train/test dataframe pair per fold. Rows are selected by debate
# title, so all rows of a given title end up on the same side of the split.
for fold in range(5):
    in_train = df['debate_title'].isin(debate_titles[train_indices[fold]])
    in_test = df['debate_title'].isin(debate_titles[test_indices[fold]])
    df_train, df_test = df[in_train], df[in_test]

    df_train.to_pickle('../dataset/04_K-fold_dataset/df_train_idx%d.p' % fold)
    df_test.to_pickle('../dataset/04_K-fold_dataset/df_test_idx%d.p' % fold)

    print("Train size:", len(df_train), "Test size:", len(df_test))

Train size: 153698 Test size: 38609
Train size: 151618 Test size: 40689
Train size: 151524 Test size: 40783
Train size: 157602 Test size: 34705
Train size: 154786 Test size: 37521


## Generation of the Triplet dataset

In [23]:
from collections import defaultdict 
from itertools import combinations

In [26]:
# Stance prefixes expected at the start of a belief statement (trailing space included).
pro_phrase = 'I agree with the following: '
con_phrase = 'I disagree with the following: '


def get_reverse_phrase(phrase):
    """Return the stance prefix opposite to ``phrase``.

    Args:
        phrase: Either ``pro_phrase`` or ``con_phrase``, trailing space included.

    Returns:
        ``con_phrase`` when given ``pro_phrase`` and vice versa.

    Raises:
        ValueError: If ``phrase`` is neither of the two stance prefixes.
    """
    if phrase == pro_phrase:
        return con_phrase
    if phrase == con_phrase:
        return pro_phrase
    # Fail loudly: silently returning None only postpones the failure to the
    # caller's string concatenation, where the cause is much harder to see.
    raise ValueError('unknown stance phrase: %r' % (phrase,))


def get_opposite_belief(belief_statement):
    """Return ``belief_statement`` with its stance prefix flipped.

    The text after the prefix is kept verbatim. Rebuilding it from whitespace
    tokens would collapse runs of spaces, tabs or newlines inside the title, and
    the result would then no longer equal the belief statement that actually
    carries the opposite stance.

    Args:
        belief_statement: A string that starts with ``pro_phrase`` or ``con_phrase``.

    Returns:
        The same statement with the agree/disagree prefix swapped.

    Raises:
        ValueError: If the statement does not start with a known stance prefix.
    """
    for phrase in (pro_phrase, con_phrase):
        if belief_statement.startswith(phrase):
            return get_reverse_phrase(phrase) + belief_statement[len(phrase):]

    # The prefix itself is irregularly spaced (e.g. leading blanks or a missing
    # trailing space): fall back to token-based parsing of the first five words.
    tokens = belief_statement.split()
    position = " ".join(tokens[:5]) + ' '
    title = " ".join(tokens[5:])
    return get_reverse_phrase(position) + title

In [32]:
# Example: flipping the stance prefix of a single belief statement.
get_opposite_belief('I agree with the following: apples are delicious.')

'I disagree with the following: apples are delicious.'

In [33]:
# Build the belief co-occurrence dictionary
def get_belief_cooccurrence_dic(df):
    """Map each belief to the beliefs held by the same users.

    Args:
        df: DataFrame with ``username``, ``debate_date`` and
            ``belief_statement`` columns.

    Returns:
        A ``defaultdict(list)``. For every user with more than one distinct
        belief, each of that user's beliefs gets all of the user's other
        beliefs appended, in date order. A belief pair shared by several users
        is therefore listed once per user.
    """
    # One list of distinct beliefs per user, ordered by debate date.
    histories = [
        list(rows.sort_values(by='debate_date')['belief_statement'].unique())
        for _, rows in df.groupby('username')
    ]

    cooccurrence = defaultdict(list)
    for history in histories:
        # A single belief has nothing to co-occur with.
        if len(history) == 1:
            continue
        for anchor in history:
            for other in history:
                if anchor != other:
                    cooccurrence[anchor].append(other)

    return cooccurrence

In [34]:
# Build the co-occurrence dictionary over the full dataframe (not a single fold).
dic_co = get_belief_cooccurrence_dic(df)

In [35]:
# Number of distinct beliefs that have at least one co-occurring belief.
len(dic_co)

102881

In [36]:
#Get triplets using the belief co-occurrence dictionary
def get_stance_triplet(belief2list, thres=5, seed=None):
    """Build ``[anchor, positive, negative]`` belief triplets.

    Every key of ``belief2list`` serves as an anchor:

    * positives are the beliefs listed for the anchor;
    * negatives are the anchor with its stance prefix flipped, followed by the
      beliefs listed for that flipped statement (if it is a key at all).

    Each list holding ``thres`` or more entries is randomly reduced to ``thres``
    entries; the directly opposite statement is always kept among the negatives.
    All positive/negative combinations are emitted, i.e. at most ``thres ** 2``
    triplets per anchor.

    Args:
        belief2list: Mapping from a belief statement to a list of co-occurring
            belief statements.
        thres: Maximum number of positives and of negatives kept per anchor.
        seed: Optional integer seed. When given, sampling uses a private
            ``np.random.RandomState(seed)`` and is reproducible; when ``None``
            the global NumPy random state is used.

    Returns:
        A list of ``[anchor, positive, negative]`` lists of strings.

    Raises:
        ValueError: If ``thres`` is smaller than 1.
    """
    if thres < 1:
        raise ValueError('thres must be a positive integer')

    rng = np.random if seed is None else np.random.RandomState(seed)

    def _sample(items, k):
        """Return k entries of items drawn at distinct random positions."""
        if k == 0:
            return []
        # Draw positions rather than values: handing a list of strings to
        # choice() yields numpy.str_ objects, which would mix two string types
        # inside the pickled triplets.
        picked = rng.choice(len(items), size=k, replace=False)
        return [items[j] for j in picked]

    belief_triplet = []

    for anchor in tqdm(belief2list):
        positive_samples = belief2list[anchor]
        opposite_belief = get_opposite_belief(anchor)

        # .get() never inserts a key, so the mapping is not modified while it
        # is being iterated even when it is a defaultdict.
        negative_samples = [opposite_belief] + list(belief2list.get(opposite_belief, []))

        # Long histories are subsampled to keep the number of triplets bounded.
        if len(positive_samples) >= thres:
            positive_samples = _sample(positive_samples, thres)

        if len(negative_samples) >= thres:
            # The directly opposite stance is always retained as a negative.
            negative_samples = [opposite_belief] + _sample(negative_samples[1:], thres - 1)

        for pos in positive_samples:
            for neg in negative_samples:
                belief_triplet.append([anchor, pos, neg])

    return belief_triplet

In [37]:
# get_stance_triplet subsamples with NumPy's global random state; seed it once
# so that re-running this cell regenerates identical triplet files.
np.random.seed(42)

# For every fold: rebuild the co-occurrence dictionaries from the saved
# train/test dataframes, derive the triplets and pickle them.
for i in tqdm(range(5)):
    df_train = pd.read_pickle('../dataset/04_K-fold_dataset/df_train_idx%d.p'%(i))
    df_test = pd.read_pickle('../dataset/04_K-fold_dataset/df_test_idx%d.p'%(i))

    # Train and test dictionaries are built independently of each other.
    belief2list_train = get_belief_cooccurrence_dic(df_train)
    belief2list_test  = get_belief_cooccurrence_dic(df_test)

    train_triplets = get_stance_triplet(belief2list_train)
    test_triplets  = get_stance_triplet(belief2list_test)

    with open('../dataset/04_K-fold_triplets/train_triplet_idx%d.p'%i,'wb') as f:
        pickle.dump(train_triplets, f)

    with open('../dataset/04_K-fold_triplets/test_triplet_idx%d.p'%i,'wb') as f:
        pickle.dump(test_triplets, f)

  0%|                                                     | 0/5 [00:00<?, ?it/s]
  0%|                                                 | 0/80441 [00:00<?, ?it/s][A
  0%|▏                                    | 374/80441 [00:00<00:21, 3736.64it/s][A
  1%|▎                                    | 748/80441 [00:00<00:22, 3611.45it/s][A
  1%|▍                                   | 1110/80441 [00:00<00:22, 3543.83it/s][A
  2%|▋                                   | 1633/80441 [00:00<00:18, 4192.04it/s][A
  3%|█▏                                  | 2593/80441 [00:00<00:12, 6055.18it/s][A
  5%|█▋                                  | 3710/80441 [00:00<00:09, 7761.23it/s][A
  6%|██▏                                 | 4807/80441 [00:00<00:08, 8796.11it/s][A
  7%|██▌                                 | 5690/80441 [00:01<00:13, 5503.04it/s][A
  8%|███                                 | 6716/80441 [00:01<00:11, 6548.37it/s][A
 10%|███▌                                | 7902/80441 [00:01<00:09, 7812.14it/s

  2%|▋                                   | 1664/81503 [00:00<00:19, 4173.07it/s][A
  3%|█                                   | 2490/81503 [00:00<00:14, 5587.30it/s][A
  4%|█▌                                  | 3491/81503 [00:00<00:11, 7049.00it/s][A
  5%|█▉                                  | 4412/81503 [00:00<00:09, 7738.34it/s][A
  7%|██▍                                 | 5450/81503 [00:00<00:08, 8566.00it/s][A
  8%|██▊                                 | 6491/81503 [00:00<00:08, 9130.83it/s][A
  9%|███▎                                | 7475/81503 [00:01<00:07, 9346.67it/s][A
 10%|███▊                                | 8554/81503 [00:01<00:07, 9776.38it/s][A
 12%|████▏                               | 9535/81503 [00:01<00:07, 9635.29it/s][A
 13%|████▍                             | 10759/81503 [00:01<00:06, 10411.16it/s][A
 15%|████▉                             | 11829/81503 [00:01<00:06, 10496.07it/s][A
 16%|█████▎                            | 12881/81503 [00:01<00:06, 10168.23i

In [39]:
# train_triplets still holds the value left by the last iteration of the fold loop (fold index 4).
"#triplets ~",len(train_triplets)

('#triplets ~', 1343346)

In [40]:
# Count the triplets stored for each fold by reading the pickled files back.
triplet_files = (
    ("train", '../dataset/04_K-fold_triplets/train_triplet_idx%d.p'),
    ("test", '../dataset/04_K-fold_triplets/test_triplet_idx%d.p'),
)
triplet_counts = {"train": [], "test": []}

for split, pattern in triplet_files:
    for fold in range(5):
        with open(pattern % fold, 'rb') as f:
            n_triplets = len(pickle.load(f))
        print(split, fold, n_triplets)

        triplet_counts[split].append(n_triplets)

numtrip_train = triplet_counts["train"]
numtrip_test = triplet_counts["test"]

train 0 1351986
train 1 1355004
train 2 1328074
train 3 1412283
train 4 1343346
test 0 233404
test 1 229331
test 2 267252
test 3 181810
test 4 243191


In [41]:
# Mean number of triplets per fold, shown as a (train, test) pair.
print("average train/test set size")
tuple(np.average(counts) for counts in (numtrip_train, numtrip_test))

average train/test set size


(1358138.6, 230997.6)

### Train / test sets restricted to users who appear in both

In [43]:
# For each fold, keep only the rows of users who appear in both the train and
# the test dataframe, and report how much data survives the restriction.
train_sets = []
test_sets  = []

BASE_PATH = '../dataset/04_K-fold_dataset/'

for i in range(5):

    df_train = pd.read_pickle(BASE_PATH + 'df_train_idx%d.p'%(i))
    df_test  = pd.read_pickle(BASE_PATH + 'df_test_idx%d.p'%(i))

    train_users = df_train.username.unique()
    test_users  = df_test.username.unique()

    # Look users up in a set: `u in train_users` on the NumPy array rescans the
    # whole array for every test user, which is quadratic in the user counts.
    train_user_set = set(train_users)
    common_users = [u for u in test_users if u in train_user_set]

    df_train_common = df_train[df_train['username'].isin(common_users)]
    df_test_common  = df_test[df_test['username'].isin(common_users)]

    train_sets.append(df_train_common)
    test_sets.append(df_test_common)

    print("# votes: Train, Train_common, Test, Test_common")
    print(len(df_train), len(df_train_common), len(df_test), len(df_test_common))
    print("# voters: Train, Train_common, Test, Test_common")
    print(len(train_users), len(df_train_common.username.unique()), len(test_users), len(df_test_common.username.unique()))
    print()

# votes: Train, Train_common, Test, Test_common
153698 111724 38609 33208
# voters: Train, Train_common, Test, Test_common
35447 10173 15006 10173

# votes: Train, Train_common, Test, Test_common
151618 110860 40689 35171
# voters: Train, Train_common, Test, Test_common
35362 10401 15319 10401

# votes: Train, Train_common, Test, Test_common
151524 107163 40783 36060
# voters: Train, Train_common, Test, Test_common
36092 9706 13894 9706

# votes: Train, Train_common, Test, Test_common
157602 113625 34705 27733
# voters: Train, Train_common, Test, Test_common
34205 9675 15750 9675

# votes: Train, Train_common, Test, Test_common
154786 110227 37521 33007
# voters: Train, Train_common, Test, Test_common
36220 9744 13804 9744



In [44]:
# Persist the common-user subsets, one train/test pickle pair per fold.
BASE_PATH = '../dataset/04_K-fold_dataset_commonusers/'

for fold in range(5):
    train_file = BASE_PATH + 'df_commonuser_train_idx%d.p' % fold
    test_file = BASE_PATH + 'df_commonuser_test_idx%d.p' % fold

    train_sets[fold].to_pickle(train_file)
    test_sets[fold].to_pickle(test_file)