In [1]:
import pickle
import pandas as pd
from pandas import Series
from tqdm import tqdm
import seaborn as sns
import numpy as np

import matplotlib
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import font_manager as fm
from matplotlib import rc


matplotlib.rcParams['figure.dpi'] = 150
f_path = "/System/Library/Fonts/Helvetica.ttc"
font_name = fm.FontProperties(fname=f_path).get_name()
rc('font', family=font_name, size=13)

In [2]:
df = pd.read_pickle('../dataset/03_Final_dataframe/df_ddo_including_only_truebeliefs_nodup(N192307).p')

In [3]:
df.head()

Unnamed: 0,debate_key,debate_title,username,debate_date,position,is_belief,belief_statement
1,.-Audis-are-junkers-except-to-rich-kids-with-l...,". Audis are junkers, except to rich kids with ...",Max.Wallace,2014-09-04,Pro,1.0,I agree with the following: . Audis are junker...
2,....-Former-Secretary-of-State-Madeleine-Albri...,"....""Former Secretary of State Madeleine Albri...",Lookingatissues,2017-01-30,Pro,1.0,"I agree with the following: ....""Former Secret..."
3,...Words-can-t-hurt-me-any./1/,...Words can't hurt me any.,NonInDelicto,2007-12-19,Pro,1.0,I agree with the following: ...Words can't hur...
4,.9-repeated-is-equal-to-1./1/,.9 repeated is equal to 1.,cowpie1998,2011-04-07,Pro,1.0,I agree with the following: .9 repeated is equ...
5,.99-is-equal-to-one./1/,.99 is equal to one.,SweetCrackerJack,2013-12-24,Pro,1.0,I agree with the following: .99 is equal to one.


In [4]:
print('data size:', len(df))
print('num debates:', len(df['debate_key'].unique()))
print('num debate title:', len(df['debate_title'].unique()))
print('num users:', len(df['username'].unique()))
print('average participation:', len(df)/len(df['username'].unique())  )

data size: 192307
num debates: 65861
num debate title: 59986
num users: 40280
average participation: 4.7742552135054614


In [5]:
192307/40280

4.7742552135054614

In [6]:
len(df['belief_statement'].unique())

119972

## 5-fold dataset 

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
debate_titles = df.debate_title.unique()
print(f"There are {len(debate_titles)} unique debates in debate.org dataset")

There are 59986 unique debates in debate.org dataset


In [9]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
kf.get_n_splits(debate_titles)

5

In [10]:
train_indices = []
test_indices  = []

for i, (train_index, test_index) in enumerate(kf.split(debate_titles)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")  
    print(f"  Test:  index={test_index}")    
    print(len(train_index), len(test_index))

    train_indices.append(train_index)
    test_indices.append(test_index)

Fold 0:
  Train: index=[11998 11999 12000 ... 59983 59984 59985]
  Test:  index=[    0     1     2 ... 11995 11996 11997]
47988 11998
Fold 1:
  Train: index=[    0     1     2 ... 59983 59984 59985]
  Test:  index=[11998 11999 12000 ... 23992 23993 23994]
47989 11997
Fold 2:
  Train: index=[    0     1     2 ... 59983 59984 59985]
  Test:  index=[23995 23996 23997 ... 35989 35990 35991]
47989 11997
Fold 3:
  Train: index=[    0     1     2 ... 59983 59984 59985]
  Test:  index=[35992 35993 35994 ... 47986 47987 47988]
47989 11997
Fold 4:
  Train: index=[    0     1     2 ... 47986 47987 47988]
  Test:  index=[47989 47990 47991 ... 59983 59984 59985]
47989 11997


In [34]:
for i in range(5):
    
    train_titles = debate_titles[train_indices[i]]
    test_titles = debate_titles[test_indices[i]]
    df_train = df[df['debate_title'].isin(train_titles)]
    df_test  = df[df['debate_title'].isin(test_titles)]
        
    df_train.to_pickle('../dataset/04_K-fold_dataset/df_train_idx%d.p'%(i))
    df_test.to_pickle('../dataset/04_K-fold_dataset/df_test_idx%d.p'%(i))
    
    print("Train size:",len(df_train), "Test size:",len(df_test))

Train size: 153698 Test size: 38609
Train size: 151618 Test size: 40689
Train size: 151524 Test size: 40783
Train size: 157602 Test size: 34705
Train size: 154786 Test size: 37521


## Triplet dataset 생성

In [35]:
from collections import defaultdict 
from itertools import combinations

In [36]:
pro_phrase = 'I agree with the following: '
con_phrase = 'I disagree with the following: '

def get_reverse_phrase(phrase):
    if phrase == pro_phrase:
        return con_phrase
    elif phrase == con_phrase:
        return pro_phrase
    else: 
        print('error')

In [37]:
def get_opposite_belief(belief_statement):
    position = " ".join(belief_statement.split()[:5]) + ' '
    title = " ".join(belief_statement.split()[5:]) 
    
    position_r = get_reverse_phrase(position)
    opposite_belief = position_r + title
    return opposite_belief

In [38]:
get_opposite_belief('I disagree with the following: abc c c ')

'I agree with the following: abc c c'

In [39]:
def get_belief_cooccurrence_dic(df):
    
    df_g = df.groupby('username')
    corpus = []

    for g, data in df_g:

        data = data.sort_values(by='debate_date')
        user_beliefs = list(data['belief_statement'].unique())
        corpus.append(user_beliefs)
        
    
    #co-occuring stance dictionary
    belief2list = defaultdict(list)
    
    for b_list in corpus:
        if len(b_list) == 1: 
            continue

        for e1 in b_list:
            for e2 in b_list:
                if e1 != e2:
                    belief2list[e1].append(e2)
                    
    return belief2list

In [40]:
def get_stance_triplet(belief2list):
    
    belief_triplet = []

    for s in tqdm(belief2list):

        anchor = s
        positive_samples = belief2list[s] 
        opposite_belief = get_opposite_belief(s)

        if not opposite_belief in belief2list: #use only direct opposite stance as a negative sample
            negative_samples = [opposite_belief]
        else:
            negative_samples = [opposite_belief] + belief2list[opposite_belief]

        #if vote history is too long: Sample 5 stances from history 
        thres = 5
        if len(positive_samples) > thres-1:
            positive_samples = np.random.choice(positive_samples, size=thres, replace=False)

        if len(negative_samples) > thres-1:
            #to ensure including directly opposite stance
            other_samples = np.random.choice(negative_samples[1:], size=thres-1, replace=False)        
            negative_samples = np.concatenate((negative_samples[:1], other_samples)) 

        #make triplet examples 
        for pos in positive_samples:
            for neg in negative_samples:
                example = [anchor, pos, neg]
                belief_triplet.append(example)
    
    return belief_triplet

In [41]:
for i in tqdm(range(5)):
    df_train = pd.read_pickle('../dataset/04_K-fold_dataset/df_train_idx%d.p'%(i))
    df_test = pd.read_pickle('../dataset/04_K-fold_dataset/df_test_idx%d.p'%(i))

    belief2list_train = get_belief_cooccurrence_dic(df_train)
    belief2list_test  = get_belief_cooccurrence_dic(df_test)
    
    train_triplets = get_stance_triplet(belief2list_train)
    test_triplets  = get_stance_triplet(belief2list_test)
    
    with open('../dataset/04_K-fold_triplets/train_triplet_idx%d.p'%i,'wb') as f:
        pickle.dump(train_triplets, f)
        
    with open('../dataset/04_K-fold_triplets/test_triplet_idx%d.p'%i,'wb') as f:
        pickle.dump(test_triplets, f)

  0%|                                                                                                            | 0/5 [00:00<?, ?it/s]
  0%|                                                                                                        | 0/80441 [00:00<?, ?it/s][A
  0%|▍                                                                                           | 366/80441 [00:00<00:21, 3658.43it/s][A
  1%|▊                                                                                           | 732/80441 [00:00<00:22, 3573.77it/s][A
  1%|█▏                                                                                         | 1097/80441 [00:00<00:21, 3606.78it/s][A
  2%|█▉                                                                                         | 1761/80441 [00:00<00:16, 4793.48it/s][A
  3%|██▉                                                                                        | 2627/80441 [00:00<00:12, 6176.39it/s][A
  5%|████▎                    

100%|█████████████████████████████████████████████████████████████████████████████████████████| 80441/80441 [00:06<00:00, 12120.82it/s][A

  0%|                                                                                                        | 0/16939 [00:00<?, ?it/s][A
 14%|█████████████                                                                             | 2450/16939 [00:00<00:00, 24497.34it/s][A
 29%|██████████████████████████▎                                                                | 4900/16939 [00:00<00:01, 6240.92it/s][A
 48%|███████████████████████████████████████████▏                                              | 8133/16939 [00:00<00:00, 10839.93it/s][A
 70%|█████████████████████████████████████████████████████████████▉                           | 11788/16939 [00:00<00:00, 16011.82it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████| 16939/16939 [00:01<00:00, 16491.50it/s][A
 20%|████████████████████ 

 85%|████████████████████████████████████████████████████████████████████████████             | 68719/80404 [00:06<00:00, 13857.56it/s][A
 88%|██████████████████████████████████████████████████████████████████████████████           | 70477/80404 [00:06<00:00, 11458.91it/s][A
 91%|████████████████████████████████████████████████████████████████████████████████▋        | 72915/80404 [00:06<00:00, 14178.27it/s][A
 94%|███████████████████████████████████████████████████████████████████████████████████▎     | 75279/80404 [00:06<00:00, 16362.75it/s][A
 97%|█████████████████████████████████████████████████████████████████████████████████████▉   | 77636/80404 [00:06<00:00, 18152.35it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████| 80404/80404 [00:07<00:00, 11359.96it/s][A

  0%|                                                                                                        | 0/16925 [00:00<?, ?it/s][A
 14%|████████████▏        

 86%|████████████████████████████████████████████████████████████████████████████▍            | 68835/80113 [00:05<00:00, 19491.00it/s][A
 88%|██████████████████████████████████████████████████████████████████████████████▋          | 70881/80113 [00:06<00:00, 15844.94it/s][A
 92%|█████████████████████████████████████████████████████████████████████████████████▋       | 73545/80113 [00:06<00:00, 18481.30it/s][A
 95%|████████████████████████████████████████████████████████████████████████████████████▍    | 76040/80113 [00:06<00:00, 20154.80it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████| 80113/80113 [00:06<00:00, 12470.54it/s][A

  0%|                                                                                                        | 0/17750 [00:00<?, ?it/s][A
 10%|█████████▏                                                                                | 1801/17750 [00:00<00:00, 18006.58it/s][A
 23%|████████████████████▌

 66%|███████████████████████████████████████████████████████████▏                             | 54194/81503 [00:05<00:01, 16250.99it/s][A
 69%|████████████████████████████████████████████████████████████▉                            | 55854/81503 [00:05<00:01, 16196.46it/s][A
 71%|██████████████████████████████████████████████████████████████▊                          | 57572/81503 [00:05<00:01, 16481.53it/s][A
 73%|████████████████████████████████████████████████████████████████▉                        | 59519/81503 [00:05<00:01, 17356.68it/s][A
 75%|██████████████████████████████████████████████████████████████████▉                      | 61270/81503 [00:05<00:01, 16637.41it/s][A
 77%|████████████████████████████████████████████████████████████████████▋                    | 62950/81503 [00:05<00:01, 15504.71it/s][A
 80%|██████████████████████████████████████████████████████████████████████▊                  | 64805/81503 [00:05<00:01, 16344.67it/s][A
 82%|██████████████████████

 60%|█████████████████████████████████████████████████████                                    | 47893/80312 [00:04<00:02, 15758.02it/s][A
 62%|██████████████████████████████████████████████████████▉                                  | 49532/80312 [00:04<00:01, 15588.38it/s][A
 64%|████████████████████████████████████████████████████████▉                                | 51409/80312 [00:04<00:01, 16491.01it/s][A
 66%|███████████████████████████████████████████████████████████                              | 53307/80312 [00:04<00:01, 17208.74it/s][A
 69%|█████████████████████████████████████████████████████████████                            | 55056/80312 [00:04<00:01, 17266.80it/s][A
 71%|███████████████████████████████████████████████████████████████                          | 56916/80312 [00:05<00:01, 17657.97it/s][A
 73%|█████████████████████████████████████████████████████████████████▎                       | 58961/80312 [00:05<00:01, 18481.13it/s][A
 76%|██████████████████████

In [22]:
"#triplets ~",len(train_triplets)

('#triplets ~', 1343346)

In [42]:
numtrip_train = []

for i in range(5):
    with open('../dataset/04_K-fold_triplets/train_triplet_idx%d.p'%i,'rb') as f:
        dat = pickle.load(f)
    print("train", i, len(dat))
    
    numtrip_train.append(len(dat))
    
numtrip_test = []

for i in range(5):
    with open('../dataset/04_K-fold_triplets/test_triplet_idx%d.p'%i,'rb') as f:
        dat = pickle.load(f)
    print("test", i, len(dat))
    
    numtrip_test.append(len(dat))    

train 0 1351986
train 1 1355004
train 2 1328074
train 3 1412283
train 4 1343346
test 0 233404
test 1 229331
test 2 267252
test 3 181810
test 4 243191


In [43]:
np.average(numtrip_train), np.average(numtrip_test)

(1358138.6, 230997.6)

## 중복 데이터를 제거 

In [47]:
train_sets = []
test_sets  = []


for i in range(5):    
    
    df_train = pd.read_pickle('../dataset/04_K-fold_dataset/df_train_idx%d.p'%(i))
    df_test  = pd.read_pickle('../dataset/04_K-fold_dataset/df_test_idx%d.p'%(i))
    
    df_train_nodup = df_train[~df_train.duplicated()]
    df_test_nodup  = df_test[~df_test.duplicated()]
    
    df_train_nodup.to_pickle('../dataset/04_K-fold_dataset_nodup/df_train_idx%d.p'%(i))
    df_test_nodup.to_pickle('../dataset/04_K-fold_dataset_nodup/df_test_idx%d.p'%(i))
    
    print(len(df_train)-len(df_train_nodup))
    print(len(df_test)-len(df_test_nodup))

4098
901
4021
978
3592
1407
4350
649
3935
1064


### downstream을 위해 공통 유저만 남긴 데이터 (train test에서) nodup version으로

In [49]:
train_sets = []
test_sets  = []

BASE_PATH = '../dataset/04_K-fold_dataset_nodup/'

for i in range(5):    
    
    df_train = pd.read_pickle(BASE_PATH + 'df_train_idx%d.p'%(i))
    df_test  = pd.read_pickle(BASE_PATH + 'df_test_idx%d.p'%(i))
        
    train_users = df_train.username.unique()
    test_users  = df_test.username.unique()
    
    common_users = []
    for u in test_users:
        if u in train_users:
            common_users.append(u)
    
    df_train_common = df_train[df_train['username'].isin(common_users)]
    df_test_common  = df_test[df_test['username'].isin(common_users)]
    
    
    train_sets.append(df_train_common)
    test_sets.append(df_test_common)
    
    
    print("# votes: Train, Train_common, Test, Test_common")
    print(len(df_train), len(df_train_common), len(df_test), len(df_test_common))
    print("# voters: Train, Train_common, Test, Test_common")
    print(len(df_train.username.unique()), len(df_train_common.username.unique()), len(df_test.username.unique()), len(df_test_common.username.unique()))
    print()

# votes: Train, Train_common, Test, Test_common
153698 111724 38609 33208
# voters: Train, Train_common, Test, Test_common
35447 10173 15006 10173

# votes: Train, Train_common, Test, Test_common
151618 110860 40689 35171
# voters: Train, Train_common, Test, Test_common
35362 10401 15319 10401

# votes: Train, Train_common, Test, Test_common
151524 107163 40783 36060
# voters: Train, Train_common, Test, Test_common
36092 9706 13894 9706

# votes: Train, Train_common, Test, Test_common
157602 113625 34705 27733
# voters: Train, Train_common, Test, Test_common
34205 9675 15750 9675

# votes: Train, Train_common, Test, Test_common
154786 110227 37521 33007
# voters: Train, Train_common, Test, Test_common
36220 9744 13804 9744



In [52]:
BASE_PATH = '../dataset/04_K-fold_dataset_commonusers_nodup/'

for i in range(5):    
    
    train_sets[i].to_pickle(BASE_PATH + 'df_commonuser_train_idx%d.p'%(i))
    test_sets[i].to_pickle(BASE_PATH + 'df_commonuser_test_idx%d.p'%(i))
    

### Original ver

In [55]:
train_sets = []
test_sets  = []

BASE_PATH = '../dataset/04_K-fold_dataset/'

for i in range(5):    
    
    df_train = pd.read_pickle(BASE_PATH + 'df_train_idx%d.p'%(i))
    df_test  = pd.read_pickle(BASE_PATH + 'df_test_idx%d.p'%(i))
        
    train_users = df_train.username.unique()
    test_users  = df_test.username.unique()
    
    common_users = []
    for u in test_users:
        if u in train_users:
            common_users.append(u)
    
    df_train_common = df_train[df_train['username'].isin(common_users)]
    df_test_common  = df_test[df_test['username'].isin(common_users)]
    
    
    train_sets.append(df_train_common)
    test_sets.append(df_test_common)
    
    
    print("# votes: Train, Train_common, Test, Test_common")
    print(len(df_train), len(df_train_common), len(df_test), len(df_test_common))
    print("# voters: Train, Train_common, Test, Test_common")
    print(len(df_train.username.unique()), len(df_train_common.username.unique()), len(df_test.username.unique()), len(df_test_common.username.unique()))
    print()

# votes: Train, Train_common, Test, Test_common
157796 115432 39510 34091
# voters: Train, Train_common, Test, Test_common
35447 10173 15006 10173

# votes: Train, Train_common, Test, Test_common
155639 114565 41667 36126
# voters: Train, Train_common, Test, Test_common
35362 10401 15319 10401

# votes: Train, Train_common, Test, Test_common
155116 110549 42190 37435
# voters: Train, Train_common, Test, Test_common
36092 9706 13894 9706

# votes: Train, Train_common, Test, Test_common
161952 117424 35354 28366
# voters: Train, Train_common, Test, Test_common
34205 9675 15750 9675

# votes: Train, Train_common, Test, Test_common
158721 113819 38585 34057
# voters: Train, Train_common, Test, Test_common
36220 9744 13804 9744



In [58]:
BASE_PATH = '../data/04_K-fold_triplets_commonusers/'

for i in range(5):    
    
    train_sets[i].to_pickle(BASE_PATH + 'df_commonuser_train_idx%d.p'%(i))
    test_sets[i].to_pickle(BASE_PATH + 'df_commonuser_test_idx%d.p'%(i))
    