In [11]:
import numpy as np
import pandas as pd

In [149]:
# Load the raw expert-annotation file (tab-separated) into a DataFrame.
data = pd.read_csv(
    '../data/raw/ydata-ynacc-v1_0_expert_annotations.tsv',
    sep='\t',
)

In [117]:
# Group by the scalar column name rather than a one-element list: with a list
# key, recent pandas releases expect get_group() to receive a 1-tuple and warn
# on the scalar lookups that has_label performs.
grouped_data = data.groupby('commentid')


def has_label(comment, column, label):
    """Return whether any annotation row of a comment mentions ``label``.

    Parameters
    ----------
    comment
        Value of ``commentid`` identifying the group of rows to inspect.
    column
        Name of the annotation column to search.
    label
        Text to look for; matched as a case-sensitive substring.

    Returns
    -------
    Truthy value (result of a ``> 0`` comparison) when at least one row of
    the group contains ``label`` in ``column``, falsy otherwise.

    Raises
    ------
    KeyError
        Propagated from ``get_group`` when ``comment`` is not a group key.
    """
    annotations = grouped_data.get_group(comment)[column]
    # str() lets non-string cells (e.g. missing values) be searched without raising.
    return np.sum(annotations.apply(lambda x: label in str(x))) > 0

In [118]:
# Comment-level binary targets. Each target column is True for a row when at
# least one annotation row sharing that row's commentid contains the label
# text in the listed annotation column (see has_label).
# Maps target column -> (annotation column, label text).
label_specs = {
    'y_persuasive': ('persuasiveness', 'Persuasive'),
    'y_audience': ('intendedaudience', 'Reply to a specific commenter'),
    'y_agreement_with_commenter': ('commentagreement', 'Agreement with commenter'),
    'y_informative': ('tone', 'Informative'),
    'y_mean': ('tone', 'Mean'),
    'y_controversial': ('tone', 'Controversial'),
    'y_disagreement_with_commenter': ('commentagreement', 'Disagreement with commenter'),
    'y_off_topic_with_article': ('topic', 'Off-topic with article'),
    'y_sentiment_neutral': ('sentiment', 'neutral'),
    'y_sentiment_positive': ('sentiment', 'positive'),
    'y_sentiment_negative': ('sentiment', 'negative'),
    'y_sentiment_mixed': ('sentiment', 'mixed'),
}

for target_column, (source_column, label) in label_specs.items():
    # axis=1 belongs to DataFrame.apply (row-wise), not to has_label.
    data[target_column] = data.apply(
        lambda x: has_label(x['commentid'], source_column, label),
        axis=1,
    )

In [None]:
# data['y_persuasive'] = data['persuasiveness'].str.contains('Persuasive')
# data['y_audience'] = data['intendedaudience'].str.contains('Reply to a specific commenter')
# data['y_agreement_with_commenter'] = data['commentagreement'].str.contains('Agreement with commenter')
# data['y_informative'] = data['tone'].str.contains('Informative')
# data['y_mean'] = data['tone'].str.contains('Mean')
# data['y_controversial'] = data['tone'].str.contains('Controversial')
# data['y_disagreement_with_commenter'] = data['commentagreement'].str.contains('Disagreement with commenter')
# data['y_off_topic_with_article'] = data['topic'].str.contains('Off-topic with article')
# data['y_sentiment_neutral'] = data['sentiment'].str.contains('neutral')
# data['y_sentiment_positive'] = data['sentiment'].str.contains('positive')
# data['y_sentiment_negative'] = data['sentiment'].str.contains('negative')
# data['y_sentiment_mixed'] = data['sentiment'].str.contains('mixed')

In [39]:
# Keep the first row for every distinct combination of the twelve target labels.
tiny = data.drop_duplicates(
    subset=[
        'y_persuasive',
        'y_audience',
        'y_agreement_with_commenter',
        'y_informative',
        'y_mean',
        'y_controversial',
        'y_disagreement_with_commenter',
        'y_off_topic_with_article',
        'y_sentiment_neutral',
        'y_sentiment_positive',
        'y_sentiment_negative',
        'y_sentiment_mixed',
    ]
)

In [72]:
def valid_dataset(df):
    """Check that every target column has at least one positive row in ``df``.

    Columns are checked in a fixed order and the check stops at the first
    column whose sum is below 1. The value returned is the result of the
    last ``>= 1`` comparison that was evaluated.
    """
    required_columns = (
        'y_persuasive',
        'y_audience',
        'y_agreement_with_commenter',
        'y_informative',
        'y_mean',
        'y_controversial',
        'y_disagreement_with_commenter',
        'y_off_topic_with_article',
        'y_sentiment_neutral',
        'y_sentiment_positive',
        'y_sentiment_negative',
        'y_sentiment_mixed',
    )
    for column in required_columns:
        has_positive = np.sum(df[column]) >= 1
        if not has_positive:
            # First column without a positive example decides the outcome.
            return has_positive
    return has_positive

In [73]:
# Draw 10-row samples from `tiny` until one covers every target label.
# A seeded generator makes the chosen sample reproducible across kernel
# restarts; each draw advances the generator, so successive attempts differ.
sample_rng = np.random.RandomState(42)
max_sample_attempts = 10000  # bound the search instead of looping forever

for _ in range(max_sample_attempts):
    tmp = tiny.sample(10, random_state=sample_rng)
    if valid_dataset(tmp):
        break
else:
    raise RuntimeError(
        'No 10-row sample covering every label was found in '
        '%d attempts' % max_sample_attempts
    )

In [77]:
# Replace missing values with False, then write the same sample as both the
# train and the test split of the Tiny dataset.
tmp = tmp.fillna(False)
for tiny_path in (
    '../data/datasets/Tiny/train/comments.csv',
    '../data/datasets/Tiny/test/comments.csv',
):
    tmp.to_csv(tiny_path, index=False)

In [201]:
# One row per comment: fill missing values with False and keep the first row
# for each commentid. This definition must be live, otherwise `save_data` is
# undefined on a fresh kernel and every cell below fails with NameError.
save_data = data.fillna(False).drop_duplicates(['commentid'])
# The same frame is written as both the train and the test split.
save_data.to_csv('../data/datasets/YNACC/train/comments.csv', index=False)
save_data.to_csv('../data/datasets/YNACC/test/comments.csv', index=False)

In [174]:
# save user meta data
def _user_stats(group):
    """Summarise the comments of a single user (one ``guid`` group).

    Returns a dict with the comment count, the number of distinct
    ``parentid`` values (compared as strings), the number of rows whose
    ``parentid`` equals False, the summed thumbs-up / thumbs-down, and the
    timestamp span divided by the number of distinct ``parentid`` values.
    """
    threads_participated_in = len(np.unique(group['parentid'].apply(str)))
    return {
        'comment_count': len(group),
        'threads_participated_in': threads_participated_in,
        # Element-wise comparison against False; presumably False is the
        # fillna(False) placeholder for a missing parent - TODO confirm.
        'threads_initiated': np.sum(group['parentid'] == False),
        'tu_received': np.sum(group['thumbs-up']),
        'td_received': np.sum(group['thumbs-down']),
        'commenting_rate': (np.max(group['timestamp']) - np.min(group['timestamp'])) / threads_participated_in,
    }


# Statistics from the annotated comments.
users_grouped = save_data.groupby('guid')
user_data = {name: _user_stats(group) for name, group in users_grouped}

# Fold in the unlabeled conversations, but only for users already present in
# the annotated data; users seen only in the unlabeled file are not added.
long_data = pd.read_csv('../data/raw/ydata-ynacc-v1_0_unlabeled_conversations.csv', sep=';')
save_long_data = long_data.fillna(False).drop_duplicates(['commentid'])
long_users_grouped = save_long_data.groupby('guid')
for name, group in long_users_grouped:
    if name not in user_data:
        continue
    # Local is deliberately not called `data`, which would rebind the
    # annotations DataFrame of that name used by the earlier cells.
    merged_stats = user_data[name]
    extra_stats = _user_stats(group)
    for key in ('comment_count', 'threads_participated_in', 'threads_initiated', 'tu_received', 'td_received'):
        merged_stats[key] = merged_stats[key] + extra_stats[key]
    # The two per-source rates are averaged rather than recomputed.
    merged_stats['commenting_rate'] = (merged_stats['commenting_rate'] + extra_stats['commenting_rate']) / 2

users = pd.DataFrame.from_dict(user_data, orient='index').reset_index()
users.to_csv('../data/datasets/YNACC/users.csv', index=False)

### YNACC Train/Dev/Test Sets

In [202]:
def _read_ids(path):
    """Read a text file with one integer id per line into an int64 array.

    The file is opened in a ``with`` block so the handle is closed as soon
    as the contents have been read.
    """
    with open(path, 'r') as id_file:
        return np.array(id_file.read().splitlines()).astype('int64')


train_ids = _read_ids('../data/raw/ydata-ynacc-v1_0_train-ids.txt')
dev_ids = _read_ids('../data/raw/ydata-ynacc-v1_0_dev-ids.txt')
test_ids = _read_ids('../data/raw/ydata-ynacc-v1_0_test-ids.txt')

In [203]:
# Write each split to the directory of the same name, selecting rows whose
# `sdid` is in that split's id list. The dev ids go to dev/ and the test ids
# to test/ (they were previously crossed over).
save_data[save_data['sdid'].isin(train_ids)].to_csv('../data/datasets/YNACC-Evaluation/train/comments.csv', index=False)
save_data[save_data['sdid'].isin(dev_ids)].to_csv('../data/datasets/YNACC-Evaluation/dev/comments.csv', index=False)
save_data[save_data['sdid'].isin(test_ids)].to_csv('../data/datasets/YNACC-Evaluation/test/comments.csv', index=False)

In [196]:
# Preview only the first rows instead of rendering the whole frame.
save_data.head(10)

Unnamed: 0,sdid,commentindex,headline,url,guid,commentid,timestamp,thumbs-up,thumbs-down,text,...,y_informative,y_mean,y_controversial,y_disagreement_with_commenter,y_off_topic_with_article,y_sentiment,y_sentiment_neutral,y_sentiment_positive,y_sentiment_negative,y_sentiment_mixed
0,53971,2,Disneyland Worker Found Dead in Haunted Mansion,http://www.cosmopolitan.com/lifestyle/news/a56...,rjrPtwH5oVVuQnEXX3hf,00003n000000000000000000000000-ed2ae6d0-32ac-4...,1459917444,0.0,0.0,"These things happen , Every job has its dangers.",...,True,False,True,True,True,False,True,False,True,False
1,53971,0,Disneyland Worker Found Dead in Haunted Mansion,http://www.cosmopolitan.com/lifestyle/news/a56...,VaW6HEsuOFUAIBqjw1k~,1459879464596-a3771c05-fd2e-4f44-a26a-23baec3b...,1459879464,1.0,0.0,Sad to hear such a bad thing. Very dangerous j...,...,False,False,False,False,True,False,True,True,False,True
2,53971,1,Disneyland Worker Found Dead in Haunted Mansion,http://www.cosmopolitan.com/lifestyle/news/a56...,uwQePj970KaMZuW3~9Q9,00002n000000000000000000000000-1c30b878-b717-4...,1459881644,0.0,0.0,Yes..because too many houses in EU look like t...,...,True,False,False,False,True,False,True,False,False,False
3,135929,0,This Old Navy Ad Featuring an Interracial Fami...,http://mic.com/articles/142323/this-old-navy-a...,fixyWJivQjEQtPLLVXsu,1462203719963-3eeffb02-faae-4b51-9174-704c57e6...,1462203719,18.0,3.0,"I am frankly quite SICK of the phrase ""shoved ...",...,False,True,True,False,True,False,False,False,True,False
4,135929,1,This Old Navy Ad Featuring an Interracial Fami...,http://mic.com/articles/142323/this-old-navy-a...,_TDnK715vO5y0OzZz_n4,00002I000000000000000000000000-7ef2ac58-bd84-4...,1462204643,7.0,2.0,"Ya, I always wonder why the conservatives are ...",...,False,False,True,False,True,False,True,False,True,False
5,135929,2,This Old Navy Ad Featuring an Interracial Fami...,http://mic.com/articles/142323/this-old-navy-a...,JzxshZNmXmdh_2kJN2E9,00003b000000000000000000000000-4a5ccbc3-4a6f-4...,1462239616,1.0,0.0,Great comment!,...,False,False,False,False,True,False,True,True,False,False
6,127970,0,"Patrick Brown, Jailed Because He Could Not Aff...",http://mic.com/articles/141141/patrick-brown-j...,mL_LzSXwMfJ21PfZ~yYv,1461069703997-4fe466e4-91e3-4095-a64b-5d89e8a3...,1461069703,10.0,5.0,The result is a system where being poor become...,...,True,False,True,False,False,False,True,False,True,True
7,127970,2,"Patrick Brown, Jailed Because He Could Not Aff...",http://mic.com/articles/141141/patrick-brown-j...,lc5tljQ08YooL7Ep3wT3,00003n000000000000000000000000-f1c6653c-a877-4...,1461070200,0.0,0.0,They are also places where you are supposed no...,...,True,False,True,True,False,False,True,False,True,False
8,127970,1,"Patrick Brown, Jailed Because He Could Not Aff...",http://mic.com/articles/141141/patrick-brown-j...,fuqaPUBVOAnc0o1gGEIg,00002b000000000000000000000000-498bb84f-8c31-4...,1461069989,1.0,1.0,"Stop trying to make sense, it only confuses pe...",...,False,True,False,True,False,False,False,False,True,True
9,127970,3,"Patrick Brown, Jailed Because He Could Not Aff...",http://mic.com/articles/141141/patrick-brown-j...,mL_LzSXwMfJ21PfZ~yYv,00004g000000000000000000000000-1524cf7b-652d-4...,1461079572,0.0,1.0,Madd...sorry - I can't help it. Grim....are yo...,...,False,False,True,True,False,False,True,False,True,True


In [204]:
# Preview only the first rows instead of rendering the whole frame.
users.head(10)

Unnamed: 0,index,comment_count,threads_participated_in,threads_initiated,tu_received,td_received,commenting_rate
0,00eJTUk26BxxhxM1z2y4,2,2,2,54.0,8.0,0.000000e+00
1,00gI9_LBRMb~PGbyNm2r,4,4,1,70.0,11.0,2.890667e+03
2,011_p8EgH1pV~ynXEU~L,6,4,0,11.0,10.0,9.289883e+04
3,01kD2G~JU2pXQVTO_dOU,4,4,0,12.0,8.0,1.493997e+05
4,01q_W5nysBRI24y_C8md,5,5,0,12.0,14.0,1.407762e+05
5,027XL~~IUnCeAGiw0d13,2,2,0,14.0,10.0,0.000000e+00
6,02erf5_p92sdr1NeGE86,5,4,0,9.0,22.0,2.149770e+05
7,03EfXRIBm0alJXFtMVaS,1,1,0,11.0,6.0,0.000000e+00
8,03FsIZ~h_vk0fimv5iXU,4,4,0,6.0,6.0,6.513333e+02
9,03v2vIuXzeHHK4T5PTDf,1,1,0,1.0,0.0,0.000000e+00
