In [1]:
import numpy as np
import pandas as pd

In [2]:
### Load the training split (tab-separated values)
# The path is relative to the working directory; point it at the repo's copy
# once the corpus files are added there.
train_tsv = pd.read_csv(
    'scicite/tsv/train.tsv',
    sep='\t',
    # The file is read as headerless: passing `names` already implies this,
    # it is only spelled out here so the first line is visibly treated as data.
    header=None,
    names=["citingPaperID", "source", "string", "label"],
)
train_tsv.head()

Unnamed: 0,citingPaperID,source,string,label
0,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,explicit,"However, how frataxin interacts with the Fe-S ...",background
1,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,explicit,"In the study by Hickey et al. (2012), spikes w...",background
2,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,explicit,"The drug also reduces catecholamine secretion,...",background
3,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,explicit,By clustering with lowly aggressive close kin ...,background
4,88b86556857f4374842d2af2e359576806239175>a5bb0...,explicit,Ophthalmic symptoms are rare manifestations of...,background


In [3]:
### Draw non-overlapping random subsets of the data for sentiment annotation
# Sampling pool: a separate copy of the training frame, so train_tsv stays intact.
train_sentiment = train_tsv.copy()

# Chris' subset: 50 citation strings, drawn with a fixed seed
c_sample = train_sentiment['string'].sample(n=50, random_state=1)
c_index = c_sample.index.tolist()

# Remove Chris' rows from the pool so the next draw cannot select them again
train_sentiment = train_sentiment.drop(index=c_index)

# Daniel's subset: 50 citation strings from what is left in the pool
d_sample = train_sentiment['string'].sample(n=50, random_state=1)
d_index = d_sample.index.tolist()

# Remove Daniel's rows as well before drawing the shared subset
train_sentiment = train_sentiment.drop(index=d_index)

# Mutual subset (for IAA score): 25 strings annotated by both people.
# These rows are not removed from the pool afterwards.
m_sample = train_sentiment['string'].sample(n=25, random_state=1)
m_index = m_sample.index.tolist()

In [4]:
# Chris' subset in DataFrame form (for adding a sentiment column/feature)
# c_index holds row *labels* (it was taken from a `.index`), so the rows are
# selected with label-based `.loc` rather than positional `.iloc`; the two only
# agree while train_tsv keeps its default 0..n-1 index.
# Only the selected rows are copied, which also keeps the column assignment
# below from acting on a view of train_tsv.
c_df = train_tsv.loc[c_index].copy()
# Empty placeholder for the annotator to fill in. A scalar is broadcast to
# every row, so this keeps working if the sample size is changed.
c_df['sentiment'] = ''
c_df.head()

Unnamed: 0,citingPaperID,source,string,label,sentiment
2305,f6e7d88342d23cdc6dc959b16106b6bb05fa807a>0fee7...,explicit,"Utilizing gain-offunction assays, we demonstra...",method,
3210,51439b498621806aa3a915fdbd4aa983473da397>6eb72...,explicit,"maticity, refractoriness and conduction of the...",background,
4484,64cf98067d1cadda174e5ffa24fc45a6b3e6426f>2d04a...,explicit,"For reovirus 1/L-induced ARDS, at day 9 postin...",background,
5821,9b6d433ab5104713ee41fbeb24b4d0f2b82fe587>95158...,explicit,"Besides that, no ameliorate impacts have been ...",background,
3514,15ac5bcd9ca900dc948e969558670084e3d560e0>5297c...,explicit,"However, limited information is available abou...",background,


In [None]:
# Export Chris' subset for manual annotation; the row labels are written out
# as the first (unnamed) CSV column.
c_df.to_csv('annotation_C.csv', index=True)

In [7]:
# Read Chris' annotated file back in.
# The first CSV column has no header, so pandas names it 'Unnamed: 0'; judging
# by the displayed rows it carries the original train_tsv row labels.
c_df_annotated = pd.read_csv('annotated_C.csv', sep=',')
c_df_annotated.head(n=5)

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment
0,2305,f6e7d88342d23cdc6dc959b16106b6bb05fa807a>0fee7...,explicit,"Utilizing gain-offunction assays, we demonstra...",method,positive
1,3210,51439b498621806aa3a915fdbd4aa983473da397>6eb72...,explicit,"maticity, refractoriness and conduction of the...",background,neutral
2,4484,64cf98067d1cadda174e5ffa24fc45a6b3e6426f>2d04a...,explicit,"For reovirus 1/L-induced ARDS, at day 9 postin...",background,neutral
3,5821,9b6d433ab5104713ee41fbeb24b4d0f2b82fe587>95158...,explicit,"Besides that, no ameliorate impacts have been ...",background,negative
4,3514,15ac5bcd9ca900dc948e969558670084e3d560e0>5297c...,explicit,"However, limited information is available abou...",background,negative


In [5]:
# Daniel's subset in DataFrame form (for adding a sentiment column/feature)
# d_index holds row *labels* (it was taken from a `.index`), so the rows are
# selected with label-based `.loc` rather than positional `.iloc`; the two only
# agree while train_tsv keeps its default 0..n-1 index.
# Only the selected rows are copied, which also keeps the column assignment
# below from acting on a view of train_tsv.
d_df = train_tsv.loc[d_index].copy()
# Empty placeholder for the annotator to fill in. A scalar is broadcast to
# every row, so this keeps working if the sample size is changed.
d_df['sentiment'] = ''
d_df.head()

Unnamed: 0,citingPaperID,source,string,label,sentiment
3923,6828a9cc19290ccd8cc99ccc678a042b00d92125>26cad...,explicit,The relationship between DNA content and absor...,background,
4306,5cecd1d9932d3f269b1caa45d07e84e3376c3710>9a0f5...,explicit,With respect to the contradicting results of o...,result,
4960,1890ccd4d2a4d542ba24081b593ab8060e38f101>12732...,explicit,We elected to take this course because formal ...,method,
6921,e2e9ce75aab15707ff3b85378b61f161c0b7886a>39688...,explicit,Nuclear localization of PTOV1 is required for ...,background,
3526,afe94f58975df56a9c0450c8153c2dd43d0b597b>ec694...,explicit,The remaining medial portion of the bone was p...,method,


In [6]:
# Export Daniel's subset for manual annotation; the row labels are written out
# as the first (unnamed) CSV column.
d_df.to_csv('annotation_D.csv', index=True)

In [8]:
# Read Daniel's annotated file back in.
# The first CSV column has no header, so pandas names it 'Unnamed: 0'; judging
# by the displayed rows it carries the original train_tsv row labels.
d_df_annotated = pd.read_csv('annotated_D.csv', sep=',')
d_df_annotated.head(n=5)

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment
0,3923,6828a9cc19290ccd8cc99ccc678a042b00d92125>26cad...,explicit,The relationship between DNA content and absor...,background,positive
1,4306,5cecd1d9932d3f269b1caa45d07e84e3376c3710>9a0f5...,explicit,With respect to the contradicting results of o...,result,negative
2,4960,1890ccd4d2a4d542ba24081b593ab8060e38f101>12732...,explicit,We elected to take this course because formal ...,method,neutral
3,6921,e2e9ce75aab15707ff3b85378b61f161c0b7886a>39688...,explicit,Nuclear localization of PTOV1 is required for ...,background,neutral
4,3526,afe94f58975df56a9c0450c8153c2dd43d0b597b>ec694...,explicit,The remaining medial portion of the bone was p...,method,neutral


In [9]:
# Mutual subset in DataFrame form (for adding IAA sentiment annotations)
# m_index holds row *labels* (it was taken from a `.index`), so the rows are
# selected with label-based `.loc` rather than positional `.iloc`; the two only
# agree while train_tsv keeps its default 0..n-1 index.
# Only the selected rows are copied, which also keeps the column assignments
# below from acting on a view of train_tsv.
m_df = train_tsv.loc[m_index].copy()
# One empty placeholder column per annotator. A scalar is broadcast to every
# row, so this keeps working if the sample size is changed.
m_df['sentiment_chris'] = ''
m_df['sentiment_daniel'] = ''
# NOTE(review): no cell visible in this section exports m_df to CSV the way the
# C and D subsets are exported -- confirm how 'annotated_m.csv' was produced.
m_df.head()

Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel
7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,,
3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,,
1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,,
7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,,
6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,,


In [9]:
# Read the doubly-annotated mutual subset back in (one sentiment column per annotator).
# The first CSV column has no header, so pandas names it 'Unnamed: 0'; judging
# by the displayed rows it carries the original train_tsv row labels.
m_df_annotated = pd.read_csv('annotated_m.csv', sep=',')
m_df_annotated.head(n=5)

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel
0,7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,neutral,neutral
1,3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,neutral,neutral
2,1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,neutral,neutral
3,7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,neutral,neutral
4,6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,neutral,neutral
