In [1]:
import numpy as np
import pandas as pd

In [2]:
### Load the SciCite training split (tab-separated) into a DataFrame
# NOTE: the path below is a placeholder; point it at the repo's corpus location
# once the corpus files are added.
# Column names are supplied explicitly through `names`.
train_tsv = pd.read_csv(
    'scicite/tsv/train.tsv',
    sep='\t',
    names=["citingPaperID", "source", "string", "label"],
)
train_tsv.head()

Unnamed: 0,citingPaperID,source,string,label
0,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,explicit,"However, how frataxin interacts with the Fe-S ...",background
1,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,explicit,"In the study by Hickey et al. (2012), spikes w...",background
2,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,explicit,"The drug also reduces catecholamine secretion,...",background
3,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,explicit,By clustering with lowly aggressive close kin ...,background
4,88b86556857f4374842d2af2e359576806239175>a5bb0...,explicit,Ophthalmic symptoms are rare manifestations of...,background


# First set of sentiment annotations (Week of Jun. 27, 2022)

In [38]:
### Draw the first round of annotation samples (row labels only)
# Work on a copy so `train_tsv` itself is never modified. Each draw's rows are
# dropped from `train_sentiment` before the next draw, so the three subsets
# cannot overlap even though every draw uses the same seed.
train_sentiment = train_tsv.copy()

# Chris' subset: labels of 50 sampled rows, then remove them from the pool
c_sample = train_sentiment['string'].sample(n=50, random_state=1).index.tolist()
train_sentiment = train_sentiment.drop(index=c_sample)

# Daniel's subset: labels of 50 sampled rows, then remove them from the pool
d_sample = train_sentiment['string'].sample(n=50, random_state=1).index.tolist()
train_sentiment = train_sentiment.drop(index=d_sample)

# Mutual subset (for IAA score): labels of 25 sampled rows, then remove them
m_sample = train_sentiment['string'].sample(n=25, random_state=1).index.tolist()
train_sentiment = train_sentiment.drop(index=m_sample)

In [41]:
# Chris' subset in DataFrame form (for adding a sentiment column/feature)
# `c_sample` holds index *labels* (it was built from `.index`), so select with
# `.loc`; `.iloc` is positional and only agrees with the labels while the index
# happens to be 0..n-1.
# `.copy()` after selecting gives an independent frame, so adding a column
# below never writes into a selection of `train_tsv`.
c_df = train_tsv.loc[c_sample].copy()
# Scalar broadcast: one empty annotation cell per row, whatever the sample size.
c_df['sentiment'] = ''
# Blank sheet handed out for manual annotation (the index is written as well).
c_df.to_csv('sentiment_annotations_CSV/annotations_blank_C.csv')
c_df.head()

Unnamed: 0,citingPaperID,source,string,label,sentiment
2305,f6e7d88342d23cdc6dc959b16106b6bb05fa807a>0fee7...,explicit,"Utilizing gain-offunction assays, we demonstra...",method,
3210,51439b498621806aa3a915fdbd4aa983473da397>6eb72...,explicit,"maticity, refractoriness and conduction of the...",background,
4484,64cf98067d1cadda174e5ffa24fc45a6b3e6426f>2d04a...,explicit,"For reovirus 1/L-induced ARDS, at day 9 postin...",background,
5821,9b6d433ab5104713ee41fbeb24b4d0f2b82fe587>95158...,explicit,"Besides that, no ameliorate impacts have been ...",background,
3514,15ac5bcd9ca900dc948e969558670084e3d560e0>5297c...,explicit,"However, limited information is available abou...",background,


In [5]:
# Reading in Chris' subset after annotation (from 'annotated_C.csv').
# The rendered output shows the previously saved index coming back as
# 'Unnamed: 0' / 'Unnamed: 0.1' columns rather than as the index.
c_df_annotated = pd.read_csv('annotated_C.csv')
c_df_annotated.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment
0,2305,f6e7d88342d23cdc6dc959b16106b6bb05fa807a>0fee7...,explicit,"Utilizing gain-offunction assays, we demonstra...",method,positive
1,3210,51439b498621806aa3a915fdbd4aa983473da397>6eb72...,explicit,"maticity, refractoriness and conduction of the...",background,neutral
2,4484,64cf98067d1cadda174e5ffa24fc45a6b3e6426f>2d04a...,explicit,"For reovirus 1/L-induced ARDS, at day 9 postin...",background,neutral
3,5821,9b6d433ab5104713ee41fbeb24b4d0f2b82fe587>95158...,explicit,"Besides that, no ameliorate impacts have been ...",background,negative
4,3514,15ac5bcd9ca900dc948e969558670084e3d560e0>5297c...,explicit,"However, limited information is available abou...",background,negative


In [6]:
# Daniel's subset in DataFrame form (for adding a sentiment column/feature)
# The sampling cell stores Daniel's row labels in `d_sample`; `d_index` is not
# defined by any cell above, so referencing it fails on a fresh kernel.
# The labels come from `.index`, hence label-based `.loc` rather than `.iloc`.
d_df = train_tsv.loc[d_sample].copy()
# Scalar broadcast: one empty annotation cell per row, whatever the sample size.
d_df['sentiment'] = ''
d_df.head()

Unnamed: 0,citingPaperID,source,string,label,sentiment
3923,6828a9cc19290ccd8cc99ccc678a042b00d92125>26cad...,explicit,The relationship between DNA content and absor...,background,
4306,5cecd1d9932d3f269b1caa45d07e84e3376c3710>9a0f5...,explicit,With respect to the contradicting results of o...,result,
4960,1890ccd4d2a4d542ba24081b593ab8060e38f101>12732...,explicit,We elected to take this course because formal ...,method,
6921,e2e9ce75aab15707ff3b85378b61f161c0b7886a>39688...,explicit,Nuclear localization of PTOV1 is required for ...,background,
3526,afe94f58975df56a9c0450c8153c2dd43d0b597b>ec694...,explicit,The remaining medial portion of the bone was p...,method,


In [7]:
# Reading in Daniel's subset after annotation (from 'annotated_D.csv')
d_df_annotated = pd.read_csv('annotated_D.csv')
d_df_annotated.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment
0,3923,6828a9cc19290ccd8cc99ccc678a042b00d92125>26cad...,explicit,The relationship between DNA content and absor...,background,positive
1,4306,5cecd1d9932d3f269b1caa45d07e84e3376c3710>9a0f5...,explicit,With respect to the contradicting results of o...,result,negative
2,4960,1890ccd4d2a4d542ba24081b593ab8060e38f101>12732...,explicit,We elected to take this course because formal ...,method,neutral
3,6921,e2e9ce75aab15707ff3b85378b61f161c0b7886a>39688...,explicit,Nuclear localization of PTOV1 is required for ...,background,neutral
4,3526,afe94f58975df56a9c0450c8153c2dd43d0b597b>ec694...,explicit,The remaining medial portion of the bone was p...,method,neutral


In [8]:
# Mutual subset in DataFrame form (for adding IAA sentiment annotations)
# Use the first-round labels held in `m_sample`. `m_index` is only assigned
# later, by the second-round sampling cell, so using it here would fail on a
# fresh kernel or pick up the second-round rows after out-of-order execution.
# The labels come from `.index`, hence label-based `.loc` rather than `.iloc`.
m_df = train_tsv.loc[m_sample].copy()
# One empty annotation column per annotator; the scalar broadcasts to every row.
m_df['sentiment_chris'] = ''
m_df['sentiment_daniel'] = ''
m_df.head()

Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel
7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,,
3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,,
1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,,
7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,,
6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,,


In [9]:
# Reading in the mutual subset after annotation (from 'annotated_m.csv');
# the rendered output shows one sentiment column per annotator.
m_df_annotated = pd.read_csv('annotated_m.csv')
m_df_annotated.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel
0,7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,neutral,neutral
1,3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,neutral,neutral
2,1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,neutral,neutral
3,7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,neutral,neutral
4,6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,neutral,neutral


# Second set of sentiment annotations (Week of Jul. 4, 2022)

In [33]:
### Second set of random samples for sentiment annotations
# Starts from `train_sentiment`, i.e. the pool left over once the first round's
# rows were dropped, so no first-round row can be drawn again.
# NOTE(review): `c_sample`, `d_sample` and `m_sample` are rebound here from
# lists of row labels to Series of citation strings; re-running an earlier cell
# after this one will see the new values.
train_sentiment_2 = train_sentiment.copy()

# Chris' subset: 100 sampled strings, their labels, then remove them from the pool
c_sample = train_sentiment_2['string'].sample(n=100, random_state=1)
c_index_2 = c_sample.index.tolist()
train_sentiment_2 = train_sentiment_2.drop(index=c_index_2)

# Daniel's subset: 200 sampled strings, their labels, then remove them from the pool
d_sample = train_sentiment_2['string'].sample(n=200, random_state=1)
d_index_2 = d_sample.index.tolist()
train_sentiment_2 = train_sentiment_2.drop(index=d_index_2)

# Mutual subset (for IAA score): 25 sampled strings, their labels, then remove them
m_sample = train_sentiment_2['string'].sample(n=25, random_state=1)
m_index = m_sample.index.tolist()
train_sentiment_2 = train_sentiment_2.drop(index=m_index)

# Applying sentiment classifiers

In [3]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [4]:
# Fetch the 'vader_lexicon' NLTK resource (presumably what the VADER
# SentimentIntensityAnalyzer imported above reads its word scores from).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/lolai/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [7]:
# Single VADER analyzer instance reused by the scoring cells below
sid = SentimentIntensityAnalyzer()

In [23]:
# Create three columns filled with 0, in the order pos, neg, neu.
# NOTE(review): no visible cell writes real values into these columns
# afterwards — confirm they are still needed.
m_df_annotated['pos'] = 0
m_df_annotated['neg'] = 0
m_df_annotated['neu'] = 0

In [30]:
# Pull out the citation text stored under label 0 for ad-hoc inspection.
# NOTE(review): `a` is not used by any later visible cell — looks like scratch work.
a=m_df_annotated['string'][0]

In [8]:
# Try the analyzer on a throwaway phrase.
# NOTE(review): `output` is not used by any later visible cell — looks like scratch work.
output = sid.polarity_scores("Hello there")

In [9]:
# Score every citation string with VADER; each cell of the new column holds the
# dict returned by `sid.polarity_scores` for that row's text.
# NOTE(review): the column name 'sc ores' contains a space and looks like a typo
# for 'scores'; it is kept unchanged in case other code reads it under this name.
m_df_annotated['sc ores'] = m_df_annotated['string'].apply(sid.polarity_scores)

In [10]:
# Display the frame to inspect the VADER score column next to the human labels
m_df_annotated

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel,sc ores
0,7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,neutral,neutral,"{'neg': 0.176, 'neu': 0.621, 'pos': 0.203, 'co..."
1,3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,neutral,neutral,"{'neg': 0.0, 'neu': 0.938, 'pos': 0.062, 'comp..."
2,1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,neutral,neutral,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,neutral,neutral,"{'neg': 0.0, 'neu': 0.931, 'pos': 0.069, 'comp..."
4,6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,neutral,neutral,"{'neg': 0.0, 'neu': 0.935, 'pos': 0.065, 'comp..."
5,2543,2c0cab0d657f6c225c74736fed02750c381032de>02cdb...,explicit,These results are similar to other reports [34...,result,positive,positive,"{'neg': 0.059, 'neu': 0.88, 'pos': 0.062, 'com..."
6,8093,7dda92d7dbb1a5f7fc78bba6f81890b3832cb2d6>e6f6f...,explicit,A type I T-cell gene signature that promotes b...,background,positive,positive,"{'neg': 0.131, 'neu': 0.73, 'pos': 0.139, 'com..."
7,5365,9237db18530d0de2e0281e735c10570d7c235b36>14682...,explicit,"Resistance to TB involves macrophages [2,4], d...",background,positive,positive,"{'neg': 0.0, 'neu': 0.948, 'pos': 0.052, 'comp..."
8,4663,1ac9706400e1c06a100add4b83a67af503404d8d>1fc59...,explicit,While the prevalence of hyperechogenicity is o...,background,positive,neutral,"{'neg': 0.0, 'neu': 0.87, 'pos': 0.13, 'compou..."
9,3065,4f4c6a035c0c28cfe5855a720a8f7a16a401e9f8>0451f...,explicit,There have been few animal studies on this sub...,background,negative,negative,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [17]:
# TextBlob and its Naive Bayes sentiment analyzer, used by the cells below
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
# NLTK resources fetched for the TextBlob cells — presumably the training corpus
# for NaiveBayesAnalyzer and the tokenizer models; TODO confirm both are required.
nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/lolai/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /home/lolai/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [23]:
# Score each citation string with TextBlob's Naive Bayes analyzer and store the
# positive-class probability (`p_pos`, field 1 of the sentiment tuple) in 'blob'.
# One analyzer instance is shared by every row: constructing a fresh
# NaiveBayesAnalyzer() per row makes each one train its own classifier before
# it can score anything, which repeats the same work for every row.
nb_analyzer = NaiveBayesAnalyzer()
for index, row in m_df_annotated.iterrows():
    blob = TextBlob(row['string'], analyzer=nb_analyzer)
    m_df_annotated.at[index, 'blob'] = blob.sentiment.p_pos

In [27]:
# Fill the 'blob' column with TextBlob's default polarity score for each string.
# NOTE(review): this replaces whatever the Naive Bayes cell above wrote into
# 'blob' — confirm the overwrite is intended, or keep the two scores apart.
m_df_annotated['blob'] = m_df_annotated['string'].map(lambda text: TextBlob(text).polarity)

In [28]:
# Display the frame to compare the 'blob' polarity values with the human labels
m_df_annotated

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel,sc ores,blob
0,7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,neutral,neutral,"{'neg': 0.176, 'neu': 0.621, 'pos': 0.203, 'co...",-0.036364
1,3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,neutral,neutral,"{'neg': 0.0, 'neu': 0.938, 'pos': 0.062, 'comp...",0.1
2,1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,neutral,neutral,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
3,7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,neutral,neutral,"{'neg': 0.0, 'neu': 0.931, 'pos': 0.069, 'comp...",-0.051852
4,6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,neutral,neutral,"{'neg': 0.0, 'neu': 0.935, 'pos': 0.065, 'comp...",-0.144345
5,2543,2c0cab0d657f6c225c74736fed02750c381032de>02cdb...,explicit,These results are similar to other reports [34...,result,positive,positive,"{'neg': 0.059, 'neu': 0.88, 'pos': 0.062, 'com...",-0.0625
6,8093,7dda92d7dbb1a5f7fc78bba6f81890b3832cb2d6>e6f6f...,explicit,A type I T-cell gene signature that promotes b...,background,positive,positive,"{'neg': 0.131, 'neu': 0.73, 'pos': 0.139, 'com...",0.0
7,5365,9237db18530d0de2e0281e735c10570d7c235b36>14682...,explicit,"Resistance to TB involves macrophages [2,4], d...",background,positive,positive,"{'neg': 0.0, 'neu': 0.948, 'pos': 0.052, 'comp...",0.0
8,4663,1ac9706400e1c06a100add4b83a67af503404d8d>1fc59...,explicit,While the prevalence of hyperechogenicity is o...,background,positive,neutral,"{'neg': 0.0, 'neu': 0.87, 'pos': 0.13, 'compou...",0.34375
9,3065,4f4c6a035c0c28cfe5855a720a8f7a16a401e9f8>0451f...,explicit,There have been few animal studies on this sub...,background,negative,negative,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",-0.183333
