In [1]:
import numpy as np
import optuna as optuna
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [2]:
### Read in training dataset (TSV)
# Filepath will be different; set to repo's filepath when we add the corpus files
train_tsv = pd.read_csv('scicite/tsv/train.tsv', sep='\t', 
                       names=["citingPaperID", "source", "string", "label"]
                       )
train_tsv.head()

Unnamed: 0,citingPaperID,source,string,label
0,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,explicit,"However, how frataxin interacts with the Fe-S ...",background
1,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,explicit,"In the study by Hickey et al. (2012), spikes w...",background
2,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,explicit,"The drug also reduces catecholamine secretion,...",background
3,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,explicit,By clustering with lowly aggressive close kin ...,background
4,88b86556857f4374842d2af2e359576806239175>a5bb0...,explicit,Ophthalmic symptoms are rare manifestations of...,background


# First set of sentiment annotations (Week of Jun. 27, 2022)

In [3]:
### Getting samples of the data for sentiment annotating
train_sentiment = train_tsv.copy()

# Chris' subset
c_sample = list(train_sentiment['string'].sample(n=50, random_state=1).index)
# Drop
train_sentiment = train_sentiment.drop(c_sample)

# Daniel's subset
d_sample = list(train_sentiment['string'].sample(n=50, random_state=1).index)
# Drop
train_sentiment = train_sentiment.drop(d_sample)

# Mutual subset (for IAA score)
m_sample = list(train_sentiment['string'].sample(n=25, random_state=1).index)
train_sentiment = train_sentiment.drop(m_sample)

In [4]:
# Chris' subset in DataFrame form (for adding a sentiment column/feature)
c_df = train_tsv.copy()
c_df = c_df.iloc[c_sample]
c_df['sentiment'] = ['']*50
c_df.to_csv('sentimentAnnotations_CSV/blankAnnotations_C_1.csv')
c_df.head()

Unnamed: 0,citingPaperID,source,string,label,sentiment
2305,f6e7d88342d23cdc6dc959b16106b6bb05fa807a>0fee7...,explicit,"Utilizing gain-offunction assays, we demonstra...",method,
3210,51439b498621806aa3a915fdbd4aa983473da397>6eb72...,explicit,"maticity, refractoriness and conduction of the...",background,
4484,64cf98067d1cadda174e5ffa24fc45a6b3e6426f>2d04a...,explicit,"For reovirus 1/L-induced ARDS, at day 9 postin...",background,
5821,9b6d433ab5104713ee41fbeb24b4d0f2b82fe587>95158...,explicit,"Besides that, no ameliorate impacts have been ...",background,
3514,15ac5bcd9ca900dc948e969558670084e3d560e0>5297c...,explicit,"However, limited information is available abou...",background,


In [5]:
# Reading in Chris' subset (after annotating manually)
c_df_annotated = pd.read_csv('sentimentAnnotations_CSV/annotated_C_1.csv')
c_df_annotated.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment
0,2305,f6e7d88342d23cdc6dc959b16106b6bb05fa807a>0fee7...,explicit,"Utilizing gain-offunction assays, we demonstra...",method,positive
1,3210,51439b498621806aa3a915fdbd4aa983473da397>6eb72...,explicit,"maticity, refractoriness and conduction of the...",background,neutral
2,4484,64cf98067d1cadda174e5ffa24fc45a6b3e6426f>2d04a...,explicit,"For reovirus 1/L-induced ARDS, at day 9 postin...",background,neutral
3,5821,9b6d433ab5104713ee41fbeb24b4d0f2b82fe587>95158...,explicit,"Besides that, no ameliorate impacts have been ...",background,negative
4,3514,15ac5bcd9ca900dc948e969558670084e3d560e0>5297c...,explicit,"However, limited information is available abou...",background,negative


In [6]:
# Daniel's subset in DataFrame form (for adding a sentiment column/feature)
d_df = train_tsv.copy()
d_df = d_df.iloc[d_sample]
d_df['sentiment'] = ['']*50
d_df.to_csv('sentimentAnnotations_CSV/blankAnnotations_D_1.csv')
d_df.head()

Unnamed: 0,citingPaperID,source,string,label,sentiment
3923,6828a9cc19290ccd8cc99ccc678a042b00d92125>26cad...,explicit,The relationship between DNA content and absor...,background,
4306,5cecd1d9932d3f269b1caa45d07e84e3376c3710>9a0f5...,explicit,With respect to the contradicting results of o...,result,
4960,1890ccd4d2a4d542ba24081b593ab8060e38f101>12732...,explicit,We elected to take this course because formal ...,method,
6921,e2e9ce75aab15707ff3b85378b61f161c0b7886a>39688...,explicit,Nuclear localization of PTOV1 is required for ...,background,
3526,afe94f58975df56a9c0450c8153c2dd43d0b597b>ec694...,explicit,The remaining medial portion of the bone was p...,method,


In [7]:
# Reading in Daniel's subset (after annotating manually)
d_df_annotated = pd.read_csv('sentimentAnnotations_CSV/annotated_D_1.csv')
d_df_annotated.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment
0,3923,6828a9cc19290ccd8cc99ccc678a042b00d92125>26cad...,explicit,The relationship between DNA content and absor...,background,positive
1,4306,5cecd1d9932d3f269b1caa45d07e84e3376c3710>9a0f5...,explicit,With respect to the contradicting results of o...,result,negative
2,4960,1890ccd4d2a4d542ba24081b593ab8060e38f101>12732...,explicit,We elected to take this course because formal ...,method,neutral
3,6921,e2e9ce75aab15707ff3b85378b61f161c0b7886a>39688...,explicit,Nuclear localization of PTOV1 is required for ...,background,neutral
4,3526,afe94f58975df56a9c0450c8153c2dd43d0b597b>ec694...,explicit,The remaining medial portion of the bone was p...,method,neutral


In [8]:
# Mutual subset in DataFrame form (for adding IAA sentiment annotations)
m_df = train_tsv.copy()
m_df = m_df.iloc[m_sample]
m_df['sentiment_chris'] = ['']*25
m_df['sentiment_daniel'] = ['']*25
m_df.to_csv('sentimentAnnotations_CSV/blankAnnotations_M_1.csv')
m_df.head()

Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel
7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,,
3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,,
1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,,
7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,,
6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,,


In [9]:
# Reading in first mutual annotations subset
m_df_annotated = pd.read_csv('sentimentAnnotations_CSV/annotated_M_1.csv')
m_df_annotated.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel
0,7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,neutral,neutral
1,3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,neutral,neutral
2,1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,neutral,neutral
3,7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,neutral,neutral
4,6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,neutral,neutral


# Second set of sentiment annotations (Week of Jul. 4, 2022)

In [10]:
### Second sets of random samples for sentiment annotations
train_sentiment_2 = train_sentiment.copy()

# Chris' second subset
c_sample_2 = list(train_sentiment_2['string'].sample(n=100, random_state=1).index)
# Drop
train_sentiment_2 = train_sentiment_2.drop(c_sample_2)

# Daniel's second subset
d_sample_2 = list(train_sentiment_2['string'].sample(n=200, random_state=1).index)
# Drop
train_sentiment_2 = train_sentiment_2.drop(d_sample_2)

# Second mutual subset (for IAA score)
m_sample_2 = list(train_sentiment_2['string'].sample(n=25, random_state=1).index)
train_sentiment_2 = train_sentiment_2.drop(m_sample_2)

In [11]:
# Chris' second subset in DataFrame form (for adding a sentiment column/feature)
c_df_2 = train_tsv.copy()
c_df_2 = c_df_2.iloc[c_sample_2]
c_df_2['sentiment'] = ['']*100
c_df_2.to_csv('sentimentAnnotations_CSV/blankAnnotations_C_2.csv')
c_df_2.head()

Unnamed: 0,citingPaperID,source,string,label,sentiment
3344,5fa7cd8dfc1907789705613dda8328f7f6ecfc53>ad9a2...,properNoun,Cases were identified and clinical data collec...,method,
5322,410376c6a35d3dd09a544e0b334ac4e11e22da52>bc09d...,explicit,"[5,6,13,14] The abdominal superficial venous d...",background,
5387,c20db8a8cd0649dd3644a897404c8d5100b7c707>76ae7...,explicit,be involved in protein binding (especially of ...,background,
1793,e300bde47ff38b78bd09fc3566e9c3b5a0aa21e5>6cd19...,explicit,We began the analysis with open coding that in...,method,
776,ca4d2aef7421153f0f2c93db54a3e06a4942d329>a0ecd...,explicit,Figure 3 shows the fit of the model to the dyn...,background,


In [12]:
# Reading in Chris' second subset (after annotating manually)
    ###Uncomment lines below after creating your second annotated subset
c_df_annotated_2 = pd.read_csv('sentimentAnnotations_CSV/annotated_C_2.csv')
c_df_annotated_2.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment
0,3344,5fa7cd8dfc1907789705613dda8328f7f6ecfc53>ad9a2...,properNoun,Cases were identified and clinical data collec...,method,neutral
1,5322,410376c6a35d3dd09a544e0b334ac4e11e22da52>bc09d...,explicit,"[5,6,13,14] The abdominal superficial venous d...",background,positive
2,5387,c20db8a8cd0649dd3644a897404c8d5100b7c707>76ae7...,explicit,be involved in protein binding (especially of ...,background,neutral
3,1793,e300bde47ff38b78bd09fc3566e9c3b5a0aa21e5>6cd19...,explicit,We began the analysis with open coding that in...,method,neutral
4,776,ca4d2aef7421153f0f2c93db54a3e06a4942d329>a0ecd...,explicit,Figure 3 shows the fit of the model to the dyn...,background,neutral


In [13]:
# Daniel's second subset in DataFrame form (for adding a sentiment column/feature)
d_df_2 = train_tsv.copy()
d_df_2 = d_df_2.iloc[d_sample_2]
d_df_2['sentiment'] = ['']*200
d_df_2.to_csv('sentimentAnnotations_CSV/blankAnnotations_D_2.csv')
d_df_2.head()

Unnamed: 0,citingPaperID,source,string,label,sentiment
108,44b5a8fde9f6c45652d642ac9d27ecb12d755ce7>a790a...,properNoun,Among possible explanations is the contributio...,result,
3287,0e4bf03c0e7a18078edc742edf2f356c8b29aace>ddd33...,explicit,The 21 RCTs [15-35] and 42 observational studi...,background,
8116,2f82dfa009bbf2a400908101438bf9a18a87a166>f3e01...,explicit,The change in the apparent diameter of the par...,method,
77,27db8d3a2c85cbc34deef51ce4cd850b431b4b34>a81b3...,explicit,The potential for BCL-xL to modulate other cha...,background,
2976,b14d9802f95d696cb7f976988cf874e8f1273749>a4cb5...,explicit,Antidepressant-like effects were measured usin...,method,


In [14]:
# Reading in Daniel's second subset (after annotating manually)
d_df_annotated_2 = pd.read_csv('sentimentAnnotations_CSV/annotated_D_2.csv')
d_df_annotated_2.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment
0,108,44b5a8fde9f6c45652d642ac9d27ecb12d755ce7>a790a...,properNoun,Among possible explanations is the contributio...,result,neutral
1,3287,0e4bf03c0e7a18078edc742edf2f356c8b29aace>ddd33...,explicit,The 21 RCTs [15-35] and 42 observational studi...,background,neutral
2,8116,2f82dfa009bbf2a400908101438bf9a18a87a166>f3e01...,explicit,The change in the apparent diameter of the par...,method,neutral
3,77,27db8d3a2c85cbc34deef51ce4cd850b431b4b34>a81b3...,explicit,The potential for BCL-xL to modulate other cha...,background,neutral
4,2976,b14d9802f95d696cb7f976988cf874e8f1273749>a4cb5...,explicit,Antidepressant-like effects were measured usin...,method,neutral


In [15]:
# Second mutual subset in DataFrame form (for adding IAA sentiment annotations)
m_df_2 = train_tsv.copy()
m_df_2 = m_df_2.iloc[m_sample_2]
m_df_2['sentiment_chris'] = ['']*25
m_df_2['sentiment_daniel'] = ['']*25
m_df_2.to_csv('sentimentAnnotations_CSV/blankAnnotations_M_2.csv')
m_df_2.head()

Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel
7422,cf627c9411e3bf3e67f23df5d5741ed48490a152>3d2c9...,explicit,"Upon initial pulmonary infection, a subset of ...",background,,
5404,29cd81b4d95c3c99d2d27629e2acf0f5e3c2ec17>54996...,explicit,"The model (Rohrer and Berresheim, 2006; Hofzum...",method,,
6247,ed60fe5cf0b2947f812a3d19d533d0f644b67b0d>5af4e...,explicit,"In the discussion of their paper, Hendriks et ...",background,,
42,ac5fb99eda6b95e9703bb3d93417df5f7373e662>e9891...,explicit,A serotype-shift occurred over the years with ...,background,,
313,90e3135b8b3ae3871c2d272a5971b703174bcd96>466e2...,explicit,"To date, these reactions have primarily been i...",background,,


In [16]:
# Reading in second mutual annotations subset
m_df_annotated_2 = pd.read_csv('sentimentAnnotations_CSV/annotated_M_2.csv')
m_df_annotated_2.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel
0,7422,cf627c9411e3bf3e67f23df5d5741ed48490a152>3d2c9...,explicit,"Upon initial pulmonary infection, a subset of ...",background,neutral,neutral
1,5404,29cd81b4d95c3c99d2d27629e2acf0f5e3c2ec17>54996...,explicit,"The model (Rohrer and Berresheim, 2006; Hofzum...",method,neutral,positive
2,6247,ed60fe5cf0b2947f812a3d19d533d0f644b67b0d>5af4e...,explicit,"In the discussion of their paper, Hendriks et ...",background,positive,neutral
3,42,ac5fb99eda6b95e9703bb3d93417df5f7373e662>e9891...,explicit,A serotype-shift occurred over the years with ...,background,neutral,neutral
4,313,90e3135b8b3ae3871c2d272a5971b703174bcd96>466e2...,explicit,"To date, these reactions have primarily been i...",background,neutral,neutral


In [30]:
# More negative data
negative = pd.read_csv('negative.csv')
train_sentiment = train_sentiment.drop(list(negative.iloc[:, 0]))
len(train_sentiment)

8097

# Joining annotated data together

In [17]:
#annotated = pd.concat([c_df_annotated, d_df_annotated])
    ### Comment above line and uncomment lines below after all annotations have been completed
annotated = pd.concat([c_df_annotated, d_df_annotated, c_df_annotated_2, d_df_annotated_2, negative])
annotated_mutual = pd.concat([m_df_annotated, m_df_annotated_2])

In [12]:
annotated.to_csv('sentimentAnnotations_CSV/combined_annotation.csv')

In [13]:
annotated_mutual.to_csv("sentimentAnnotations_CSV/combined_annotation_mutual.csv")

In [None]:
# Calculating IAA score (Cohen's kappa) for first session
mutual_chris = list(m_df_annotated['sentiment_chris'])
mutual_daniel = list(m_df_annotated['sentiment_daniel'])

p_o = 0
for i in np.arange(25):
    if mutual_chris[i] == mutual_daniel[i]:
        p_o += 1
p_o /= 25

p_e = 0
for value in ['positive', 'neutral', 'negative']:
    p_sentiment = (mutual_chris.count(value)/25) * (mutual_daniel.count(value)/25)
    p_e += p_sentiment

kappa = (p_o - p_e) / (1 - p_e)
kappa

In [None]:
# Calculating IAA score (Cohen's kappa) after second session
mutual_chris = list(annotated_mutual['sentiment_chris'])
mutual_daniel = list(annotated_mutual['sentiment_daniel'])

p_o = 0
for i in np.arange(50):
    if mutual_chris[i] == mutual_daniel[i]:
        p_o += 1
p_o /= 50

p_e = 0
for value in ['positive', 'neutral', 'negative']:
    p_sentiment = (mutual_chris.count(value)/50) * (mutual_daniel.count(value)/50)
    p_e += p_sentiment

kappa_final = (p_o - p_e) / (1 - p_e)
kappa_final

# Applying sentiment classifiers

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
nltk.download('vader_lexicon')

In [None]:
sid = SentimentIntensityAnalyzer()

In [None]:
m_df_annotated['pos'],m_df_annotated['neg'],m_df_annotated['neu'] = 0,0,0

In [None]:
a=m_df_annotated['string'][0]

In [None]:
output = sid.polarity_scores("Hello there")

In [None]:
# Calculating final IAA score (Cohen's kappa)
mutual_chris = list(annotated_mutual['sentiment_chris'])
mutual_daniel = list(annotated_mutual['sentiment_daniel'])

p_o = 0
for i in np.arange(50):
    if mutual_chris[i] == mutual_daniel[i]:
        p_o += 1
p_o /= 50

p_e = 0
for value in ['positive', 'neutral', 'negative']:
    p_sentiment = (mutual_chris.count(value)/50) * (mutual_daniel.count(value)/50)
    p_e += p_sentiment

kappa_final = (p_o - p_e) / (1 - p_e)
kappa_final

In [26]:
# Calculating final IAA score (Cohen's kappa)
mutual_chris = list(annotated_mutual['sentiment_chris'])
mutual_daniel = list(annotated_mutual['sentiment_daniel'])

p_o = 0
for i in np.arange(50):
    if mutual_chris[i] == mutual_daniel[i]:
        p_o += 1
p_o /= 50

p_e = 0
for value in ['positive', 'neutral', 'negative']:
    p_sentiment = (mutual_chris.count(value)/50) * (mutual_daniel.count(value)/50)
    p_e += p_sentiment

kappa_final = (p_o - p_e) / (1 - p_e)
kappa_final

0.7330960854092526

# Applying sentiment classifiers

In [3]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [4]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/lolai/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [7]:
sid = SentimentIntensityAnalyzer()

In [23]:
m_df_annotated['pos'],m_df_annotated['neg'],m_df_annotated['neu'] = 0,0,0

In [30]:
a=m_df_annotated['string'][0]

In [8]:
output = sid.polarity_scores("Hello there")

In [9]:
m_df_annotated['sc ores'] = m_df_annotated['string'].apply(lambda string: sid.polarity_scores(string))

In [10]:
m_df_annotated

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel,sc ores
0,7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,neutral,neutral,"{'neg': 0.176, 'neu': 0.621, 'pos': 0.203, 'co..."
1,3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,neutral,neutral,"{'neg': 0.0, 'neu': 0.938, 'pos': 0.062, 'comp..."
2,1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,neutral,neutral,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,neutral,neutral,"{'neg': 0.0, 'neu': 0.931, 'pos': 0.069, 'comp..."
4,6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,neutral,neutral,"{'neg': 0.0, 'neu': 0.935, 'pos': 0.065, 'comp..."
5,2543,2c0cab0d657f6c225c74736fed02750c381032de>02cdb...,explicit,These results are similar to other reports [34...,result,positive,positive,"{'neg': 0.059, 'neu': 0.88, 'pos': 0.062, 'com..."
6,8093,7dda92d7dbb1a5f7fc78bba6f81890b3832cb2d6>e6f6f...,explicit,A type I T-cell gene signature that promotes b...,background,positive,positive,"{'neg': 0.131, 'neu': 0.73, 'pos': 0.139, 'com..."
7,5365,9237db18530d0de2e0281e735c10570d7c235b36>14682...,explicit,"Resistance to TB involves macrophages [2,4], d...",background,positive,positive,"{'neg': 0.0, 'neu': 0.948, 'pos': 0.052, 'comp..."
8,4663,1ac9706400e1c06a100add4b83a67af503404d8d>1fc59...,explicit,While the prevalence of hyperechogenicity is o...,background,positive,neutral,"{'neg': 0.0, 'neu': 0.87, 'pos': 0.13, 'compou..."
9,3065,4f4c6a035c0c28cfe5855a720a8f7a16a401e9f8>0451f...,explicit,There have been few animal studies on this sub...,background,negative,negative,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [17]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/lolai/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /home/lolai/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [23]:
for index, row in m_df_annotated.iterrows():
    blob = TextBlob(row['string'], analyzer=NaiveBayesAnalyzer())
    m_df_annotated.at[index, 'blob'] = blob.sentiment[1]

In [27]:
m_df_annotated['blob'] = m_df_annotated['string'].apply(lambda string: TextBlob(string).polarity)

In [28]:
m_df_annotated

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel,sc ores,blob
0,7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,neutral,neutral,"{'neg': 0.176, 'neu': 0.621, 'pos': 0.203, 'co...",-0.036364
1,3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,neutral,neutral,"{'neg': 0.0, 'neu': 0.938, 'pos': 0.062, 'comp...",0.1
2,1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,neutral,neutral,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
3,7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,neutral,neutral,"{'neg': 0.0, 'neu': 0.931, 'pos': 0.069, 'comp...",-0.051852
4,6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,neutral,neutral,"{'neg': 0.0, 'neu': 0.935, 'pos': 0.065, 'comp...",-0.144345
5,2543,2c0cab0d657f6c225c74736fed02750c381032de>02cdb...,explicit,These results are similar to other reports [34...,result,positive,positive,"{'neg': 0.059, 'neu': 0.88, 'pos': 0.062, 'com...",-0.0625
6,8093,7dda92d7dbb1a5f7fc78bba6f81890b3832cb2d6>e6f6f...,explicit,A type I T-cell gene signature that promotes b...,background,positive,positive,"{'neg': 0.131, 'neu': 0.73, 'pos': 0.139, 'com...",0.0
7,5365,9237db18530d0de2e0281e735c10570d7c235b36>14682...,explicit,"Resistance to TB involves macrophages [2,4], d...",background,positive,positive,"{'neg': 0.0, 'neu': 0.948, 'pos': 0.052, 'comp...",0.0
8,4663,1ac9706400e1c06a100add4b83a67af503404d8d>1fc59...,explicit,While the prevalence of hyperechogenicity is o...,background,positive,neutral,"{'neg': 0.0, 'neu': 0.87, 'pos': 0.13, 'compou...",0.34375
9,3065,4f4c6a035c0c28cfe5855a720a8f7a16a401e9f8>0451f...,explicit,There have been few animal studies on this sub...,background,negative,negative,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",-0.183333


# Train sentiment classifier on annotated data BERT

In [11]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, DataCollatorWithPadding
import datasets

In [2]:
imdb = datasets.load_dataset("imdb")
imdb

Reusing dataset imdb (/home/lolai/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [3]:
small_train_dataset = imdb["train"].shuffle(seed=42).select([i for i in list(range(3000))])
small_test_dataset = imdb["test"].shuffle(seed=42).select([i for i in list(range(300))])


Loading cached shuffled indices for dataset at /home/lolai/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-8a9e43a6ac4acdff.arrow
Loading cached shuffled indices for dataset at /home/lolai/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-2eff9f118d84c6fe.arrow


In [53]:
bert_pd = pd.DataFrame()
bert_pd['text'], bert_pd['label'] = annotated['string'], annotated['sentiment']
bert_pd

Unnamed: 0,text,label
0,"Utilizing gain-offunction assays, we demonstra...",positive
1,"maticity, refractoriness and conduction of the...",neutral
2,"For reovirus 1/L-induced ARDS, at day 9 postin...",neutral
3,"Besides that, no ameliorate impacts have been ...",negative
4,"However, limited information is available abou...",negative
...,...,...
45,A recent review by Kazan et al depicted curren...,positive
46,"…is variation among ecosystems, the most commo...",positive
47,"Thirdly, ErbB-3 is characterized by a defectiv...",neutral
48,"These clusters of informative voxels, along wi...",positive


In [54]:
bert_pd_test = pd.DataFrame()
bert_pd_test['text'], bert_pd_test['label'] = m_df_annotated['string'], m_df_annotated['sentiment_chris']
bert_pd_test

Unnamed: 0,text,label
0,The self-report component measures interpretat...,neutral
1,The age of giant nupDNA fragment A was calcula...,neutral
2,Two representative software of this type are s...,neutral
3,How are Bcl-2 and Mcl-1 levels regulated in a ...,neutral
4,"Two traits are orthogonal when, based on indiv...",neutral
5,These results are similar to other reports [34...,positive
6,A type I T-cell gene signature that promotes b...,positive
7,"Resistance to TB involves macrophages [2,4], d...",positive
8,While the prevalence of hyperechogenicity is o...,positive
9,There have been few animal studies on this sub...,negative


In [57]:
ds_train = datasets.Dataset.from_pandas(bert_pd)
ds_train = ds_train.remove_columns('__index_level_0__')
ds_train

Dataset({
    features: ['text', 'label'],
    num_rows: 100
})

In [58]:
ds_test = datasets.Dataset.from_pandas(bert_pd_test)
ds_test

Dataset({
    features: ['text', 'label'],
    num_rows: 25
})

In [74]:
ds_train['text']

['Utilizing gain-offunction assays, we demonstrate that enhancing actin polymer-\\nthese proteins does not increase SV density (Scheiffele et al., 2000; Bamji et al., 2003; Sara et al., 2005; Latefi et al., 2009; Linhoff et al., 2009).',
 'maticity, refractoriness and conduction of the heart are conserved between zebrafish and humans [44, 46].',
 'For reovirus 1/L-induced ARDS, at day 9 postinoculation, there is a severe pneumonia (peribronchiolar lesions with lymphocytic infiltration) with the presence of hyaline membranes, which are pathognomonic for human ARDS (20, 21).',
 'Besides that, no ameliorate impacts have been obtained on the use of antioxidants in some studies (12,13).',
 'However, limited information is available about molecular mechanisms of HM tolerance and detoxification in trees, and most of HM related genes in woody plants are restricted to Populus (Luo et al., 2016).',
 'A recent qualitative study explored the views of individuals living with SCI towards the descrip

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def preprocess_function(examples):
   return tokenizer(examples["text"], truncation=True, padding=True)

"""tokenized_train = ds_train.map(preprocess_function, batched=True,
                               #remove_columns=ds_train.column_names
                               )
tokenized_test = ds_test.map(preprocess_function, batched=True,
                             #remove_columns=ds_test.column_names
                             )"""
tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [83]:
tokenized_train

Dataset({
    features: ['attention_mask', 'input_ids', 'label', 'text'],
    num_rows: 3000
})

In [6]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

In [7]:
import numpy as np
from datasets import load_metric, ClassLabel


def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")

   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}


In [11]:
training_args = TrainingArguments(
   output_dir="bert_test",
   learning_rate=2e-5,
   per_device_train_batch_size=8,
   per_device_eval_batch_size=8,
   num_train_epochs=5,
   weight_decay=0.01,
   save_strategy="epoch",
   gradient_accumulation_steps=2,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [12]:
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [13]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 935
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 7.76 GiB total capacity; 5.50 GiB already allocated; 19.75 MiB free; 5.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Optuna Hyper Search

In [9]:
import datasets
import optuna
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import StratifiedKFold
import numpy as np
from datasets import load_metric, ClassLabel


In [4]:
# Make the kfold object
folds = StratifiedKFold(n_splits=5)
# Then get the dataset
data = datasets.load_dataset('csv', data_files={'train':'sentimentAnnotations_CSV/annotation_450.csv'}).shuffle()

# Splits based off labels.
splits = folds.split(np.zeros(data["train"].num_rows), data["train"]["label"])

Using custom data configuration default-947c940d5fb73b0e
Reusing dataset csv (/home/lolai/.cache/huggingface/datasets/csv/default-947c940d5fb73b0e/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=-1)

    return metric.compute(predictions=predictions, references=labels)

def compute_metrics2(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average="macro")
    precision = precision_score(y_true=labels, y_pred=pred, average="macro")
    f1 = f1_score(y_true=labels, y_pred=pred, average="macro")

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

scores = {}
# Override train/test
for split in splits:
    splitx = split
data = None
data = datasets.load_dataset('csv', data_files={'train':'sentimentAnnotations_CSV/annotation_450.csv'})
data['test'] = data['train'].select(splitx[1])
data['train'] = data['train'].select(splitx[0])
labels = ClassLabel(num_classes=3, names=['positive', 'negative', 'neutral'])

def preprocess_function(batch):
    tokens = tokenizer(batch['text'], padding='max_length', truncation=True)
    tokens['label'] = labels.str2int(batch['label'])
    return tokens
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_data = data.map(preprocess_function, batched=True,
                          remove_columns=data["train"].column_names
                          )

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Using custom data configuration default-947c940d5fb73b0e
Reusing dataset csv (/home/lolai/.cache/huggingface/datasets/csv/default-947c940d5fb73b0e/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /home/lolai/.cache/huggingface/transformers/0e1bbfda7f63a99bb

  0%|          | 0/1 [00:00<?, ?ba/s]

In [26]:
def objective(trial: optuna.Trial):
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
    training_args = TrainingArguments(output_dir='ade-test',
                                      learning_rate=trial.suggest_loguniform('learning_rate', low=4e-5, high=0.01),
                                      weight_decay=trial.suggest_loguniform('weight_decay', 4e-5, 0.01),
                                      num_train_epochs=trial.suggest_int('num_train_epochs', low = 2,high = 5),
                                    per_device_train_batch_size=8,
                                    per_device_eval_batch_size=8,
                                    disable_tqdm=True)
    trainer = Trainer(model=model,
                        args=training_args,
                        train_dataset=tokenized_data['train'],
                        eval_dataset=tokenized_data['test'],
                        tokenizer=tokenizer,
                        data_collator=data_collator,
                        )
    result = trainer.train()
    return result.training_loss

In [27]:
study = optuna.create_study(study_name='hyper-parameter-search', direction='minimize')
study.optimize(func=objective, n_trials=15)
print(study.best_value)
print(study.best_params)
print(study.best_trial)

[32m[I 2022-07-09 00:26:51,939][0m A new study created in memory with name: hyper-parameter-search[0m
loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos

{'train_runtime': 41.4512, 'train_samples_per_second': 45.355, 'train_steps_per_second': 5.669, 'train_loss': 0.9095260295462101, 'epoch': 5.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 24.8442, 'train_samples_per_second': 45.403, 'train_steps_per_second': 5.675, 'train_loss': 0.904352661565686, 'epoch': 3.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 32.5093, 'train_samples_per_second': 46.264, 'train_steps_per_second': 5.783, 'train_loss': 0.8182286201639378, 'epoch': 4.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 15.5855, 'train_samples_per_second': 48.25, 'train_steps_per_second': 6.031, 'train_loss': 0.8116566272492104, 'epoch': 2.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 24.2444, 'train_samples_per_second': 46.526, 'train_steps_per_second': 5.816, 'train_loss': 0.8943296391913231, 'epoch': 3.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 30.8911, 'train_samples_per_second': 48.687, 'train_steps_per_second': 6.086, 'train_loss': 0.90737687780502, 'epoch': 4.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 39.4415, 'train_samples_per_second': 47.666, 'train_steps_per_second': 5.958, 'train_loss': 0.8909566676363032, 'epoch': 5.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 23.7844, 'train_samples_per_second': 47.426, 'train_steps_per_second': 5.928, 'train_loss': 0.6193275181114251, 'epoch': 3.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 24.6678, 'train_samples_per_second': 45.728, 'train_steps_per_second': 5.716, 'train_loss': 0.5931765508989916, 'epoch': 3.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 23.2888, 'train_samples_per_second': 48.435, 'train_steps_per_second': 6.054, 'train_loss': 0.5813875671819593, 'epoch': 3.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 16.1414, 'train_samples_per_second': 46.588, 'train_steps_per_second': 5.824, 'train_loss': 1.4581470895320812, 'epoch': 2.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 23.2503, 'train_samples_per_second': 48.516, 'train_steps_per_second': 6.064, 'train_loss': 0.5749863969518784, 'epoch': 3.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 15.438, 'train_samples_per_second': 48.711, 'train_steps_per_second': 6.089, 'train_loss': 0.689926472116024, 'epoch': 2.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 30.8798, 'train_samples_per_second': 48.705, 'train_steps_per_second': 6.088, 'train_loss': 0.4329967904598155, 'epoch': 4.0}


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/lolai/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

load

{'train_runtime': 30.8624, 'train_samples_per_second': 48.732, 'train_steps_per_second': 6.092, 'train_loss': 0.4407879849697681, 'epoch': 4.0}
0.4329967904598155
{'learning_rate': 0.00011715503310767902, 'weight_decay': 5.399698478570332e-05, 'num_train_epochs': 4}
FrozenTrial(number=13, values=[0.4329967904598155], datetime_start=datetime.datetime(2022, 7, 9, 0, 32, 44, 396380), datetime_complete=datetime.datetime(2022, 7, 9, 0, 33, 16, 562733), params={'learning_rate': 0.00011715503310767902, 'weight_decay': 5.399698478570332e-05, 'num_train_epochs': 4}, distributions={'learning_rate': LogUniformDistribution(high=0.01, low=4e-05), 'weight_decay': LogUniformDistribution(high=0.01, low=4e-05), 'num_train_epochs': IntUniformDistribution(high=5, low=2, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=13, state=TrialState.COMPLETE, value=None)


In [28]:
study.best_params

{'learning_rate': 0.00011715503310767902,
 'weight_decay': 5.399698478570332e-05,
 'num_train_epochs': 4}

# Classify Dataset

In [1]:
from sentiment_classifier import classify_sent
import pandas as pd
import datasets

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 27% | 23% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 40% | 25% |


In [2]:
train_tsv = pd.read_csv('scicite/tsv/train.tsv', sep='\t',
                       names=["citingPaperID", "source", "string", "label"]
                       )
test_tsv = pd.read_csv('scicite/tsv/test.tsv', sep='\t',
                       names=["citingPaperID", "source", "string", "label"]
                       )

In [12]:
train_dataset = datasets.load_dataset('csv', data_files={'train':'scicite/csv/train.csv'})
dev_dataset = datasets.load_dataset('csv', data_files={'train':'scicite/csv/dev.csv'})
test_dataset = datasets.load_dataset('csv', data_files={'train':'scicite/csv/test.csv'})

Using custom data configuration default-ccba29a05905e5d2
Reusing dataset csv (/home/lolai/.cache/huggingface/datasets/csv/default-ccba29a05905e5d2/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-c286d4a61149d838


Downloading and preparing dataset csv/default to /home/lolai/.cache/huggingface/datasets/csv/default-c286d4a61149d838/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/lolai/.cache/huggingface/datasets/csv/default-c286d4a61149d838/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-8cc3c7744943e4b7


Downloading and preparing dataset csv/default to /home/lolai/.cache/huggingface/datasets/csv/default-8cc3c7744943e4b7/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/lolai/.cache/huggingface/datasets/csv/default-8cc3c7744943e4b7/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8243
    })
})

In [13]:
train_sents = classify_sent(train_dataset['train']['text'], "./Sentiment_Full_Model/model")
dev_sents = classify_sent(dev_dataset['train']['text'], "./Sentiment_Full_Model/model")
test_sents = classify_sent(test_dataset['train']['text'], "./Sentiment_Full_Model/model")

In [15]:
train_dataset = train_dataset['train'].add_column("sentiment", train_sents)
dev_dataset = dev_dataset['train'].add_column("sentiment", dev_sents)
test_dataset = test_dataset['train'].add_column("sentiment", test_sents)

In [16]:
test_dataset[1]

{'ID': 'fa7145adc9f8cfb8af7a189d9040c13c84ced094>20e23b4f76761d246a7c3b00b80e139e2008f77d_0',
 'explicit': 'explicit',
 'text': 'In addition the result of the present study supports previous studies which did not find increased rates of first-born children among individual with OCD (203134).',
 'label': 'result',
 'sentiment': 'positive'}

In [17]:
train_dataset.to_csv("train_sent.csv")
dev_dataset.to_csv("dev_sent.csv")
test_dataset.to_csv("test_sent.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

623996