In [2]:
import numpy as np
import pandas as pd

In [2]:
### Read in training dataset (TSV)
# Filepath will be different; set to repo's filepath when we add the corpus files
train_tsv = pd.read_csv('scicite/tsv/train.tsv', sep='\t', 
                       names=["citingPaperID", "source", "string", "label"]
                       )
train_tsv.head()

Unnamed: 0,citingPaperID,source,string,label
0,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,explicit,"However, how frataxin interacts with the Fe-S ...",background
1,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,explicit,"In the study by Hickey et al. (2012), spikes w...",background
2,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,explicit,"The drug also reduces catecholamine secretion,...",background
3,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,explicit,By clustering with lowly aggressive close kin ...,background
4,88b86556857f4374842d2af2e359576806239175>a5bb0...,explicit,Ophthalmic symptoms are rare manifestations of...,background


# First set of sentiment annotations (Week of Jun. 27, 2022)

In [5]:
### Getting samples of the data for sentiment annotating
train_sentiment = train_tsv.copy()

# Chris' subset
c_sample = list(train_sentiment['string'].sample(n=50, random_state=1).index)
# Drop
train_sentiment = train_sentiment.drop(c_sample)

# Daniel's subset
d_sample = list(train_sentiment['string'].sample(n=50, random_state=1).index)
# Drop
train_sentiment = train_sentiment.drop(d_sample)

# Mutual subset (for IAA score)
m_sample = list(train_sentiment['string'].sample(n=25, random_state=1).index)
train_sentiment = train_sentiment.drop(m_sample)

In [None]:
# Chris' subset in DataFrame form (for adding a sentiment column/feature)
c_df = train_tsv.copy()
c_df = c_df.iloc[c_sample]
c_df['sentiment'] = ['']*50
c_df.to_csv('sentimentAnnotations_CSV/blankAnnotations_C_1.csv')
c_df.head()

In [3]:
# Reading in Chris' subset (after annotating manually)
c_df_annotated = pd.read_csv('sentimentAnnotations_CSV/annotated_C_1.csv')
c_df_annotated.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment
0,2305,f6e7d88342d23cdc6dc959b16106b6bb05fa807a>0fee7...,explicit,"Utilizing gain-offunction assays, we demonstra...",method,positive
1,3210,51439b498621806aa3a915fdbd4aa983473da397>6eb72...,explicit,"maticity, refractoriness and conduction of the...",background,neutral
2,4484,64cf98067d1cadda174e5ffa24fc45a6b3e6426f>2d04a...,explicit,"For reovirus 1/L-induced ARDS, at day 9 postin...",background,neutral
3,5821,9b6d433ab5104713ee41fbeb24b4d0f2b82fe587>95158...,explicit,"Besides that, no ameliorate impacts have been ...",background,negative
4,3514,15ac5bcd9ca900dc948e969558670084e3d560e0>5297c...,explicit,"However, limited information is available abou...",background,negative


In [None]:
# Daniel's subset in DataFrame form (for adding a sentiment column/feature)
d_df = train_tsv.copy()
d_df = d_df.iloc[d_sample]
d_df['sentiment'] = ['']*50
d_df.to_csv('sentimentAnnotations_CSV/blankAnnotations_D_1.csv')
d_df.head()

In [4]:
# Reading in Daniel's subset (after annotating manually)
d_df_annotated = pd.read_csv('sentimentAnnotations_CSV/annotated_D_1.csv')
d_df_annotated.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment
0,3923,6828a9cc19290ccd8cc99ccc678a042b00d92125>26cad...,explicit,The relationship between DNA content and absor...,background,positive
1,4306,5cecd1d9932d3f269b1caa45d07e84e3376c3710>9a0f5...,explicit,With respect to the contradicting results of o...,result,negative
2,4960,1890ccd4d2a4d542ba24081b593ab8060e38f101>12732...,explicit,We elected to take this course because formal ...,method,neutral
3,6921,e2e9ce75aab15707ff3b85378b61f161c0b7886a>39688...,explicit,Nuclear localization of PTOV1 is required for ...,background,neutral
4,3526,afe94f58975df56a9c0450c8153c2dd43d0b597b>ec694...,explicit,The remaining medial portion of the bone was p...,method,neutral


In [None]:
# Mutual subset in DataFrame form (for adding IAA sentiment annotations)
m_df = train_tsv.copy()
m_df = m_df.iloc[m_index]
m_df['sentiment_chris'] = ['']*25
m_df['sentiment_daniel'] = ['']*25
m_df.to_csv('sentimentAnnotations_CSV/blankAnnotations_M_1.csv')
m_df.head()

In [5]:
# Reading in first mutual annotations subset
m_df_annotated = pd.read_csv('sentimentAnnotations_CSV/annotated_M_1.csv')
m_df_annotated.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel
0,7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,neutral,neutral
1,3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,neutral,neutral
2,1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,neutral,neutral
3,7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,neutral,neutral
4,6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,neutral,neutral


# Second set of sentiment annotations (Week of Jul. 4, 2022)

In [6]:
### Second sets of random samples for sentiment annotations
train_sentiment_2 = train_sentiment.copy()

# Chris' second subset
c_sample_2 = list(train_sentiment_2['string'].sample(n=100, random_state=1).index)
# Drop
train_sentiment_2 = train_sentiment_2.drop(c_sample_2)

# Daniel's second subset
d_sample_2 = list(train_sentiment_2['string'].sample(n=200, random_state=1).index)
# Drop
train_sentiment_2 = train_sentiment_2.drop(d_sample_2)

# Second mutual subset (for IAA score)
m_sample_2 = list(train_sentiment_2['string'].sample(n=25, random_state=1).index)
train_sentiment_2 = train_sentiment_2.drop(m_sample_2)

In [None]:
# Chris' second subset in DataFrame form (for adding a sentiment column/feature)
c_df_2 = train_tsv.copy()
c_df_2 = c_df_2.iloc[c_sample_2]
c_df_2['sentiment'] = ['']*100
c_df_2.to_csv('sentimentAnnotations_CSV/blankAnnotations_C_2.csv')
c_df_2.head()

In [5]:
# Reading in Chris' second subset (after annotating manually)
    ###Uncomment lines below after creating your second annotated subset
c_df_annotated_2 = pd.read_csv('sentimentAnnotations_CSV/annotated_C_2.csv')
c_df_annotated_2.head()

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment
0,3344,5fa7cd8dfc1907789705613dda8328f7f6ecfc53>ad9a2...,properNoun,Cases were identified and clinical data collec...,method,neutral
1,5322,410376c6a35d3dd09a544e0b334ac4e11e22da52>bc09d...,explicit,"[5,6,13,14] The abdominal superficial venous d...",background,positive
2,5387,c20db8a8cd0649dd3644a897404c8d5100b7c707>76ae7...,explicit,be involved in protein binding (especially of ...,background,neutral
3,1793,e300bde47ff38b78bd09fc3566e9c3b5a0aa21e5>6cd19...,explicit,We began the analysis with open coding that in...,method,neutral
4,776,ca4d2aef7421153f0f2c93db54a3e06a4942d329>a0ecd...,explicit,Figure 3 shows the fit of the model to the dyn...,background,neutral


In [None]:
# Daniel's second subset in DataFrame form (for adding a sentiment column/feature)
d_df_2 = train_tsv.copy()
d_df_2 = d_df_2.iloc[d_sample_2]
d_df_2['sentiment'] = ['']*200
d_df_2.to_csv('sentimentAnnotations_CSV/blankAnnotations_D_2.csv')
d_df_2.head()

In [None]:
# Reading in Daniel's second subset (after annotating manually)
    ### Uncomment lines below after creating your second annotated subset
#d_df_annotated_2 = pd.read_csv('sentimentAnnotations_CSV/annotated_D_2.csv')
#d_df_annotated_2.head()

In [None]:
# Second mutual subset in DataFrame form (for adding IAA sentiment annotations)
m_df_2 = train_tsv.copy()
m_df_2 = m_df_2.iloc[m_sample_2]
m_df_2['sentiment_chris'] = ['']*25
m_df_2['sentiment_daniel'] = ['']*25
m_df_2.to_csv('sentimentAnnotations_CSV/blankAnnotations_M_2.csv')
m_df_2.head()

In [None]:
# Reading in second mutual annotations subset
    ### Uncomment after annotating the second mutual subset
#m_df_annotated_2 = pd.read_csv('sentimentAnnotations_CSV/annotated_M_2.csv')
#m_df_annotated_2.head()

# Joining annotated data together

In [6]:
annotated = pd.concat([c_df_annotated, d_df_annotated])
    ### Comment above line and uncomment lines below after all annotations have been completed
#annotated = pd.concat([c_df_annotated, d_df_annotated, c_df_annotated_2, d_df_annotated_2])
#annotated_mutual = pd.concat([m_df_annotated, m_df_annotated_2])

In [None]:
# Calculating IAA score (Cohen's kappa)
mutual_chris = list(m_df_annotated['sentiment_chris'])
mutual_daniel = list(m_df_annotated['sentiment_daniel'])

p_o = 0
for i in np.arange(25):
    if mutual_chris[i] == mutual_daniel[i]:
        p_o += 1
p_o /= 25

p_e = 0
for value in ['positive', 'neutral', 'negative']:
    p_sentiment = (mutual_chris.count(value)/25) * (mutual_daniel.count(value)/25)
    p_e += p_sentiment

kappa = (p_o - p_e) / (1 - p_e)
kappa

# Applying sentiment classifiers

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
nltk.download('vader_lexicon')

In [None]:
sid = SentimentIntensityAnalyzer()

In [None]:
m_df_annotated['pos'],m_df_annotated['neg'],m_df_annotated['neu'] = 0,0,0

In [None]:
a=m_df_annotated['string'][0]

In [None]:
output = sid.polarity_scores("Hello there")

In [97]:
# Calculating IAA score (Cohen's kappa)
mutual_chris = list(m_df_annotated['sentiment_chris'])
mutual_daniel = list(m_df_annotated['sentiment_daniel'])

p_o = 0
for i in np.arange(25):
    if mutual_chris[i] == mutual_daniel[i]:
        p_o += 1
p_o /= 25

p_e = 0
for value in ['positive', 'neutral', 'negative']:
    p_sentiment = (mutual_chris.count(value)/25) * (mutual_daniel.count(value)/25)
    p_e += p_sentiment

kappa = (p_o - p_e) / (1 - p_e)
kappa

0.7005988023952094

# Applying sentiment classifiers

In [3]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [4]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/lolai/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [7]:
sid = SentimentIntensityAnalyzer()

In [23]:
m_df_annotated['pos'],m_df_annotated['neg'],m_df_annotated['neu'] = 0,0,0

In [30]:
a=m_df_annotated['string'][0]

In [8]:
output = sid.polarity_scores("Hello there")

In [9]:
m_df_annotated['sc ores'] = m_df_annotated['string'].apply(lambda string: sid.polarity_scores(string))

In [10]:
m_df_annotated

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel,sc ores
0,7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,neutral,neutral,"{'neg': 0.176, 'neu': 0.621, 'pos': 0.203, 'co..."
1,3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,neutral,neutral,"{'neg': 0.0, 'neu': 0.938, 'pos': 0.062, 'comp..."
2,1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,neutral,neutral,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,neutral,neutral,"{'neg': 0.0, 'neu': 0.931, 'pos': 0.069, 'comp..."
4,6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,neutral,neutral,"{'neg': 0.0, 'neu': 0.935, 'pos': 0.065, 'comp..."
5,2543,2c0cab0d657f6c225c74736fed02750c381032de>02cdb...,explicit,These results are similar to other reports [34...,result,positive,positive,"{'neg': 0.059, 'neu': 0.88, 'pos': 0.062, 'com..."
6,8093,7dda92d7dbb1a5f7fc78bba6f81890b3832cb2d6>e6f6f...,explicit,A type I T-cell gene signature that promotes b...,background,positive,positive,"{'neg': 0.131, 'neu': 0.73, 'pos': 0.139, 'com..."
7,5365,9237db18530d0de2e0281e735c10570d7c235b36>14682...,explicit,"Resistance to TB involves macrophages [2,4], d...",background,positive,positive,"{'neg': 0.0, 'neu': 0.948, 'pos': 0.052, 'comp..."
8,4663,1ac9706400e1c06a100add4b83a67af503404d8d>1fc59...,explicit,While the prevalence of hyperechogenicity is o...,background,positive,neutral,"{'neg': 0.0, 'neu': 0.87, 'pos': 0.13, 'compou..."
9,3065,4f4c6a035c0c28cfe5855a720a8f7a16a401e9f8>0451f...,explicit,There have been few animal studies on this sub...,background,negative,negative,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [17]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/lolai/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /home/lolai/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [23]:
for index, row in m_df_annotated.iterrows():
    blob = TextBlob(row['string'], analyzer=NaiveBayesAnalyzer())
    m_df_annotated.at[index, 'blob'] = blob.sentiment[1]

In [27]:
m_df_annotated['blob'] = m_df_annotated['string'].apply(lambda string: TextBlob(string).polarity)

In [28]:
m_df_annotated

Unnamed: 0.1,Unnamed: 0,citingPaperID,source,string,label,sentiment_chris,sentiment_daniel,sc ores,blob
0,7792,0244de3610992df3e77b65de794861810a673a48>89634...,explicit,The self-report component measures interpretat...,background,neutral,neutral,"{'neg': 0.176, 'neu': 0.621, 'pos': 0.203, 'co...",-0.036364
1,3733,f98fa4590950e29e660b3d3179a7bb570ee4e132>cf4b7...,explicit,The age of giant nupDNA fragment A was calcula...,method,neutral,neutral,"{'neg': 0.0, 'neu': 0.938, 'pos': 0.062, 'comp...",0.1
2,1237,bc103d96366ec97e0dd620894bbb04c8849eb772>919a7...,explicit,Two representative software of this type are s...,method,neutral,neutral,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
3,7052,e3450e8ca6d743f04a1ad41297ab59eeb82299b4>b5fcd...,explicit,How are Bcl-2 and Mcl-1 levels regulated in a ...,background,neutral,neutral,"{'neg': 0.0, 'neu': 0.931, 'pos': 0.069, 'comp...",-0.051852
4,6524,8ebd0c64a3f1833cac7b8191303ea2bbd05682b5>86fde...,explicit,"Two traits are orthogonal when, based on indiv...",background,neutral,neutral,"{'neg': 0.0, 'neu': 0.935, 'pos': 0.065, 'comp...",-0.144345
5,2543,2c0cab0d657f6c225c74736fed02750c381032de>02cdb...,explicit,These results are similar to other reports [34...,result,positive,positive,"{'neg': 0.059, 'neu': 0.88, 'pos': 0.062, 'com...",-0.0625
6,8093,7dda92d7dbb1a5f7fc78bba6f81890b3832cb2d6>e6f6f...,explicit,A type I T-cell gene signature that promotes b...,background,positive,positive,"{'neg': 0.131, 'neu': 0.73, 'pos': 0.139, 'com...",0.0
7,5365,9237db18530d0de2e0281e735c10570d7c235b36>14682...,explicit,"Resistance to TB involves macrophages [2,4], d...",background,positive,positive,"{'neg': 0.0, 'neu': 0.948, 'pos': 0.052, 'comp...",0.0
8,4663,1ac9706400e1c06a100add4b83a67af503404d8d>1fc59...,explicit,While the prevalence of hyperechogenicity is o...,background,positive,neutral,"{'neg': 0.0, 'neu': 0.87, 'pos': 0.13, 'compou...",0.34375
9,3065,4f4c6a035c0c28cfe5855a720a8f7a16a401e9f8>0451f...,explicit,There have been few animal studies on this sub...,background,negative,negative,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",-0.183333


# Train sentiment classifier on annotated data BERT

In [1]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, DataCollatorWithPadding
import datasets

In [2]:
imdb = datasets.load_dataset("imdb")
imdb

Reusing dataset imdb (/home/lolai/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [3]:
small_train_dataset = imdb["train"].shuffle(seed=42).select([i for i in list(range(3000))])
small_test_dataset = imdb["test"].shuffle(seed=42).select([i for i in list(range(300))])


Loading cached shuffled indices for dataset at /home/lolai/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-8a9e43a6ac4acdff.arrow
Loading cached shuffled indices for dataset at /home/lolai/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-2eff9f118d84c6fe.arrow


In [53]:
bert_pd = pd.DataFrame()
bert_pd['text'], bert_pd['label'] = annotated['string'], annotated['sentiment']
bert_pd

Unnamed: 0,text,label
0,"Utilizing gain-offunction assays, we demonstra...",positive
1,"maticity, refractoriness and conduction of the...",neutral
2,"For reovirus 1/L-induced ARDS, at day 9 postin...",neutral
3,"Besides that, no ameliorate impacts have been ...",negative
4,"However, limited information is available abou...",negative
...,...,...
45,A recent review by Kazan et al depicted curren...,positive
46,"…is variation among ecosystems, the most commo...",positive
47,"Thirdly, ErbB-3 is characterized by a defectiv...",neutral
48,"These clusters of informative voxels, along wi...",positive


In [54]:
bert_pd_test = pd.DataFrame()
bert_pd_test['text'], bert_pd_test['label'] = m_df_annotated['string'], m_df_annotated['sentiment_chris']
bert_pd_test

Unnamed: 0,text,label
0,The self-report component measures interpretat...,neutral
1,The age of giant nupDNA fragment A was calcula...,neutral
2,Two representative software of this type are s...,neutral
3,How are Bcl-2 and Mcl-1 levels regulated in a ...,neutral
4,"Two traits are orthogonal when, based on indiv...",neutral
5,These results are similar to other reports [34...,positive
6,A type I T-cell gene signature that promotes b...,positive
7,"Resistance to TB involves macrophages [2,4], d...",positive
8,While the prevalence of hyperechogenicity is o...,positive
9,There have been few animal studies on this sub...,negative


In [57]:
ds_train = datasets.Dataset.from_pandas(bert_pd)
ds_train = ds_train.remove_columns('__index_level_0__')
ds_train

Dataset({
    features: ['text', 'label'],
    num_rows: 100
})

In [58]:
ds_test = datasets.Dataset.from_pandas(bert_pd_test)
ds_test

Dataset({
    features: ['text', 'label'],
    num_rows: 25
})

In [74]:
ds_train['text']

['Utilizing gain-offunction assays, we demonstrate that enhancing actin polymer-\\nthese proteins does not increase SV density (Scheiffele et al., 2000; Bamji et al., 2003; Sara et al., 2005; Latefi et al., 2009; Linhoff et al., 2009).',
 'maticity, refractoriness and conduction of the heart are conserved between zebrafish and humans [44, 46].',
 'For reovirus 1/L-induced ARDS, at day 9 postinoculation, there is a severe pneumonia (peribronchiolar lesions with lymphocytic infiltration) with the presence of hyaline membranes, which are pathognomonic for human ARDS (20, 21).',
 'Besides that, no ameliorate impacts have been obtained on the use of antioxidants in some studies (12,13).',
 'However, limited information is available about molecular mechanisms of HM tolerance and detoxification in trees, and most of HM related genes in woody plants are restricted to Populus (Luo et al., 2016).',
 'A recent qualitative study explored the views of individuals living with SCI towards the descrip

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def preprocess_function(examples):
   return tokenizer(examples["text"], truncation=True, padding=True)

"""tokenized_train = ds_train.map(preprocess_function, batched=True,
                               #remove_columns=ds_train.column_names
                               )
tokenized_test = ds_test.map(preprocess_function, batched=True,
                             #remove_columns=ds_test.column_names
                             )"""
tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [83]:
tokenized_train

Dataset({
    features: ['attention_mask', 'input_ids', 'label', 'text'],
    num_rows: 3000
})

In [6]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

In [7]:
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")

   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}


In [11]:
training_args = TrainingArguments(
   output_dir="bert_test",
   learning_rate=2e-5,
   per_device_train_batch_size=8,
   per_device_eval_batch_size=8,
   num_train_epochs=5,
   weight_decay=0.01,
   save_strategy="epoch",
   gradient_accumulation_steps=2,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [12]:
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [13]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 935
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 7.76 GiB total capacity; 5.50 GiB already allocated; 19.75 MiB free; 5.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF