<a href="https://colab.research.google.com/github/ChrisBagdon/Citation_Classification/blob/main/bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install/import libraries
#!pip install transformers
#!pip install datasets
import transformers
import pandas as pd
import datasets

# Setting up the BERT models

In [None]:
# Import pretrained distilbert tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Use ClassLabel object as labels
from datasets.features.features import ClassLabel
labels = ClassLabel(num_classes=3, names=['background', 'result', 'method'])

In [None]:
### Preprocess function
# Required to tokenize and batch data
def preprocess_function(batch):
    """Tokenize a batch of examples and attach integer class ids.

    Reads ``batch['text']`` and ``batch['label']``. The texts go through the
    module-level ``tokenizer`` (padding on, truncation on, max_length=128) and
    the label names are converted with the module-level ``labels.str2int``.
    Returns the tokenizer output with an extra ``'label'`` entry.
    """
    encoded = tokenizer(
        batch['text'],
        padding=True,
        truncation=True,
        max_length=128,
    )
    encoded['label'] = labels.str2int(batch['label'])
    return encoded

In [None]:
# Form batches with padding
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Evaluation function
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

### Compute metrics
# Returns argmax predictions based on given predictions
def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, references)``. The predicted class
    for each row is the index of its largest logit along the last axis.
    Returns whatever ``metric.compute`` returns.
    """
    raw_scores, references = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

# Applying the RoBERTa model to citation texts only

In [None]:
# Read in datasets (CSVs)
data_path = "scicite/csv/"
data_text = datasets.load_dataset('csv', data_files={'train':data_path+'train.csv', 'test':data_path+'dev.csv'})
data_text['train'][1]

In [None]:
# Tokenize and batch data
tokenized_data_text = data_text.map(preprocess_function, batched=True)

In [None]:
# Instantiate RoBERTa model
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

text_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

In [None]:
# Training arguments for text model
# Checkpoints are written beneath output_dir; evaluation runs once per epoch.
training_args_text = TrainingArguments(
    output_dir="./results-Roberta",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    remove_unused_columns=True,
    evaluation_strategy="epoch"
    )

# Instantiate Trainer for text model
# NOTE(review): `tokenizer` and `data_collator` were created from
# "distilbert-base-uncased" while `text_model` is loaded from "roberta-base";
# confirm this tokenizer/model pairing is intended.
trainer_text = Trainer(
    model=text_model,
    args=training_args_text,
    train_dataset=tokenized_data_text["train"],
    eval_dataset=tokenized_data_text["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,)

In [None]:
trainer_text.train()

In [None]:
# Instantiate classifier from a checkpoint saved by trainer_text.
# The checkpoint directory sits beneath the trainer's output_dir
# ("./results-Roberta"), so the model path has to start there.
from transformers import pipeline
classifier = pipeline(task='text-classification', model="./results-Roberta/checkpoint-2500", tokenizer=tokenizer)

In [None]:
import csv

# Read the dev split into parallel lists: X_dev_text takes column 2 of each
# row and Y_dev_text takes column 3 (presumably citation text and gold label;
# verify against the TSV layout).
# newline='' hands line-ending handling to the csv module, which its
# documentation requires for file objects passed to csv.reader.
with open('scicite/tsv/dev.tsv', newline='') as dev_file_text:
    dev_data_text = csv.reader(dev_file_text, delimiter="\t")
    X_dev_text, Y_dev_text = [], []
    for row in dev_data_text:
        X_dev_text.append(row[2])
        Y_dev_text.append(row[3])

In [None]:
predictions = classifier(X_dev_text)

In [None]:
# Translate the pipeline's generic LABEL_<id> strings into class names.
# The mapping gets its own name on purpose: `labels` is the ClassLabel object
# that the preprocess functions call .str2int on, and rebinding it to a dict
# would break every later tokenization cell.
pipeline_label_names = {'LABEL_0':'background', 'LABEL_1':'result', 'LABEL_2':'method'}
preds = [pipeline_label_names[x['label']] for x in predictions]

In [None]:
import numpy as np
import pandas as pd
def evaluate(predictions, gold_standard):
    """Build a confusion matrix and per-label precision / recall / F1.

    Args:
        predictions: Sequence of predicted labels.
        gold_standard: Sequence of reference labels, aligned item by item
            with ``predictions``.

    Returns:
        A ``(confusion_matrix, scores)`` tuple. ``confusion_matrix`` is a
        square float array indexed ``[predicted][gold]`` with the labels in
        sorted order. ``scores`` is a DataFrame with the columns
        ``Precision``, ``Recall`` and ``F1``: one row per label followed by
        an ``'overall'`` row that holds the accuracy in all three columns.
        A ratio whose denominator is zero is reported as 0.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    predictions = list(predictions)
    gold_standard = list(gold_standard)
    if len(predictions) != len(gold_standard):
        raise ValueError(
            "predictions and gold_standard must have the same length "
            "({} != {})".format(len(predictions), len(gold_standard))
        )

    # Sort the labels so the matrix layout is the same on every run; plain
    # set iteration order for strings is not stable between interpreters.
    label_names = sorted(set(predictions) | set(gold_standard), key=str)
    index_of = {label: i for i, label in enumerate(label_names)}

    size = len(label_names)
    confusion_matrix = np.zeros((size, size))
    for pred, gold in zip(predictions, gold_standard):
        confusion_matrix[index_of[pred]][index_of[gold]] += 1

    true_positives = np.diag(confusion_matrix)
    # Rows are predictions, so a row sum counts how often a label was
    # predicted; a column sum counts how often it occurs in the gold data.
    predicted_totals = confusion_matrix.sum(axis=1)
    gold_totals = confusion_matrix.sum(axis=0)

    precision = np.divide(
        true_positives,
        predicted_totals,
        out=np.zeros_like(true_positives),
        where=predicted_totals != 0,
    )
    recall = np.divide(
        true_positives,
        gold_totals,
        out=np.zeros_like(true_positives),
        where=gold_totals != 0,
    )
    pr_sum = precision + recall
    f1 = np.divide(
        2 * precision * recall,
        pr_sum,
        out=np.zeros_like(pr_sum),
        where=pr_sum != 0,
    )

    total = confusion_matrix.sum()
    accuracy = true_positives.sum() / total if total else 0.0

    scores = pd.DataFrame(
        {
            'Precision': np.append(precision, accuracy),
            'Recall': np.append(recall, accuracy),
            'F1': np.append(f1, accuracy),
        },
        index=label_names + ['overall'],
    )
    return (confusion_matrix, scores)

In [None]:
# Score the text-model predictions against the gold labels read from dev.tsv
# (Y_dev_text); no variable named Y_dev exists at this point in the notebook.
cf, scores = evaluate(preds, Y_dev_text)

In [None]:
print(scores)
print(cf)

# Applying BERT model to citation data with sentiment labels added

In [None]:
## Append the sentiment label to each citation text (train and dev sets)
data_path = "scicite/sentiment_csv/"

# Training set: text becomes "<text> [SEP] <sentiment>", then is written out
df_sentiment_train = pd.read_csv(data_path + 'train_sent.csv')
train_text = df_sentiment_train['text']
train_sentiment = df_sentiment_train['sentiment']
df_sentiment_train['text'] = train_text + ' [SEP] ' + train_sentiment
df_sentiment_train.to_csv(data_path + 'train_sent_bert.csv')

# Dev set: same transformation
df_sentiment_dev = pd.read_csv(data_path + 'dev_sent.csv')
dev_text = df_sentiment_dev['text']
dev_sentiment = df_sentiment_dev['sentiment']
df_sentiment_dev['text'] = dev_text + ' [SEP] ' + dev_sentiment
df_sentiment_dev.to_csv(data_path + 'dev_sent_bert.csv')

In [None]:
# Read in datasets (CSVs)
data_sentiment = datasets.load_dataset('csv', data_files={'train':data_path+'train_sent_bert.csv', 'test':data_path+'dev_sent_bert.csv'})
data_sentiment = data_sentiment.remove_columns(['Unnamed: 0', 'Unnamed: 0.1', 'ID', 'explicit', 'sentiment'])
data_sentiment['train'][1]

In [None]:
# Sentiment model's tokenizer
tokenizer_sentiment = AutoTokenizer.from_pretrained("distilbert-base-uncased", sep_token='[SEP]')

### Preprocess function for sentiment model
# Required to tokenize and batch data
def preprocess_function_sentiment(batch):
    """Tokenize a batch for the sentiment-augmented model and add class ids.

    Reads ``batch['text']`` and ``batch['label']``. The texts go through
    ``tokenizer_sentiment`` (padding on, truncation on, max_length=128) and
    the label names are converted with the module-level ``labels.str2int``.
    Returns the tokenizer output with an extra ``'label'`` entry.
    """
    encoded = tokenizer_sentiment(
        batch['text'],
        padding=True,
        truncation=True,
        max_length=128,
    )
    encoded['label'] = labels.str2int(batch['label'])
    return encoded

In [None]:
# Tokenize and batch data
tokenized_data_sentiment = data_sentiment.map(preprocess_function_sentiment, batched=True)

In [None]:
# Instantiate RoBERTa model
sentiment_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

In [None]:
# Training arguments for sentiment model
# Checkpoints are written beneath output_dir; evaluation runs once per epoch.
training_args_sentiment = TrainingArguments(
    output_dir="./results-Roberta-sentiment",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    remove_unused_columns=True,
    evaluation_strategy="epoch"
    )

# Instantiate Trainer for sentiment model
# NOTE(review): this passes `tokenizer` and `data_collator` from the text-model
# setup rather than `tokenizer_sentiment`, which tokenized the data — confirm
# that is intended.
trainer_sentiment = Trainer(
    model=sentiment_model,
    args=training_args_sentiment,
    train_dataset=tokenized_data_sentiment["train"],
    eval_dataset=tokenized_data_sentiment["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,)

In [None]:
trainer_sentiment.train()

In [None]:
# Instantiate classifier for sentiment model
sentiment_classifier = pipeline(task='text-classification', model="./results-Roberta-sentiment/checkpoint-2500", tokenizer=tokenizer_sentiment)

In [None]:
# Import dev dataset
# The file is produced by DataFrame.to_csv (UTF-8 by default), so it is read
# back as UTF-8 explicitly; newline='' hands line-ending handling to the csv
# module, which its documentation requires.
with open('scicite/sentiment_csv/dev_sent_bert.csv', newline='', encoding='utf-8') as dev_file_sentiment:
    dev_data_sentiment = csv.reader(dev_file_sentiment, delimiter=",")
    next(dev_data_sentiment)  # skip the header row
    X_dev_sentiment, Y_dev_sentiment = [], []
    for row in dev_data_sentiment:
        # Columns 4 and 5 are the model input and gold label in this file's
        # layout (NOTE(review): positions depend on the index columns written
        # by to_csv — verify if the CSV is regenerated).
        X_dev_sentiment.append(row[4])
        Y_dev_sentiment.append(row[5])

In [None]:
print(X_dev_sentiment[:5], Y_dev_sentiment[:5])

In [None]:
predictions_sentiment = sentiment_classifier(X_dev_sentiment)

In [None]:
# Translate the pipeline's generic LABEL_<id> strings into class names.
# The mapping gets its own name on purpose: `labels` is the ClassLabel object
# that the preprocess functions call .str2int on, and rebinding it to a dict
# would break every later tokenization cell.
pipeline_label_names = {'LABEL_0':'background', 'LABEL_1':'result', 'LABEL_2':'method'}
preds_sentiment = [pipeline_label_names[x['label']] for x in predictions_sentiment]

In [None]:
cf_sentiment, scores_sentiment = evaluate(preds_sentiment, Y_dev_sentiment)

In [None]:
print(scores_sentiment)
print(cf_sentiment)

# Training RoBERTa model with shuffled sentiment data

In [None]:
## Append the shuffled sentiment label to each citation text (train and dev sets)
data_path = "scicite/sentiment_csv/"

# Training set: text becomes "<text> [SEP] <sentiment>", then is written out
df_sentiment_train_shuffled = pd.read_csv(data_path + 'train_sent_shuffled.csv')
train_text_shuffled = df_sentiment_train_shuffled['text']
train_sentiment_shuffled = df_sentiment_train_shuffled['sentiment']
df_sentiment_train_shuffled['text'] = train_text_shuffled + ' [SEP] ' + train_sentiment_shuffled
df_sentiment_train_shuffled.to_csv(data_path + 'train_sent_bert_shuffled.csv')

# Dev set: same transformation
df_sentiment_dev_shuffled = pd.read_csv(data_path + 'dev_sent_shuffled.csv')
dev_text_shuffled = df_sentiment_dev_shuffled['text']
dev_sentiment_shuffled = df_sentiment_dev_shuffled['sentiment']
df_sentiment_dev_shuffled['text'] = dev_text_shuffled + ' [SEP] ' + dev_sentiment_shuffled
df_sentiment_dev_shuffled.to_csv(data_path + 'dev_sent_bert_shuffled.csv')

In [None]:
# Read in datasets (CSVs)
data_sentiment_shuffled = datasets.load_dataset('csv', data_files={'train':data_path+'train_sent_bert_shuffled.csv', 
                                                                   'test':data_path+'dev_sent_bert_shuffled.csv'})
data_sentiment_shuffled = data_sentiment_shuffled.remove_columns(['Unnamed: 0', 'Unnamed: 0.1', 'ID', 'explicit', 'sentiment'])
data_sentiment_shuffled['train'][1]

In [None]:
# Shuffled sentiment model's tokenizer
tokenizer_sentiment_shuffled = AutoTokenizer.from_pretrained("distilbert-base-uncased", sep_token='[SEP]')

### Preprocess function for shuffled sentiment model
# Required to tokenize and batch data
def preprocess_function_sentiment_shuffled(batch):
    """Tokenize a batch for the shuffled-sentiment model and add class ids.

    Reads ``batch['text']`` and ``batch['label']``. The texts go through
    ``tokenizer_sentiment_shuffled`` (padding on, truncation on,
    max_length=128) and the label names are converted with the module-level
    ``labels.str2int``. Returns the tokenizer output with an extra
    ``'label'`` entry.
    """
    encoded = tokenizer_sentiment_shuffled(
        batch['text'],
        padding=True,
        truncation=True,
        max_length=128,
    )
    encoded['label'] = labels.str2int(batch['label'])
    return encoded

In [None]:
# Tokenize and batch data
tokenized_data_sentiment_shuffled = data_sentiment_shuffled.map(preprocess_function_sentiment_shuffled, batched=True)

In [None]:
# Instantiate RoBERTa model
shuffled_sentiment_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

In [None]:
# Training arguments for shuffled sentiment model
# Checkpoints are written beneath output_dir; evaluation runs once per epoch.
training_args_sentiment_shuffled = TrainingArguments(
    output_dir="./results-Roberta-sentiment-shuffled",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    remove_unused_columns=True,
    evaluation_strategy="epoch"
    )

# Instantiate Trainer for shuffled sentiment model
# NOTE(review): this passes `tokenizer` and `data_collator` from the text-model
# setup rather than `tokenizer_sentiment_shuffled`, which tokenized the data —
# confirm that is intended.
trainer_sentiment_shuffled = Trainer(
    model=shuffled_sentiment_model,
    args=training_args_sentiment_shuffled,
    train_dataset=tokenized_data_sentiment_shuffled["train"],
    eval_dataset=tokenized_data_sentiment_shuffled["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,)

In [None]:
trainer_sentiment_shuffled.train()

In [None]:
# Instantiate classifier for shuffled sentiment model
shuffled_sentiment_classifier = pipeline(task='text-classification', 
                                         model="./results-Roberta-sentiment-shuffled/checkpoint-2500", 
                                         tokenizer=tokenizer_sentiment_shuffled)

In [None]:
# Import dev dataset
# The file is produced by DataFrame.to_csv (UTF-8 by default), so it is read
# back as UTF-8 explicitly; newline='' hands line-ending handling to the csv
# module, which its documentation requires.
with open('scicite/sentiment_csv/dev_sent_bert_shuffled.csv', newline='', encoding='utf-8') as dev_file_sentiment_shuffled:
    dev_data_sentiment_shuffled = csv.reader(dev_file_sentiment_shuffled, delimiter=",")
    next(dev_data_sentiment_shuffled)  # skip the header row
    X_dev_sentiment_shuffled, Y_dev_sentiment_shuffled = [], []
    for row in dev_data_sentiment_shuffled:
        # Columns 5 and 6 are the model input and gold label in this file's
        # layout (NOTE(review): positions depend on the index columns written
        # by to_csv — verify if the CSV is regenerated).
        X_dev_sentiment_shuffled.append(row[5])
        Y_dev_sentiment_shuffled.append(row[6])

In [None]:
print(X_dev_sentiment_shuffled[:5], Y_dev_sentiment_shuffled[:5])

In [None]:
predictions_sentiment_shuffled = shuffled_sentiment_classifier(X_dev_sentiment_shuffled)

In [None]:
# Translate the pipeline's generic LABEL_<id> strings into class names.
# The mapping gets its own name so the ClassLabel object `labels`, which the
# preprocess functions call .str2int on, stays intact if a tokenization cell
# is re-run afterwards.
pipeline_label_names = {'LABEL_0':'background', 'LABEL_1':'result', 'LABEL_2':'method'}
preds_sentiment_shuffled = [pipeline_label_names[x['label']] for x in predictions_sentiment_shuffled]

In [None]:
cf_sentiment_shuffled, scores_sentiment_shuffled = evaluate(preds_sentiment_shuffled, Y_dev_sentiment_shuffled)

In [None]:
print(scores_sentiment_shuffled)
print(cf_sentiment_shuffled)

# Sentiment Classifier

In [None]:
import pandas as pd
import datasets

In [None]:
import gc
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda
gc.collect()
torch.cuda.empty_cache()

def free_gpu_cache():
    """Report GPU usage, release cached GPU memory, then report usage again.

    Takes no arguments and has no return statement; it works purely through
    side effects (printing and calls into torch / numba).
    """
    print("Initial GPU Usage")
    gpu_usage()

    # Ask PyTorch to release its cached GPU memory.
    torch.cuda.empty_cache()

    # Select device 0 through numba, close it, then select it again.
    # NOTE(review): presumably this resets the CUDA context on device 0 —
    # confirm torch objects created earlier remain usable afterwards.
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()

In [None]:
data = datasets.load_dataset('csv', data_files={'train':'sentimentAnnotations_CSV/train_anno_2.csv', 'test':'sentimentAnnotations_CSV/test_anno.csv'})

In [None]:
# Give the generic ColumnN headers meaningful names in both splits.
column_renames = {
    'Column1': 'ID',
    'Column2': 'exp',
    'Column3': 'text',
    'Column4': 'type',
    'Column5': 'label',
}
for split_name in ('train', 'test'):
    data[split_name] = data[split_name].rename_columns(column_renames)

In [None]:
data

In [None]:
# Drop the columns that are not used downstream, leaving text and label.
for split_name in ('train', 'test'):
    data[split_name] = data[split_name].remove_columns(['exp', 'ID', 'type'])

In [None]:
data['train'][1]

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
from datasets.features.features import ClassLabel
labels = ClassLabel(num_classes=3, names=['positive', 'negative', 'neutral'])

In [None]:
def preprocess_function(batch):
    """Tokenize a batch of sentiment examples and attach integer class ids.

    Reads ``batch['text']`` and ``batch['label']``. The texts go through the
    module-level ``tokenizer`` with ``padding='max_length'`` and truncation
    enabled; the label names are converted with the module-level
    ``labels.str2int``. Returns the tokenizer output with an extra
    ``'label'`` entry.
    """
    encoded = tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
    )
    encoded['label'] = labels.str2int(batch['label'])
    return encoded

In [None]:
tokenized_data = data.map(preprocess_function, batched=True)

In [None]:
tokenized_data

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import numpy as np

from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, references)``. The predicted class
    for each row is the index of its largest logit along the last axis.
    Returns whatever ``metric.compute`` returns.
    """
    raw_scores, references = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

In [None]:
# Training arguments for the sentiment classifier.
# Checkpoints are written beneath output_dir; evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir="sent-results-distilbert",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    #remove_unused_columns=True,
    evaluation_strategy="epoch"
    )

# Trainer for the sentiment classifier: trains on the "train" split and
# evaluates on the "test" split of tokenized_data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
from transformers import pipeline
classifier = pipeline(task='text-classification', model='sent-results-distilbert/checkpoint-3000/', tokenizer=tokenizer)

In [None]:
import csv

# Read the annotated test set into parallel lists: X_dev takes column 2 of
# each data row and Y_dev takes column 4.
# newline='' hands line-ending handling to the csv module, which its
# documentation requires for file objects passed to csv.reader.
with open('sentimentAnnotations_CSV/test_anno.csv', newline='') as dev_file:
    dev_data = csv.reader(dev_file)
    # Drop the header row up front; the None default keeps an empty file from
    # raising StopIteration.
    next(dev_data, None)
    X_dev, Y_dev = [], []
    for row in dev_data:
        X_dev.append(row[2])
        Y_dev.append(row[4])

In [None]:
predictions = classifier(X_dev)

In [None]:
# Translate the pipeline's generic LABEL_<id> strings into sentiment names.
# The mapping gets its own name so the ClassLabel object `labels`, which
# preprocess_function calls .str2int on, stays intact if the tokenization
# cell is re-run afterwards.
pipeline_label_names = {'LABEL_0':'positive', 'LABEL_1':'negative', 'LABEL_2':'neutral'}
preds = [pipeline_label_names[x['label']] for x in predictions]

In [None]:
Y_dev

In [None]:
import numpy as np
import pandas as pd
def evaluate(predictions, gold_standard):
    """Build a confusion matrix and per-label precision / recall / F1.

    Args:
        predictions: Sequence of predicted labels.
        gold_standard: Sequence of reference labels, aligned item by item
            with ``predictions``.

    Returns:
        A ``(confusion_matrix, scores)`` tuple. ``confusion_matrix`` is a
        square float array indexed ``[predicted][gold]`` with the labels in
        sorted order. ``scores`` is a DataFrame with the columns
        ``Precision``, ``Recall`` and ``F1``: one row per label followed by
        an ``'overall'`` row that holds the accuracy in all three columns.
        A ratio whose denominator is zero is reported as 0.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    predictions = list(predictions)
    gold_standard = list(gold_standard)
    if len(predictions) != len(gold_standard):
        raise ValueError(
            "predictions and gold_standard must have the same length "
            "({} != {})".format(len(predictions), len(gold_standard))
        )

    # Sort the labels so the matrix layout is the same on every run; plain
    # set iteration order for strings is not stable between interpreters.
    label_names = sorted(set(predictions) | set(gold_standard), key=str)
    index_of = {label: i for i, label in enumerate(label_names)}

    size = len(label_names)
    confusion_matrix = np.zeros((size, size))
    for pred, gold in zip(predictions, gold_standard):
        confusion_matrix[index_of[pred]][index_of[gold]] += 1

    true_positives = np.diag(confusion_matrix)
    # Rows are predictions, so a row sum counts how often a label was
    # predicted; a column sum counts how often it occurs in the gold data.
    predicted_totals = confusion_matrix.sum(axis=1)
    gold_totals = confusion_matrix.sum(axis=0)

    precision = np.divide(
        true_positives,
        predicted_totals,
        out=np.zeros_like(true_positives),
        where=predicted_totals != 0,
    )
    recall = np.divide(
        true_positives,
        gold_totals,
        out=np.zeros_like(true_positives),
        where=gold_totals != 0,
    )
    pr_sum = precision + recall
    f1 = np.divide(
        2 * precision * recall,
        pr_sum,
        out=np.zeros_like(pr_sum),
        where=pr_sum != 0,
    )

    total = confusion_matrix.sum()
    accuracy = true_positives.sum() / total if total else 0.0

    scores = pd.DataFrame(
        {
            'Precision': np.append(precision, accuracy),
            'Recall': np.append(recall, accuracy),
            'F1': np.append(f1, accuracy),
        },
        index=label_names + ['overall'],
    )
    return (confusion_matrix, scores)

In [None]:
cf, scores = evaluate(preds, Y_dev)

In [None]:
print(scores)
print(cf)

In [None]:
data = datasets.load_dataset('csv', data_files={'train':'sentimentAnnotations_CSV/train_anno_2.csv'})

In [None]:
data["train"].num_rows

In [None]:
# Load the five saved score tables (presumably one per cross-validation fold,
# going by the file names — verify against the script that wrote them).
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced locally by trusted code.
cross_0 = pd.read_pickle("cross_val_scores_0.pkl")
cross_1 = pd.read_pickle("cross_val_scores_1.pkl")
cross_2 = pd.read_pickle("cross_val_scores_2.pkl")
cross_3 = pd.read_pickle("cross_val_scores_3.pkl")
cross_4 = pd.read_pickle("cross_val_scores_4.pkl")

In [None]:
df_concat = pd.concat([cross_1,cross_0,cross_2,cross_3,cross_4])

In [None]:
by_row_index = df_concat.groupby(df_concat.index)
df_means = by_row_index.mean()

In [None]:
df_means.head()

In [None]:
cross_4

In [None]:
cross_4

In [None]:
print(df_means)