Adapted from: https://mccormickml.com/2019/07/22/BERT-fine-tuning/

<a href="https://colab.research.google.com/github/DerwenAI/spaCy_tuTorial/blob/master/BERT_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [27]:
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import spacy
from spacy.tokens import Span
from spacy.tokens import DocBin
from numpy import random
import spacy_transformers
np.random.seed(42) 

% matplotlib inline

UsageError: Line magic function `%` not found.


# Dataset 1: Argument detect

In [28]:
df = pd.read_csv("processed_data/dataset_arg_detect_wiki.csv")

# Create sentence and label lists
sentences = df.data.values
#Create the labels
labels = df.target.values

# fine tune the existing model
nlp = spacy.load('en_core_web_trf')
argument_with_labels = list(zip(sentences,labels))

#Shuffle and split the data
random.shuffle(argument_with_labels)
train_data = argument_with_labels[:int(len(argument_with_labels)*0.8)]
dev_data = argument_with_labels[int(len(argument_with_labels)*0.8):]

pd.DataFrame(train_data, columns = ['feature', 'target']).to_csv('processed_data/arg_detect_train.csv')
pd.DataFrame(dev_data, columns = ['feature', 'target']).to_csv('processed_data/arg_detect_test.csv')

ImportError: cannot import name 'FilePath' from 'pandas._typing' (/Users/danielbradley/opt/anaconda3/lib/python3.8/site-packages/pandas/_typing.py)

In [42]:
# User function for converting the train and test dataset into spaCy document

def document(data):
#Creating empty list called "text"  
  text = []
  for doc, label in nlp.pipe(data, as_tuples = True):
    if label == 1:
      doc.cats['LABEL'] = 1.0
      doc.cats['NOT_LABEL'] = 0.0
    else:
      doc.cats['LABEL'] = 0.0
      doc.cats['NOT_LABEL'] = 1.0
#Adding the doc into the list 'text'
    text.append(doc)
  return(text)

In [48]:
# Calculate the time for converting into binary document for train dataset

start_time = datetime.now()

#passing the train dataset into function 'document'
train_docs = document(train_data)

#Creating binary document using DocBin function in spaCy
doc_bin = DocBin(docs = train_docs)

#Saving the binary document as train.spacy
doc_bin.to_disk("processed_data/arg_detect/train.spacy")
end_time = datetime.now()

#Printing the time duration for train dataset
print('Duration: {}'.format(end_time - start_time))

Duration: 4:45:36.125913


In [50]:
# Calculate the time for converting into binary document for test dataset

start_time = datetime.now()

#passing the test dataset into function 'document'
test_docs = document(dev_data)
doc_bin = DocBin(docs = test_docs)
doc_bin.to_disk("processed_data/arg_detect/valid.spacy")
end_time = datetime.now()

#Printing the time duration for test dataset
print('Duration: {}'.format(end_time - start_time))

Duration: 0:32:28.519081


# Dataset 2: Argument quality

# Dataset 2: Claim stance

In [8]:
#3: Argument stance detection dataset
dataset_arg_stance = pd.read_csv('data/IBM_Debater_(R)_CS_EACL-2017.v1/claim_stance_dataset_v1.csv')
stance_df = dataset_arg_stance[['topicText','claims.stance','claims.claimCorrectedText']]

In [16]:
#Define the feature and target
stance_df['feature'] = stance_df['claims.claimCorrectedText']+' '+stance_df['topicText']
stance_df['target'] = stance_df['claims.stance'].apply(lambda x: 0 if x == 'CON' else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stance_df['feature'] = stance_df['claims.claimCorrectedText']+' '+stance_df['topicText']


In [24]:
preprocessed_stance_df = stance_df[['feature', 'target']]

In [25]:
preprocessed_stance_df

Unnamed: 0,feature,target
0,Exposure to violent video games causes at leas...,1
1,video game violence is not related to serious ...,0
2,some violent video games may actually have a p...,0
3,exposure to violent video games causes both sh...,1
4,Violent video games increase the violent tende...,1
...,...,...
2389,democracies have ever been found incompatible ...,0
2390,democracy cannot subsist long nor be carried f...,0
2391,Democracy in general is criticized for ignorin...,0
2392,democracy and freedom are indispensable ingred...,1


In [29]:
df = pd.read_csv("processed_data/dataset_arg_detect_wiki.csv")

# Create sentence and label lists
sentences = list(preprocessed_stance_df['feature'])
#Create the labels
labels = list(preprocessed_stance_df['target'])

#Zip them together for use as a tuple
argument_with_labels = list(zip(sentences,labels))

#Shuffle and split the data
random.shuffle(argument_with_labels)
train_data = argument_with_labels[:int(len(argument_with_labels)*0.8)]
dev_data = argument_with_labels[int(len(argument_with_labels)*0.8):]

In [40]:
#Ran this once to create test and train sets
# pd.DataFrame(train_data, columns = ['feature', 'target']).to_csv('processed_data/claim_stance/train.csv')
# pd.DataFrame(dev_data, columns = ['feature', 'target']).to_csv('processed_data/claim_stance/test.csv')

In [44]:
#Load the nlp model
nlp = spacy.load('en_core_web_trf')

In [45]:
# Calculate the time for converting into binary document for train dataset

start_time = datetime.now()

#passing the train dataset into function 'document'
train_docs = document(train_data)

#Creating binary document using DocBin function in spaCy
doc_bin = DocBin(docs = train_docs)

#Saving the binary document as train.spacy
doc_bin.to_disk("processed_data/claim_stance/train.spacy")
end_time = datetime.now()

#Printing the time duration for train dataset
print('Duration: {}'.format(end_time - start_time))

Duration: 0:07:07.360077


In [46]:
# Calculate the time for converting into binary document for test dataset

start_time = datetime.now()

#passing the test dataset into function 'document'
test_docs = document(dev_data)
doc_bin = DocBin(docs = test_docs)
doc_bin.to_disk("processed_data/claim_stance/valid.spacy")
end_time = datetime.now()

#Printing the time duration for test dataset
print('Duration: {}'.format(end_time - start_time))

Duration: 0:01:50.709221
