In [1]:
!pip install datasets
!pip install seqeval
!pip install transformers
%pip install huggingface_hub
!pip install git-lfs
!sudo apt-get install git-lfs

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[?25l[K     |█                               | 10 kB 36.5 MB/s eta 0:00:01[K     |██                              | 20 kB 43.0 MB/s eta 0:00:01[K     |███                             | 30 kB 44.1 MB/s eta 0:00:01[K     |████                            | 40 kB 31.5 MB/s eta 0:00:01[K     |█████                           | 51 kB 30.7 MB/s eta 0:00:01[K     |██████                          | 61 kB 34.6 MB/s eta 0:00:01[K     |███████                         | 71 kB 27.2 MB/s eta 0:00:01[K     |████████                        | 81 kB 28.1 MB/s eta 0:00:01[K     |█████████                       | 92 kB 30.2 MB/s eta 0:00:01[K     |██████████                      | 102 kB 32.1 MB/s eta 0:00:01[K     |███████████                     | 112 kB 32.1 MB/s eta 0:00:01[K     |████████████                    | 122 kB 32.1 MB/s eta 0:00:01[K     |█████████████                   | 133 kB 32.1 MB/s eta

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import re
from datasets import load_dataset
from datasets import Dataset
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, LongformerForTokenClassification, LongformerTokenizerFast
from transformers import DataCollatorForTokenClassification
from datasets import load_metric

In [4]:
# raw_datasets  = load_dataset("conll2003")
model_checkpoint = "bert-base-cased"
longformer_checkpoint = "allenai/longformer-base-4096"
gpt_checkpoint = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
# tokenizer = AutoTokenizer.from_pretrained(longformer_checkpoint, add_prefix_space=True)
# tokenizer = AutoTokenizer.from_pretrained(gpt_checkpoint, add_prefix_space=True)
df_errorIds = pd.read_csv("/content/drive/MyDrive/bigError.csv")
df_bertErrorIds = pd.read_csv("/content/drive/MyDrive/bertError.csv")
df_errorIds = df_errorIds.drop(['Unnamed: 0'], axis = 1)
df_bertErrorIds = df_bertErrorIds.drop(['Unnamed: 0'], axis = 1)
metric = load_metric("seqeval")

tokenDict = {
    "Lead" : 0,
    "Position" : 1,
    "Evidence" : 2,
    "Claim" : 3,
    "Concluding Statement" : 4,
    "Counterclaim" : 5,
    "Rebuttal": 6
}

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [5]:
# helper functions

def fileToArray(file_path):
    # turn text into array of words
    text_ds = tf.data.TextLineDataset(file_path).filter(lambda x: tf.cast(tf.strings.length(x), bool))
    text_ds = text_ds.enumerate()
    docTxt = []
    for i in text_ds.as_numpy_iterator():
        line = i[1].decode().split()
        for l in line:
            docTxt.append(l)
    
    docTxt = ' '.join(docTxt)
    return docTxt

def calc_word_indices(full_text, discourse_start, discourse_end):
    start_index = len(full_text[:discourse_start].split())
    token_len = len(full_text[discourse_start:discourse_end].split())
    output = list(range(start_index, start_index + token_len))
    if output[-1] >= len(full_text.split()):
        output = list(range(start_index, start_index + token_len-1))
    return output


def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    try:
        for word_id in word_ids:
            if word_id != current_word:
                # Start of a new word!
                current_word = word_id
                label = -100 if word_id is None else labels[word_id]
                new_labels.append(label)
            elif word_id is None:
                # Special token
                new_labels.append(-100)
            else:
                # Same word as previous token
                label = labels[word_id]
                # If the label is B-XXX we change it to I-XXX
                if label % 2 == 1:
                    label += 1
                new_labels.append(label)
    except:
        print("error")
        return []

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

def dfTags_to_ints(df):
    for index, row in df.iterrows():
        intArray = []
        tokenArray = []
        for i in row[2]:
            intArray.append(int(i))
        df.at[index, "ner_tags"] = intArray.copy()
        # for j in row[1]:
        #     if j != '.' and j != ',':
        #         tokenArray.append(j)
        # df.at[index, "tokens"] = tokenArray.copy()
    return df

# def csv_to_df(fileName):
#     return None

In [6]:
def createTokenTrainingSet(fileName):
    errorList = list(df_errorIds["0"])
    bert_errorList = list(df_bertErrorIds["0"])
    df = pd.read_csv(fileName, error_bad_lines=False)

    df['tokens'] = df['tokens'].apply(lambda a: a.split())
    df['ner_tags'] = df['ner_tags'].apply(lambda a: a.split())
    df = df.drop(['Unnamed: 0'], axis = 1)

    errorList_idx = []
    for e in errorList:
        try:
            errorList_idx.append(df.loc[df["id"] == e].index[0])
        except:
            continue
    
    for e in bert_errorList:
        try:
            errorList_idx.append(df.loc[df["id"] == e].index[0])
        except:
            continue
    df = df.drop(errorList_idx, 0)
    df = dfTags_to_ints(df)

    newDataset = Dataset.from_pandas(df)

    print(newDataset)
    tokenized_datasets = newDataset.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=newDataset.column_names
    )
    return tokenized_datasets

    

In [7]:
tokenTrain = createTokenTrainingSet("/content/drive/MyDrive/trainHugging4.csv")



  """Entry point for launching an IPython kernel.


Dataset({
    features: ['id', 'tokens', 'ner_tags', '__index_level_0__'],
    num_rows: 9235
})


  0%|          | 0/10 [00:00<?, ?ba/s]

In [8]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [9]:
tokenTrain = tokenTrain.train_test_split(train_size=0.9, test_size=0.1)
tokenTrain

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8311
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 924
    })
})

In [10]:
label_names = ["Lead", "Position", "Evidence", "Claim", "Concluding Statement", "Counterclaim", "Rebuttal"]
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
i2l = id2label
l2i = label2id

In [11]:
# code by ROB MULLA
# source: https://www.kaggle.com/robikscube/student-writing-competition-twitch#Competition-Metric-Code

def calc_overlap(row):
    """
    Calculates the overlap between prediction and
    ground truth and overlap percentages used for determining
    true positives.
    """
    set_pred = set(row.predictionstring_pred.split(" "))
    set_gt = set(row.predictionstring_gt.split(" "))
    # Length of each and intersection
    len_gt = len(set_gt)
    len_pred = len(set_pred)
    inter = len(set_gt.intersection(set_pred))
    overlap_1 = inter / len_gt
    overlap_2 = inter / len_pred
    return [overlap_1, overlap_2]


def score_feedback_comp_micro(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation
    """
    gt_df = (
        gt_df[["id", "discourse_type", "predictionstring"]]
        .reset_index(drop=True)
        .copy()
    )
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    pred_df["pred_id"] = pred_df.index
    gt_df["gt_id"] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(
        gt_df,
        left_on=["id", "class"],
        right_on=["id", "discourse_type"],
        how="outer",
        suffixes=("_pred", "_gt"),
    )
    joined["predictionstring_gt"] = joined["predictionstring_gt"].fillna(" ")
    joined["predictionstring_pred"] = joined["predictionstring_pred"].fillna(" ")

    joined["overlaps"] = joined.apply(calc_overlap, axis=1)

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    joined["overlap1"] = joined["overlaps"].apply(lambda x: eval(str(x))[0])
    joined["overlap2"] = joined["overlaps"].apply(lambda x: eval(str(x))[1])

    joined["potential_TP"] = (joined["overlap1"] >= 0.5) & (joined["overlap2"] >= 0.5)
    joined["max_overlap"] = joined[["overlap1", "overlap2"]].max(axis=1)
    tp_pred_ids = (
        joined.query("potential_TP")
        .sort_values("max_overlap", ascending=False)
        .groupby(["id", "predictionstring_gt"])
        .first()["pred_id"]
        .values
    )

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    fp_pred_ids = [p for p in joined["pred_id"].unique() if p not in tp_pred_ids]

    matched_gt_ids = joined.query("potential_TP")["gt_id"].unique()
    unmatched_gt_ids = [c for c in joined["gt_id"].unique() if c not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    # calc microf1
    my_f1_score = TP / (TP + 0.5 * (FP + FN))
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    return precision, recall, my_f1_score


def score_feedback_comp(pred_df, gt_df, return_class_scores=False):
    class_scores = {}
    precision_scores = {}
    recall_scores = {}
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    for discourse_type, gt_subset in gt_df.groupby("discourse_type"):
        pred_subset = (
            pred_df.loc[pred_df["class"] == discourse_type]
            .reset_index(drop=True)
            .copy()
        )
        p_score, r_score, class_score = score_feedback_comp_micro(pred_subset, gt_subset)
        precision_scores[discourse_type] = p_score
        recall_scores[discourse_type] = r_score
        class_scores[discourse_type] = class_score
    precision = np.mean([v for v in precision_scores.values()])
    recall = np.mean([v for v in recall_scores.values()])
    f1 = np.mean([v for v in class_scores.values()])
    if return_class_scores:
        return f1, class_scores
    return precision, recall, f1

In [12]:
def intList_to_string(intList):
    pString = ""
    for i in intList:
        pString = pString + str(i) + " "
    return pString

def myEval(true_labels, true_predictions):

    #run evaluation
    id_list = []
    class_list = []
    string_list = []
    for index, l in enumerate(true_predictions):
        pred_dict = []
        curr_type = l[0]
        seqArray = []   
        for idx, i in enumerate(l):
            if i == curr_type:
                seqArray.append(idx)
            else:
                seqArray.append(idx)
                pred_dict.append([curr_type, seqArray])
                curr_type = i
                seqArray = []
        
        if len(seqArray) > 1:
            pred_dict.append([curr_type, seqArray])

        for i in pred_dict:
            id_list.append(index)
            class_list.append(i[0])
            string_list.append(intList_to_string(i[1]))

    p_dict = {'id': id_list,
            'class': class_list,
            'predictionstring':string_list}
    results_df = pd.DataFrame(p_dict)

    id_list = []
    class_list = []
    string_list = []
    for index, l in enumerate(true_labels):
        label_dict = []
        curr_type = l[0]
        seqArray = []   
        for idx, i in enumerate(l):
            if i == curr_type:
                seqArray.append(idx)
            else:
                seqArray.append(idx)
                label_dict.append([curr_type, seqArray])
                curr_type = i
                seqArray = []
                
        for i in label_dict:
            id_list.append(index)
            class_list.append(i[0])
            string_list.append(intList_to_string(i[1]))

    g_dict = {'id': id_list,
            'discourse_type': class_list,
            'predictionstring':string_list}
    gt_df = pd.DataFrame(g_dict)
    precision, recall, f1_score = score_feedback_comp(results_df, gt_df, return_class_scores=False)
    return precision, recall, f1_score

    

In [13]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    precision, recall, f1_score = myEval(true_labels, true_predictions)
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": f1_score,
        "accuracy": all_metrics["overall_accuracy"],
    }

In [14]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [15]:
# pushes the model to the huggingFace library
# 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
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
from transformers import TrainingArguments

# TRAINING HYPERPARAMS
BS = 1
GRAD_ACC = 8
LR = 5e-5
WD = 0.01
WARMUP = 0.1
N_EPOCHS = 1

os.environ["WANDB_DISABLED"] = "true"
args = TrainingArguments(
    "bert-test",
    evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=LR,
    per_device_train_batch_size=BS,
    per_device_eval_batch_size=BS,
    num_train_epochs=N_EPOCHS,
    weight_decay=WD,
    gradient_accumulation_steps=GRAD_ACC,
    warmup_ratio=WARMUP,
    push_to_hub=False
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [17]:
# use this dataset to run the model faster (for testing purposes)
smallToken = tokenTrain["train"][:10]
smallToken = Dataset.from_dict(smallToken)
smallToken = smallToken.train_test_split(train_size=0.9, test_size=0.1)
smallToken

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 9
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
})

In [18]:
#for gpt-2 training
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [19]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=smallToken["train"],
    eval_dataset=smallToken["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 9
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
0,2.0394,2.004087,0.164583,0.34375,0.197186,0.197186


***** Running Evaluation *****
  Num examples = 1
  Batch size = 1
Saving model checkpoint to bert-test/checkpoint-1
Configuration saved in bert-test/checkpoint-1/config.json
Model weights saved in bert-test/checkpoint-1/pytorch_model.bin
tokenizer config file saved in bert-test/checkpoint-1/tokenizer_config.json
Special tokens file saved in bert-test/checkpoint-1/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1, training_loss=2.039372444152832, metrics={'train_runtime': 4.9319, 'train_samples_per_second': 1.825, 'train_steps_per_second': 0.203, 'total_flos': 1365745565448.0, 'train_loss': 2.039372444152832, 'epoch': 0.89})

In [20]:
# pushes model to huggingface library after training

# trainer.push_to_hub(commit_message="Training complete")