<a href="https://colab.research.google.com/github/rubenvangenugten/autobiographical_interview_scoring/blob/main/automated_internal_external_scoring_CV_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

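This notebook trains the sentence classifier using leave-one-dataset-out cross-validation: each study in turn is held out as the test set while the model is trained on all remaining studies.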

All code for data preparation, model training, evaluation, and related steps has been copied or adapted from the Hugging Face [documentation](https://huggingface.co/docs/transformers/index) and the accompanying example [notebooks](https://github.com/huggingface/notebooks/blob/master/transformers_doc/training.ipynb).


In [None]:
# Mount Google Drive inside the Colab runtime; the data and output paths used
# further down all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
pip install transformers==4.6.0

In [None]:
pip install datasets

In [None]:
# -*- coding: utf-8 -*-

import random
import pandas as pd
import os
import numpy as np
from collections import Counter
import tensorflow as tf
from datasets import load_metric
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
from sklearn.metrics import confusion_matrix

# Cap DataFrame previews at 20 rows.
pd.options.display.max_rows = 20

# Accuracy metric object, loaded once and shared by every compute_metrics call.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of model outputs with the module-level accuracy metric.

    Parameters
    ----------
    eval_pred : sequence
        Its first two items are taken as ``(logits, labels)``; any further
        items are ignored.

    Returns
    -------
    The result of ``metric.compute`` for the arg-max class of each row of
    ``logits`` compared against ``labels``.
    """
    logits, labels = eval_pred[:2]
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted_classes, references=labels)

def findClosest(num, collection):
    """Return the member of ``collection`` nearest to ``num``.

    Distance is the absolute difference. When several members are equally
    near, the one that comes first in ``collection`` is returned.
    """
    def distanceToNum(candidate):
        return abs(candidate - num)

    return min(collection, key=distanceToNum)

In [None]:
# Load the scored sentences.
# The file holds one sentence per row, which is the unit BERT is trained on.
TRAINING_CSV_PATH = '/content/drive/MyDrive/automated_internal_external_scoring_CV_example/synthetic_data_training_repeat.csv'

allDat = pd.read_csv(TRAINING_CSV_PATH)


In [None]:
# Drop rows that carry no label. `~` is the boolean NOT for a pandas mask
# (unary minus on a boolean Series is rejected by current numpy/pandas), and
# .copy() gives an independent frame so that the column assignments below do
# not write into a slice of the frame that was read from disk.
allDat = allDat[~allDat.Type.isnull()].copy()

# Proportion of each sentence's details that were scored as internal.
allDat["percentInt_sentence"] = allDat.numInt_sentence/allDat.numTotal_sentence

# Snap every proportion to the nearest of the four class anchors.
# NOTE(review): a sentence with numTotal_sentence == 0 yields NaN/inf here, and
# findClosest then returns the first anchor (0), so such rows end up in class 0.
# Confirm that this is the intended labelling for sentences without details.
percentageList = [0, .5, .75, 1]

allDat["closestPercentage"] = [findClosest(i, percentageList) for i in allDat.percentInt_sentence]


# The code below uses "Type" as the variable that holds the labels.
# The column previously contained 'external', 'internal' and 'mixed'; it is
# replaced by an integer class index, because the transformer expects integer
# labels: 0 -> 0% internal, 1 -> 50%, 2 -> 75%, 3 -> 100%.

conditions = [
    (allDat["closestPercentage"] == 0),
    (allDat["closestPercentage"] == .5),
    (allDat["closestPercentage"] == .75),
    (allDat["closestPercentage"] == 1)
]
choices = [0, 1, 2, 3]

allDat["Type"] = np.select(conditions, choices)

# Remove sentences that have more than 8 details, since that seems excessively
# long (e.g. a full narrative).

allDat = allDat[allDat.numTotal_sentence <= 8]


In [None]:
# runDatasetCV writes its checkpoints, logs, prediction CSVs and saved models
# to paths relative to the current working directory, so switch to the
# project folder on Drive before training.
PROJECT_DIR = '/content/drive/MyDrive/automated_internal_external_scoring_CV_example/'
os.chdir(PROJECT_DIR)

In [None]:
##### start the process of leave-one-dataset-out validation

def _upsampleToLargestClass(trainData, seed):
    """Return ``trainData`` with every ``Type`` class resampled (with replacement) up to the size of the largest class."""
    largestClassSize = trainData.Type.value_counts().max()

    extraRows = []
    for sentenceType in trainData.Type.unique():
        rowsThisType = trainData[trainData.Type == sentenceType]
        numToAdd = largestClassSize - rowsThisType.shape[0]
        if numToAdd > 0:
            extraRows.append(rowsThisType.sample(n=numToAdd,
                                                 replace=True,
                                                 random_state=seed,
                                                 axis=0))

    # DataFrame.append no longer exists in pandas 2.x; a single concat is the
    # supported equivalent and avoids re-copying the frame once per class.
    return pd.concat([trainData] + extraRows)


def _makeTfDataset(tokenizer, texts, labels):
    """Tokenize ``texts`` and pair the encodings with ``labels`` as a ``tf.data.Dataset``."""
    encodings = tokenizer(texts, truncation=True, padding=True)
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


def _addPredictedDetailCounts(testData, predictions):
    """Add ``predictions``, ``numInt_preds`` and ``numExt_preds`` columns to ``testData`` and return it."""
    # Share of a sentence's words counted as internal for class 0, 1, 2, 3.
    internalFractionByLabel = np.array([0.0, 0.5, 0.75, 1.0])

    predictions = np.asarray(predictions)
    internalFraction = internalFractionByLabel[predictions]
    wordCounts = testData['sentenceWordCount'].to_numpy(dtype=float)

    # Plain column assignment: assigning a 1-D array to a one-element column
    # list (df[['predictions']] = ...) is rejected by newer pandas.
    testData['predictions'] = predictions
    testData['numInt_preds'] = wordCounts * internalFraction
    testData['numExt_preds'] = wordCounts * (1.0 - internalFraction)
    return testData


def runDatasetCV(testDatasetName, seed=2021):
    """Train on every study except ``testDatasetName`` and predict on that study.

    Reads the module-level ``allDat`` frame. Rows whose ``study`` equals
    ``testDatasetName`` form the test set; all other rows, minus the
    'Description' and 'Opinion' tasks, form the training pool. The pool is
    upsampled so that all ``Type`` classes are equally frequent, split 80/20
    into training and validation sets, and used to fine-tune
    ``distilbert-base-uncased`` with four output labels.

    Side effects (all paths are relative to the current working directory):
      * trainer output in ``./results_CVTraining_<name>Testing`` and ``./logs``
      * ``testData_<name>_CVTraining.csv`` with the test rows plus the columns
        ``predictions``, ``numInt_preds`` and ``numExt_preds``
      * the fine-tuned model in ``bert_finetuned_oversampled/_CVTraining_<name>``

    Parameters
    ----------
    testDatasetName : str
        Value of the ``study`` column to hold out as the test set.
    seed : int, default 2021
        Seed for the upsampling draw and for the train/validation split, so
        that a re-run selects the same rows.

    Raises
    ------
    ValueError
        If no training rows remain once the held-out study and the excluded
        tasks have been removed.
    """
    # Split into training and test. Copy the slices so that the column
    # assignments further down modify these frames rather than views of allDat.
    trainData = allDat[allDat.study != testDatasetName].copy()
    testData = allDat[allDat.study == testDatasetName].copy()

    # Remove opinion and description tasks from training, since those have
    # different scoring procedures.
    trainData = trainData[~trainData.task.isin(['Description', 'Opinion'])]

    if trainData.empty:
        raise ValueError(
            'No training rows left after holding out study %r' % (testDatasetName,))

    # Upsample every class to the size of the most frequent one.
    # NOTE(review): upsampling happens before the train/validation split, so
    # copies of one sentence can land on both sides of that split and the
    # validation score may be optimistic. The held-out test study is unaffected.
    trainData = _upsampleToLargestClass(trainData, seed)

    # Make sure the sentences are strings; the test column is converted too,
    # because it is written out with the predictions below.
    trainData['sentence'] = trainData['sentence'].astype('str')
    testData['sentence'] = testData['sentence'].astype('str')

    all_train_texts = trainData['sentence'].tolist()
    all_train_labels = trainData['Type'].to_numpy()
    test_texts = testData['sentence'].tolist()
    test_labels = testData['Type'].to_numpy()

    # train_test_split shuffles before splitting, so the model does not see
    # the sentences in any particular order; random_state makes it repeatable.
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        all_train_texts, all_train_labels, test_size=.2, random_state=seed)

    # Load the tokenizer and encode the text into datasets BERT can work with.
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    train_dataset = _makeTfDataset(tokenizer, train_texts, train_labels)
    val_dataset = _makeTfDataset(tokenizer, val_texts, val_labels)
    test_dataset = _makeTfDataset(tokenizer, test_texts, test_labels)

    # Set up the model.
    model_outputDir = './results_CVTraining_' + testDatasetName + 'Testing'

    training_args = TFTrainingArguments(
        output_dir=model_outputDir,          # output directory
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
    )

    with training_args.strategy.scope():
        model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

    trainer = TFTrainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset             # validation dataset
    )

    # Train, then evaluate on the validation set.
    trainer.train()
    trainer.evaluate()

    # Predict the held-out study; the predicted class is the arg-max logit.
    preds_test = trainer.predict(test_dataset)
    logits, _ = preds_test[:2]
    predictions = np.argmax(logits, axis=-1)

    # Turn each predicted class into internal/external counts per sentence,
    # using the sentence word count as the total to distribute.
    testData = _addPredictedDetailCounts(testData, predictions)

    # Write out the predictions for the test set and save the model.
    testData_outputName = 'testData_' + testDatasetName + '_CVTraining.csv'
    testData.to_csv(testData_outputName)

    save_directory = os.getcwd() + '/bert_finetuned_oversampled/' + '_CVTraining_' + testDatasetName
    trainer.save_model(save_directory)
    

In [None]:
# Leave-one-dataset-out loop: every distinct study serves once as the
# held-out test set while the model is trained on the remaining studies.
allStudies = allDat.study.unique()

for studyName in allStudies:
    runDatasetCV(testDatasetName=studyName)