# BERT Training and Evaluation
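Keep in mind that this code is intended to be run with CUDA (a GPU): `use_cuda` and `fp16` are enabled only when `torch.cuda.is_available()` returns `True`.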

# Setup

In [1]:
#!pip install pandas
# !pip install simpletransformers
#!pip install -U scikit-learn scipy matplotlib
# !pip install torchvision 
# !pip install setuptools==59.5.0
# !pip install ipywidgets
# !pip install --upgrade torch torchvision

In [1]:
import torch
# Sanity check: ask PyTorch whether a CUDA device is usable in this
# environment. As the last expression of the cell, the boolean is displayed
# as the cell output.
torch.cuda.is_available()

False

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel
import os
import gc
import torch
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
import warnings

warnings.filterwarnings('ignore')

def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score of ``preds`` against ``labels``.

    Thin wrapper around ``sklearn.metrics.f1_score`` that fixes
    ``average='micro'`` so it can be handed to ``eval_model`` as a
    two-argument metric callable.

    Args:
        labels: Ground-truth class labels.
        preds: Predicted class labels, aligned with ``labels``.

    Returns:
        The value returned by ``f1_score`` for micro averaging.
    """
    micro_f1 = f1_score(labels, preds, average='micro')
    return micro_f1

# Hugging Face tokenizers read this variable; "false" switches off their
# internal parallelism. It is set before any model/tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Read the data from CSV file
data = pd.read_csv('data.csv')

# Select the relevant columns. An explicit copy is taken so that adding the
# 'label' column below writes to an independent frame instead of performing a
# chained assignment on a subset of `data` (SettingWithCopyWarning).
datacheck = data[['Author Keywords', 'Decision']].copy()

# Define the label list; the position in this list is the numeric class id.
class_list = ['Included', 'Rejected', 'Not Sure']

# Convert labels to numerical values.
# NOTE(review): any 'Decision' value that is not in class_list maps to NaN
# here — confirm the CSV only contains these three decisions.
label_map = {label: i for i, label in enumerate(class_list)}
datacheck['label'] = datacheck['Decision'].map(label_map)

# Split datacheck into consecutive batches of `entries_per_dataframe` rows.
# The number of batches is derived from the data instead of being hard-coded,
# so there are never empty batches and the final batch simply holds whatever
# rows remain (fewer than `entries_per_dataframe` when the total is not an
# exact multiple).
total_entries = len(datacheck)
entries_per_dataframe = 500

dataframes = [
    datacheck.iloc[start_index:start_index + entries_per_dataframe, :]
    for start_index in range(0, total_entries, entries_per_dataframe)
]
num_dataframes = len(dataframes)

# Use the GPU only when PyTorch reports one as available.
use_cuda = torch.cuda.is_available()

# Build a single BERT ("bert-base-uncased") classifier with one output label
# per entry in class_list.
model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "fp16": use_cuda,  # half precision is requested only when CUDA is in use
    "num_train_epochs": 1,  # epochs per train_model() call; raise for longer runs
    "use_multiprocessing": False,  # keep False if multiprocessing causes issues
    "learning_rate": 1e-5,  # adjust the learning rate if needed
}

model = ClassificationModel(
    "bert",
    "bert-base-uncased",
    num_labels=len(class_list),
    use_cuda=use_cuda,
    args=model_args,
)
    
def _plot_epoch_metrics(column, mcc_scores, f1_scores, acc_scores, eval_losses):
    """Show a 2x2 figure of the evaluation metrics collected so far.

    MCC, F1 and accuracy are drawn as bar charts and the evaluation loss as a
    line plot. The x axis is the 1-based epoch number; every list is expected
    to hold one value per completed epoch.
    """
    completed = len(mcc_scores)
    epochs = list(range(1, completed + 1))
    x_ticks = np.arange(1, completed + 1)

    # (y-axis label, title, values, draw as bars?)
    panels = [
        ('MCC', f"MCC Scores for {column}", mcc_scores, True),
        ('F1', f"F1 Scores for {column}", f1_scores, True),
        ('Accuracy', f"Accuracy Scores for {column}", acc_scores, True),
        ('Eval Loss', f"Evaluation Loss for {column}", eval_losses, False),
    ]

    plt.figure(figsize=(12, 6))
    for position, (y_label, title, values, as_bars) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        if as_bars:
            plt.bar(x_ticks, values, align='center')
        else:
            plt.plot(x_ticks, values, marker='o')
        plt.xticks(x_ticks, epochs)
        # The x axis is the epoch index (it was previously mislabelled 'Batch').
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.title(title)

    plt.tight_layout()
    plt.show()


models = {}
evaluation_scores = {}

# Take a smaller sample of the data (at most 500 entries) and split it into
# training and testing sets. Both calls use a fixed random_state, so they
# return the same rows every time; they are therefore done once here rather
# than being recomputed in every epoch. Capping n at the dataset size avoids
# the ValueError that DataFrame.sample raises when n exceeds the row count.
sample_size = min(500, len(datacheck))
datacheck_sampled = datacheck.sample(n=sample_size, random_state=42)
train_df, test_df = train_test_split(datacheck_sampled, test_size=0.3, shuffle=True, random_state=42)

for column in ['Author Keywords']:
    # Per-epoch evaluation history for this feature column.
    mcc_scores = []
    f1_scores = []
    acc_scores = []
    eval_losses = []

    # Two-column frames (text feature, numeric label) passed to the model.
    column_train_df = train_df[[column, 'label']]
    column_test_df = test_df[[column, 'label']]

    for epoch in range(10):
        # Train the model using the specific feature column
        model.train_model(column_train_df)

        # Evaluate the model at each epoch on the held-out rows
        result, model_outputs, wrong_predictions = model.eval_model(
            column_test_df, f1=f1_multiclass, acc=accuracy_score
        )
        mcc_scores.append(result['mcc'])
        f1_scores.append(result['f1'])
        acc_scores.append(result['acc'])
        eval_losses.append(result['eval_loss'])

        print(f"Results for {column} - Epoch {epoch + 1}:")
        print(result)
        print()

        # Plot the evaluation scores for all epochs completed so far
        _plot_epoch_metrics(column, mcc_scores, f1_scores, acc_scores, eval_losses)

    # Store the trained model and evaluation scores
    models[column] = model
    evaluation_scores[column] = {
        'mcc': mcc_scores,
        'f1': f1_scores,
        'acc': acc_scores,
        'eval_loss': eval_losses
    }

    torch.cuda.empty_cache()
    gc.collect()

# NOTE(review): the same `model` object is used for every column, so if more
# columns are added, later columns continue training from earlier weights.
# The name is unbound only after the loop: doing it inside the loop would
# raise NameError on a second column, and the object itself stays alive
# through the `models` dict either way.
del model


: 

: 