In [None]:
# Install a pinned transformers release (4.28.0) into the runtime.
# NOTE(review): a later cell runs `pip install transformers` without a pin — confirm which version is intended.
!pip install transformers==4.28.0

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the Hugging Face stack: transformers, datasets, and accelerate built from the GitHub main branch.
# NOTE(review): none of these is version-pinned, and transformers was already pinned to 4.28.0
# in the first cell — presumably pip reports it as already satisfied; confirm.
!pip install transformers
!pip install datasets
!pip install -U git+https://github.com/huggingface/accelerate.git

In [None]:
# nlpaug provides the word-level augmenter (naw.RandomWordAug) used by augment_minority_class_text.
!pip install nlpaug

In [None]:
# Import libraries

import numpy as np
import os
import pandas as pd
import random
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import initializers
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertModel, DistilBertConfig

In [None]:
# Load the cleaned table from Drive; it is filtered by its 'accession' column in the next cell.
df_main = pd.read_csv('/content/drive/MyDrive/NAEP_Comp/df_cleaned.csv')

In [None]:
# Keep only the rows for accession VH525628.
# .copy() gives df_1 its own data: the following cells assign to df_1['predict_from'],
# and doing that on a boolean-mask slice of df_main raises SettingWithCopyWarning and
# leaves it ambiguous whether the write lands on a view or a copy.
df_1 = df_main[df_main['accession'] == 'VH525628'].copy()

In [None]:
# `contractions` supplies contractions.fix, applied to the response text below and in train_model.
!pip install contractions

In [None]:
import contractions
# Replace missing responses with the literal string 'N/A' first so that every value
# passed to contractions.fix is a string, then run contractions.fix on each response.
df_1['predict_from'] = df_1['predict_from'].fillna('N/A')
df_1['predict_from'] = df_1['predict_from'].apply(contractions.fix)

In [None]:
# Display the processed response column for a visual check.
df_1['predict_from']

In [None]:
# Accession IDs to load; each one has its own CSV named df_<accession>.csv on Drive.
unique_accessions = ['VH134067', 'VH139380', 'VH266015', 'VH266510', 'VH269384',
                     'VH271613', 'VH302907', 'VH304954', 'VH507804', 'VH525628']



# Maps accession ID -> DataFrame read from that accession's CSV
dfs = {}

# Load one CSV per accession (keys are inserted in the order of unique_accessions)
for accession in unique_accessions:
    # Build the file stem, e.g. 'df_VH134067'
    path = '/content/drive/MyDrive/NAEP_Comp/'
    df_name = 'df_' + accession

    # Read <path>/df_<accession>.csv
    # (note: `df`, `path` and `df_name` remain defined at notebook level after the loop)
    df = pd.read_csv(path + df_name + '.csv')

    # Register the frame under its accession ID
    dfs[accession] = df

In [None]:
# Re-read the VH139380 CSV directly; this rebinds `df`, which the loader loop above left
# pointing at the last accession's frame.
df = pd.read_csv('/content/drive/MyDrive/NAEP_Comp/df_VH139380.csv')

In [None]:
# Drop rows whose 'parsed_xml_v1' value is missing.
# Note: this overwrites `df`, so the unfiltered frame is only recoverable by re-running the read cell.
df = df.dropna(subset=['parsed_xml_v1'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 80/20 split on 'assigned_score' (features: 'predict_from_onestepall'), then the
# 20% hold-out is split in half (again stratified) into test and validation sets.
# random_state=42 is fixed on both calls so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['predict_from_onestepall'], df['assigned_score'], test_size=0.2, stratify=df['assigned_score'], random_state=42)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, stratify = y_test, random_state=42)
# Show the three partition shapes as the cell output.
X_test.shape, X_train.shape, X_valid.shape


In [None]:
# Report the number of rows in each partition.
print('Our training data has   ', len(X_train.index), ' rows.')
print('Our validation data has ', len(X_valid.index), ' rows.')
print('Our test data has       ', len(X_test.index), ' rows.')

In [None]:
# Shift every label down by one.
# NOTE(review): presumably this converts 1-based scores to 0-based class ids — confirm.
# WARNING: the cell is not idempotent; each re-run subtracts 1 again.
y_train -= 1
y_test -= 1
y_valid -= 1

In [None]:
# Show the distinct label values present in the training split after the shift above.
np.unique(y_train
          )

In [None]:
import numpy as np
import pandas as pd
import transformers
import torch
import csv

from datasets import Dataset,load_dataset, load_from_disk, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, AdamW
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import DataLoader

In [None]:
from sklearn.metrics import cohen_kappa_score
import os, sys, itertools, re

In [None]:
# Hugging Face checkpoint name read (as the global MODEL) by train_model when it loads the
# ELECTRA model and tokenizer. Change this value to fine-tune a different checkpoint.
MODEL = "google/electra-base-discriminator"

In [None]:
# Rebind `df` to the VH139380 frame held in `dfs` (replaces whatever `df` pointed to before).
df = dfs['VH139380']

In [None]:
# Keep only the id, text and score columns, index by student_id, and replace missing values
# in the remaining columns with "" (note: fillna("") applies to the score column too).
df = df[["student_id", "predict_from", "score_to_predict"]].set_index("student_id").fillna("")
# 'labels' is the score shifted down by one.
df['labels'] = df['score_to_predict'] - 1
df.head()

In [None]:
from datasets import Dataset as d1

In [None]:
from transformers import (ElectraForSequenceClassification,
                          ElectraTokenizerFast, EvalPrediction, InputFeatures,
                          Trainer, TrainingArguments, glue_compute_metrics)
from torch.utils.data import Dataset

In [None]:
def compute_metrics(eval_pred):
    """Metric callback for `Trainer`: quadratic-weighted Cohen's kappa.

    Args:
        eval_pred: object exposing `.predictions` (per-class scores, arg-maxed
            along axis 1) and `.label_ids` (reference class ids).

    Returns:
        dict with the single key "cohen_kappa".
    """
    scores, reference = eval_pred.predictions, eval_pred.label_ids
    predicted = scores.argmax(axis=1)
    return {"cohen_kappa": cohen_kappa_score(reference, predicted, weights='quadratic')}

In [None]:
class TrainerDataset(Dataset):
    """Map-style dataset that tokenizes all texts once and serves `InputFeatures` items.

    Attributes:
        inputs: the raw texts, as passed in.
        targets: labels, indexed in parallel with `inputs`.
        tokenizer: the callable used to encode `inputs`.
        tokenized_inputs: result of calling `tokenizer` on `inputs`.
    """

    # Keys copied from the tokenizer output into each InputFeatures item.
    _FEATURE_KEYS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, inputs, targets, tokenizer):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer

        # Encode everything up front: inputs are truncated to at most 80 tokens,
        # with padding enabled and special tokens added.
        self.tokenized_inputs = tokenizer(
            inputs,
            padding=True,
            max_length=80,
            truncation=True,
            add_special_tokens=True,
        )

    def __len__(self):
        """Number of examples (one per input text)."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the encoded example at position `idx` together with its label."""
        encoded = {key: self.tokenized_inputs[key][idx] for key in self._FEATURE_KEYS}
        return InputFeatures(**encoded, label=self.targets[idx])

In [None]:
def train(train_dataset, eval_dataset, test_indexes, name, model):
    """Fine-tune `model` for 3 epochs, write eval predictions to Drive, and score them.

    Args:
        train_dataset: dataset handed to `Trainer` for training.
        eval_dataset: dataset used for per-epoch evaluation and for the final predictions.
        test_indexes: identifiers used as the index (and 'indexes' column) of the results CSV.
        name: file stem of the results CSV.
        model: the model to fine-tune (trained in place by `Trainer`).

    Returns:
        (kappa_score, model): quadratic-weighted Cohen's kappa on `eval_dataset`
        and the fine-tuned model.
    """
    args = TrainingArguments(
        output_dir="./models/model_electra",
        num_train_epochs=3,
        evaluation_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        save_total_limit=2,
        save_strategy='no',  # no checkpoints are written during training
        load_best_model_at_end=False,
    )

    # No metric callback here: kappa is computed once, after training.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=None,
    )
    trainer.train()

    scores, actual, _ = trainer.predict(eval_dataset)
    predicted = np.argmax(scores, axis=1)

    # Labels were trained shifted down by one; add 1 back for the report.
    report = pd.DataFrame(index=test_indexes)
    report['indexes'] = test_indexes
    report['True Labels'] = actual + 1
    report['PredictedValue'] = predicted + 1
    report.to_csv('/content/drive/MyDrive/NAEP_Comp/ElectraLarge/' + name + '.csv')

    return cohen_kappa_score(actual, predicted, weights='quadratic'), model




In [None]:
import nlpaug.augmenter.word as naw

def detect_minority_majority_classes(df, label_column):
    """Split the label values of `df[label_column]` into minority classes and the majority class.

    Returns:
        (minority_classes, majority_class): a list of every label whose count is
        strictly below the largest count, and the label returned by
        `value_counts().idxmax()`.
    """
    counts = df[label_column].value_counts()
    below_top = counts.lt(counts.max())
    minority = counts.index[below_top.to_numpy()].tolist()
    return minority, counts.idxmax()

def augment_minority_class_text(df, text_column, label_column):
    """Oversample under-represented classes with word-swap augmentation.

    Every class with fewer than 60% of the majority class's rows receives
    augmented copies of its own texts (random word swaps via nlpaug) until it
    reaches int(0.6 * majority_count) rows.

    Args:
        df: frame holding at least `text_column` and `label_column`.
        text_column: name of the column containing the text to augment.
        label_column: name of the column containing the class labels.

    Returns:
        A new DataFrame (index reset) with the original rows followed by the
        augmented rows. Augmented rows only carry `text_column` and
        `label_column`; all other columns are NaN for them.
    """
    augmented_texts = []
    aug = naw.RandomWordAug(action="swap")
    minority_classes, majority_class = detect_minority_majority_classes(df, label_column)
    print(df[label_column].value_counts())

    # The majority size does not depend on the minority class being processed,
    # so compute it (and the target size) once.
    majority_count = int((df[label_column] == majority_class).sum())
    target_count = int(0.6 * majority_count)

    for minority_class in minority_classes:
        minority_texts = df.loc[df[label_column] == minority_class, text_column]
        minority_count = len(minority_texts)

        # Already at or above 60% of the majority: leave this class alone.
        if minority_count >= 0.6 * majority_count:
            continue

        num_augmentations = target_count - minority_count

        # Cycle over the class's texts until enough augmentations were produced.
        while num_augmentations > 0:
            produced_this_pass = 0
            for text in minority_texts:
                augmented_text = aug.augment(text)
                if not augmented_text:
                    # Nothing produced for this text (e.g. empty input).
                    continue
                # augment() may return either a list of strings or a bare string;
                # indexing a bare string with [0] would keep only its first character.
                if not isinstance(augmented_text, str):
                    augmented_text = augmented_text[0]
                augmented_texts.append((augmented_text, minority_class))
                produced_this_pass += 1
                num_augmentations -= 1
                if num_augmentations == 0:
                    break

            # If a full pass yields nothing (e.g. every text is empty), another
            # pass cannot succeed either: stop instead of looping forever.
            if produced_this_pass == 0:
                print('Warning: no augmentable text for class', minority_class,
                      '- still short by', num_augmentations, 'rows.')
                break

    if not augmented_texts:
        # Nothing to add: skip concatenating an empty frame, which newer pandas
        # versions warn about because of its effect on column dtypes.
        augmented_df = df.reset_index(drop=True)
    else:
        new_rows = pd.DataFrame(augmented_texts, columns=[text_column, label_column])
        augmented_df = pd.concat([df, new_rows], ignore_index=True)

    print(augmented_df[label_column].value_counts())
    return augmented_df



In [None]:
def preprocess(text):
    """Normalize one free-text response before tokenization.

    Steps, in order: lower-case; drop URLs; decode the HTML entities &amp; / &lt; / &gt;;
    drop @mentions; drop non-ASCII characters; strip punctuation except `.`, `!`, `?`;
    collapse runs of `!`, `?`, `.` to a single character; remove apostrophes and
    parentheses; collapse whitespace.

    Args:
        text: the raw response. None and NaN are treated as an empty response;
            other non-string values are converted with str().

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values (None / float NaN) become an empty response instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()

    # Remove hyperlinks. Only the URL token itself is removed; the previous `.*`
    # pattern also deleted every word that followed the link on the same line.
    text = re.sub(r'https?://\S+', '', text)

    # HTML entities. `&amp;?` is a regular expression (the ';' is optional), so it must
    # go through re.sub — str.replace looked for the literal text '&amp;?' and never matched.
    text = re.sub(r'&amp;?', 'and', text)
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')

    # Remove @mentions
    text = re.sub(r"@\w+", '', text)

    # Remove non-ASCII characters
    text = text.encode("ascii", errors="ignore").decode()

    # Remove punctuation except . ! ?
    # The hyphen is escaped on purpose: an unescaped `,-/` is a character RANGE that
    # also contains '.', which silently stripped every period.
    text = re.sub(r'[:"#$%&*+,\-/;<=>@\\^_`{|}~]+', '', text)

    # Collapse repeated sentence punctuation ("!!!" -> "!", "..." -> ".")
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\.+', '.', text)

    # Remove apostrophes and parentheses
    text = re.sub(r"['()]", "", text)

    # Collapse all whitespace runs to single spaces and trim the ends
    return " ".join(text.split())

In [None]:
def train_model(df, name):
    """Fine-tune the ELECTRA checkpoint named by the global MODEL on one item's responses.

    Pipeline: clean the text (preprocess + contractions.fix), stratified 80/20
    train/validation split, oversample minority classes in the training part,
    train for 5 epochs, write validation predictions to Drive, save the weights.

    Args:
        df: frame with columns "student_id", "predict_from" and "score_to_predict".
        name: file stem for the results CSV and the saved weights.

    Returns:
        Quadratic-weighted Cohen's kappa on the validation split.
    """
    results_dir = '/content/drive/MyDrive/NAEP_Comp/ElectraLarge/'

    # Keep only the columns that matter (features, labels, student_id)
    df = df[["student_id", "predict_from", "score_to_predict"]].fillna("")
    # fillna("") above already removed missing values and preprocess() always returns a
    # string, so the former fillna('N/A') between these two steps could never fire.
    df['predict_from'] = df['predict_from'].apply(preprocess).apply(contractions.fix)
    df['labels'] = df['score_to_predict'] - 1

    df_train, df_valid = train_test_split(df, test_size=0.2, stratify=df['labels'], random_state=11)
    # Only the training split is augmented; validation stays untouched.
    df_balanced = augment_minority_class_text(df_train, 'predict_from', 'labels')
    dataset_train = d1.from_pandas(df_balanced, preserve_index=False)
    dataset_valid = d1.from_pandas(df_valid, preserve_index=False)
    test_indexes = dataset_valid['student_id']

    # Size the classifier head by the largest label rather than the number of distinct
    # labels: if a score level is absent from the data, nunique() is too small and the
    # largest label id falls outside the head.
    num_labels = int(df['labels'].max()) + 1
    model = ElectraForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)
    # Move to GPU only when one exists so the function also runs on CPU-only runtimes.
    if torch.cuda.is_available():
        model.cuda()
    tokenizer = ElectraTokenizerFast.from_pretrained(MODEL, do_lower_case=True)
    train_dataset = TrainerDataset(dataset_train["predict_from"],
                                   dataset_train["labels"], tokenizer)
    eval_dataset = TrainerDataset(dataset_valid["predict_from"],
                                  dataset_valid["labels"], tokenizer)

    training_args = TrainingArguments(
        output_dir="./models/model_electra",
        num_train_epochs=5,
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        save_total_limit=2,
        save_strategy='no',  # no intermediate checkpoints; final weights are saved below
        load_best_model_at_end=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics)
    trainer.train()

    pred, actual, _ = trainer.predict(eval_dataset)
    pred_labels = np.argmax(pred, axis=1)

    # Report scores on the original scale (labels were shifted down by one).
    results_df = pd.DataFrame(index=test_indexes)
    results_df['indexes'] = test_indexes
    results_df['True Labels'] = actual + 1
    results_df['PredictedValue'] = pred_labels + 1
    # Create the target folder if needed so a finished training run is not lost to a missing directory.
    os.makedirs(results_dir, exist_ok=True)
    results_df.to_csv(results_dir + name + '.csv')
    kappa_score = cohen_kappa_score(actual, pred_labels, weights='quadratic')

    # Save the weights twice: in the working directory and on Drive.
    model_save_name = name + '_b_electa.pth'
    torch.save(model.state_dict(), model_save_name)
    torch.save(model.state_dict(), '/content/drive/MyDrive/NAEP_Comp/' + model_save_name)
    return kappa_score



In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def train_electra(df, name):
    """Fine-tune google/electra-small-discriminator on one item's responses.

    Pipeline: clean the text with preprocess(), stratified 80/20 train/validation
    split, oversample minority classes in the training part, train for 10 epochs,
    write validation predictions to Drive.

    Args:
        df: frame with columns "student_id", "predict_from" and "score_to_predict".
        name: file stem for the results CSV.

    Returns:
        Quadratic-weighted Cohen's kappa on the validation split.
    """
    MODEL2 = "google/electra-small-discriminator"
    results_dir = '/content/drive/MyDrive/NAEP_Comp/ElectraLarge/ElectraLarge/'

    # Keep only the columns that matter (features, labels, student_id)
    df = df[["student_id", "predict_from", "score_to_predict"]].fillna("")
    df['predict_from'] = df['predict_from'].apply(preprocess)
    df['labels'] = df['score_to_predict'] - 1

    df_train, df_valid = train_test_split(df, test_size=0.2, stratify=df['labels'], random_state=11)
    df_balanced = augment_minority_class_text(df_train, 'predict_from', 'labels')
    # Train on the balanced frame. Previously the dataset was built from df_train, so the
    # augmentation computed on the line above was thrown away.
    dataset_train = d1.from_pandas(df_balanced, preserve_index=False)
    dataset_valid = d1.from_pandas(df_valid, preserve_index=False)
    test_indexes = dataset_valid['student_id']

    # Size the classifier head by the largest label rather than the number of distinct
    # labels: if a score level is absent from the data, nunique() is too small and the
    # largest label id falls outside the head.
    num_labels = int(df['labels'].max()) + 1
    model = ElectraForSequenceClassification.from_pretrained(MODEL2, num_labels=num_labels)
    tokenizer = ElectraTokenizerFast.from_pretrained(MODEL2, do_lower_case=True)
    train_dataset = TrainerDataset(dataset_train["predict_from"],
                                   dataset_train["labels"], tokenizer)
    eval_dataset = TrainerDataset(dataset_valid["predict_from"],
                                  dataset_valid["labels"], tokenizer)

    training_args = TrainingArguments(
        output_dir="./models/model_electra2",
        num_train_epochs=10,
        overwrite_output_dir=True,
        evaluation_strategy="no",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        save_total_limit=2,
        save_strategy='no',
        # Evaluation and checkpointing are both disabled, so there is never a "best"
        # checkpoint to reload; the flag is off to say so explicitly.
        load_best_model_at_end=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=None)
    trainer.train()

    pred, actual, _ = trainer.predict(eval_dataset)
    pred_labels = np.argmax(pred, axis=1)
    print(pred_labels)

    # Report scores on the original scale (labels were shifted down by one).
    results_df = pd.DataFrame(index=test_indexes)
    results_df['indexes'] = test_indexes
    results_df['True Labels'] = actual + 1
    results_df['PredictedValue'] = pred_labels + 1
    # Create the target folder if needed so a finished training run is not lost to a missing directory.
    os.makedirs(results_dir, exist_ok=True)
    results_df.to_csv(results_dir + name + '.csv')
    kappa_score = cohen_kappa_score(actual, pred_labels, weights='quadratic')
    return kappa_score



In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

In [None]:
# Notebook-level RoBERTa tokenizer; the tokenize() helper defined below reads this global.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

In [None]:
def tokenize(batch):
    """Encode the "predict_from" texts of `batch` with the notebook-level `tokenizer`.

    Inputs are truncated to at most 100 tokens, with padding enabled and special
    tokens added. Intended as the callback for a batched `Dataset.map`.
    """
    encode_options = dict(padding=True, truncation=True, max_length=100, add_special_tokens=True)
    return tokenizer(batch["predict_from"], **encode_options)

In [None]:
def extract_first_string(value):
    """Unwrap a list whose first element is a string; return anything else unchanged.

    Returns `value[0]` only when `value` is a non-empty list and its first element
    is a `str`. Empty lists, lists starting with a non-string, and non-list values
    are returned as they came in.
    """
    is_nonempty_list = isinstance(value, list) and len(value) > 0
    if is_nonempty_list and isinstance(value[0], str):
        return value[0]
    return value

In [None]:
def train_roberta_large(df, name):
    """Fine-tune a RoBERTa sequence classifier on one item's responses.

    Note: despite the function name, the checkpoint loaded is "roberta-base".

    Pipeline: clean the text with preprocess(), stratified 80/20 train/validation
    split, oversample minority classes in the training part, train for 10 epochs
    with per-epoch evaluation and checkpointing, write validation predictions to Drive.

    Args:
        df: frame with columns "student_id", "predict_from" and "score_to_predict".
        name: file stem for the results CSV.

    Returns:
        Quadratic-weighted Cohen's kappa on the validation split.
    """
    model_name = "roberta-base"
    results_dir = '/content/drive/MyDrive/NAEP_Comp/ElectraLarge/ElectraLarge/'

    # Keep only the columns that matter (features, labels, student_id)
    df = df[["student_id", "predict_from", "score_to_predict"]].fillna("")
    df['predict_from'] = df['predict_from'].apply(preprocess)
    df['label'] = df['score_to_predict'] - 1

    df_train, df_valid = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=11)
    df_balanced = augment_minority_class_text(df_train, 'predict_from', 'label')
    # Guard against list-valued cells coming back from the augmentation step.
    df_balanced['predict_from'] = df_balanced['predict_from'].apply(extract_first_string)

    dataset_train = d1.from_pandas(df_balanced, preserve_index=False)
    dataset_valid = d1.from_pandas(df_valid, preserve_index=False)
    test_indexes = dataset_valid['student_id']

    # Size the classifier head by the largest label rather than the number of distinct
    # labels: if a score level is absent from the data, nunique() is too small and the
    # largest label id falls outside the head.
    num_labels = int(df['label'].max()) + 1
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    tokenizer = RobertaTokenizer.from_pretrained(model_name, do_lower_case=True)

    def _tokenize(batch):
        # Uses the tokenizer created just above. Previously the tokenizer built here was
        # never used and encoding silently depended on a notebook-level global.
        return tokenizer(batch["predict_from"], padding=True, truncation=True,
                         max_length=100, add_special_tokens=True)

    # One batch per split, so padding=True pads every example of a split to a common length.
    train_dataset = dataset_train.map(_tokenize, batched=True, batch_size=len(dataset_train))
    val_dataset = dataset_valid.map(_tokenize, batched=True, batch_size=len(dataset_valid))
    train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    training_args = TrainingArguments(
        # NOTE(review): checkpoints for this RoBERTa run go to the same folder the ELECTRA
        # run uses (overwrite_output_dir=True) — confirm that sharing it is intended.
        output_dir="./models/model_electra2",
        num_train_epochs=10,
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        save_total_limit=2,
        save_strategy='epoch',
        load_best_model_at_end=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics)
    trainer.train()

    pred, actual, _ = trainer.predict(val_dataset)
    pred_labels = np.argmax(pred, axis=1)
    print(pred_labels)

    # Report scores on the original scale (labels were shifted down by one).
    results_df = pd.DataFrame(index=test_indexes)
    results_df['indexes'] = test_indexes
    results_df['True Labels'] = actual + 1
    results_df['PredictedValue'] = pred_labels + 1
    # Create the target folder if needed so a finished training run is not lost to a missing directory.
    os.makedirs(results_dir, exist_ok=True)
    results_df.to_csv(results_dir + name + '.csv')
    kappa_score = cohen_kappa_score(actual, pred_labels, weights='quadratic')
    return kappa_score

In [None]:
# Rebind `df` to the VH525628 frame from `dfs`.
# NOTE(review): the next cell passes df_1 (not df) to train_model, so this assignment looks unused — confirm.
df = dfs['VH525628']

In [None]:
# Single training run on df_1 (the VH525628 rows taken from df_main); 'test_electra' is the
# file stem train_model uses for its results CSV and saved weights.
score = train_model(df_1, 'test_electra')
print(score)

In [None]:
# Show the kappa returned by the run above.
score

In [None]:
# Train one ELECTRA model per accession and collect the validation kappa of each.
results = {}
# Iterate over (accession, frame) pairs directly. The previous enumerate(dfs) bound `df` to a
# key string first and relied on unique_accessions[i] lining up with the dict's order.
for name, df in dfs.items():
    print(name)
    score = train_electra(df, name)
    results[name] = [score]  # one-element list -> one row per column in the frame below
    print(score)

# One column per accession, a single row holding its kappa
results_df = pd.DataFrame(results)

if results:
    # Save the summary. The file-name suffix is the LAST accession processed, even though
    # the file holds the scores of all accessions (kept for compatibility with existing files).
    results_dir = '/content/drive/MyDrive/NAEP_Comp/ElectraLarge/ElectraLarge/'
    os.makedirs(results_dir, exist_ok=True)
    results_df.to_csv(results_dir + 'Results_Cohen_' + str(name) + '.csv', index=False)
else:
    # `name` is undefined when dfs is empty; report instead of raising NameError.
    print('No accessions were trained; nothing to save.')