In [None]:
!pip install transformers==4.28.0

In [None]:
!pip install transformers
!pip install datasets
!pip install -U git+https://github.com/huggingface/accelerate.git

In [None]:
!pip install nlpaug

In [None]:
!pip install sacremoses

In [None]:
# Import libraries

import numpy as np
import os
import pandas as pd
import random
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import initializers
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertModel, DistilBertConfig

In [None]:
# Accession IDs of the items whose response files are loaded below.
unique_accessions = ['VH134067', 'VH139380', 'VH266015', 'VH266510', 'VH269384',
                     'VH271613', 'VH302907', 'VH304954', 'VH507804', 'VH525628']

# Directory holding one `df_<accession>.csv` file per accession.
path = '/content/drive/MyDrive/NAEP_Comp/'

# Maps accession ID -> dataframe read from that accession's CSV file.
dfs = {}
for accession in unique_accessions:
    df_name = f'df_{accession}'
    df = pd.read_csv(f'{path}{df_name}.csv')
    dfs[accession] = df

In [None]:
# Work on the responses for accession VH525628 from the loaded dictionary.
df = dfs['VH525628']

In [None]:
# Drop rows whose 'parsed_xml_v1' value is missing.
# NOTE: rebinds `df`, so the unfiltered frame is only recoverable via `dfs`.
df = df.dropna(subset=['parsed_xml_v1'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified split on the score: 80% train, then the remaining 20% is halved
# into test and validation. Both splits use random_state=42.
features = df['predict_from_onestepall']
scores = df['assigned_score']

X_train, X_holdout, y_train, y_holdout = train_test_split(
    features, scores, test_size=0.2, stratify=scores, random_state=42
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, random_state=42
)
X_test.shape, X_train.shape, X_valid.shape


In [None]:
# Report the number of rows in each split.
for label, split in (
    ('Our training data has   ', X_train),
    ('Our validation data has ', X_valid),
    ('Our test data has       ', X_test),
):
    print(label, len(split.index), ' rows.')

In [None]:
# Shift every score down by one (presumably so labels start at 0 — TODO confirm
# that the raw scores start at 1).
# NOTE: this is an in-place update, so re-running this cell subtracts 1 again.
y_train -= 1
y_test -= 1
y_valid -= 1

In [None]:
# Show the distinct label values present in the training split.
np.unique(y_train)

In [None]:
import numpy as np
import pandas as pd
import transformers
import torch
import csv

from datasets import Dataset,load_dataset, load_from_disk, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, AdamW
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

In [None]:
# Name of the pretrained checkpoint; it is passed to `from_pretrained` for both
# the model and the tokenizer in later cells. Change it here to try another model.
MODEL = "distilbert-base-uncased"

In [None]:
# Switch the working dataframe to accession VH139380 (rebinds `df`).
df = dfs['VH139380']

In [None]:
# Keep the id, text and score columns, index by student, blank out missing
# values, and derive `labels` as the score minus one.
df = (
    df[["student_id", "predict_from", "score_to_predict"]]
    .set_index("student_id")
    .fillna("")
    .assign(labels=lambda frame: frame['score_to_predict'] - 1)
)
df.head()

In [None]:
# Convert to dataset format.
# preserve_index=False: the dataframe index (student_id, set in the previous
# cell) is presumably not carried into the dataset — confirm against the
# `datasets` documentation.
dataset = Dataset.from_pandas(df, preserve_index=False)
# Hold out 10% of the rows as the "test" split; seed=11 keeps the split repeatable.
dataset = dataset.train_test_split(test_size=0.1, seed=11)
dataset

In [None]:
# Create model and tokenizer
# Make sure the num_labels argument matches the question (it will usually be 2, for correct/incorrect)
# Some questions may require more than one model (for more than one written section)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=3)

# Fine-tuned weights for this accession. A file with this name is written by the
# "Save model" cell at the end of this notebook, so on a fresh runtime that cell
# (or an upload of the file) must happen before this one can run.
Path = '/content/VH139380_b_distilbert.pth'

# map_location="cpu" lets a checkpoint that was saved from a GPU runtime be
# loaded on a CPU-only runtime as well; without it torch tries to restore the
# tensors onto the device they were saved from.
model.load_state_dict(torch.load(Path, map_location="cpu"))
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# Tokenize the data
# Make the model's token-embedding size match the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

def tokenize_function(examples):
    """Tokenize the "predict_from" field of a batch of examples.

    Every sequence is padded to the maximum length and truncated if longer.
    NOTE: a later cell redefines `tokenize_function` to read "text_blob"
    instead; whichever definition ran last is the one in effect.
    """
    return tokenizer(examples["predict_from"], padding="max_length", truncation=True)

# Apply the tokenizer to the train and test splits, batch by batch.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Put the model into evaluation mode before running predictions.
model.eval()

In [None]:
# Trainer used here only to run predictions with the loaded checkpoint;
# `trainer.train()` is not called in this cell.
training_args = TrainingArguments(
    output_dir="test_trainer",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    save_total_limit=2,
    save_strategy='no',
    load_best_model_at_end=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=None,
)

# Unpack the prediction result: model outputs, true labels, and a third value
# that is not used afterwards.
prediction_output = trainer.predict(tokenized_datasets['test'])
pred, actual, _ = prediction_output

In [None]:
# Predicted class per example = index of the largest value along axis 1.
pred_labels = np.argmax(pred, axis=1)


In [None]:
# Display the predicted class indices.
pred_labels

In [None]:
# Quadratic-weighted Cohen's kappa between the true labels and the predictions.
cohen_kappa_score(actual, pred_labels, weights='quadratic')

In [None]:
import nlpaug.augmenter.word as naw

def detect_minority_majority_classes(df, label_column):
    """Split the labels of `df[label_column]` into minority classes and the majority class.

    Args:
        df: DataFrame containing the label column.
        label_column: Name of the column holding the class labels.

    Returns:
        A tuple `(minority_classes, majority_class)` where `minority_classes`
        is the list of labels whose frequency is strictly below the highest
        frequency, and `majority_class` is the label with the highest frequency.
    """
    counts = df[label_column].value_counts()
    below_top = counts < counts.max()
    minority_classes = counts.index[below_top.to_numpy()].tolist()
    return minority_classes, counts.idxmax()

def augment_minority_class_text(df, text_column, label_column, target_ratio=0.6):
    """Oversample under-represented classes using WordNet synonym augmentation.

    For every minority class whose row count is below `target_ratio` times the
    majority-class count, synthetic rows are generated from that class's texts
    until the class reaches `int(target_ratio * majority_count)` rows.

    Args:
        df: DataFrame holding at least `text_column` and `label_column`.
        text_column: Name of the column with the text to augment.
        label_column: Name of the column with the class labels.
        target_ratio: Minimum minority/majority size ratio to reach
            (defaults to 0.6, the value previously hard-coded).

    Returns:
        A new DataFrame (fresh 0..n-1 index) with the original rows followed by
        the augmented rows. Augmented rows only carry `text_column` and
        `label_column`; any other column of `df` is missing for them.
    """
    aug = naw.SynonymAug(aug_src='wordnet', aug_max=2)
    minority_classes, majority_class = detect_minority_majority_classes(df, label_column)
    print(df[label_column].value_counts())

    # The majority size is the same for every minority class: compute it once.
    majority_count = int((df[label_column] == majority_class).sum())
    target_count = int(target_ratio * majority_count)

    augmented_texts = []
    for minority_class in minority_classes:
        minority_texts = df.loc[df[label_column] == minority_class, text_column]
        minority_count = len(minority_texts)

        # Class is already large enough relative to the majority class.
        if minority_count >= target_ratio * majority_count:
            continue

        num_augmentations = target_count - minority_count

        # Cycle over the class's texts until enough rows have been produced.
        while num_augmentations > 0:
            produced_this_pass = 0
            for text in minority_texts:
                augmented = aug.augment(text)
                # Accept either a plain string or a list of strings as the
                # augmenter's result; indexing a string with [0] would keep
                # only its first character.
                if isinstance(augmented, str):
                    candidate = augmented
                else:
                    candidate = augmented[0] if augmented else None
                if not candidate:
                    continue

                augmented_texts.append((candidate, minority_class))
                produced_this_pass += 1
                num_augmentations -= 1
                if num_augmentations == 0:
                    break

            # If a whole pass produced nothing (e.g. every text is empty) the
            # counter can never reach zero; stop instead of looping forever.
            if produced_this_pass == 0:
                print('No augmentable text for class', minority_class,
                      '- stopping augmentation for this class.')
                break

    # Create a new dataframe with augmented texts
    augmented_df = pd.DataFrame(augmented_texts, columns=[text_column, label_column])

    # Concatenate the augmented dataframe with the original dataframe
    augmented_df = pd.concat([df, augmented_df], ignore_index=True)
    print(augmented_df[label_column].value_counts())
    return augmented_df



In [None]:
!pip install contractions

In [None]:
import contractions

In [None]:
def compute_metrics(eval_pred):
    """Compute quadratic-weighted Cohen's kappa for a batch of predictions.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, argmax is
            taken over axis 1) and `label_ids` (true labels).

    Returns:
        Dict with a single key, "cohen_kappa".
    """
    predicted = eval_pred.predictions.argmax(axis=1)
    return {
        "cohen_kappa": cohen_kappa_score(eval_pred.label_ids, predicted, weights='quadratic')
    }

In [None]:
from sklearn.metrics import cohen_kappa_score
import os, sys, itertools, re

In [None]:
# Reload the tokenizer for MODEL and redefine the tokenization helper.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
def tokenize_function(examples):
    """Tokenize the "text_blob" field of a batch of examples.

    Every sequence is padded to the maximum length and truncated if longer.
    NOTE: this replaces the earlier `tokenize_function`, which read the
    "predict_from" field instead.
    """
    return tokenizer(examples["text_blob"], padding="max_length", truncation=True)

In [None]:
def preprocess(text):
    """Normalize a free-text response before tokenization.

    Steps, in order: lowercase; drop URLs; decode the HTML entities `&amp;`
    (to "and"), `&lt;`, `&gt;`; drop `#` signs and `@mentions`; drop non-ASCII
    characters; drop punctuation other than `.`, `!`, `?`; collapse repeated
    `.`, `!`, `?`; drop apostrophes and parentheses; collapse whitespace.

    Args:
        text: The raw text. `None` and NaN are treated as empty text; any other
            non-string value is converted with `str()`.

    Returns:
        The cleaned, single-spaced, lowercase string.
    """
    # Missing values become empty text instead of raising AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()

    # Remove hyperlinks. Only the URL token itself is removed; the previous
    # pattern (`.*`) also deleted everything after the URL on the same line.
    text = re.sub(r'https?://\S+', '', text)

    # Decode HTML entities. `&amp;?` is a regular expression (optional `;`),
    # so it has to go through re.sub; str.replace would look for it literally.
    text = re.sub(r'&amp;?', 'and', text)
    text = text.replace('&lt;', '<').replace('&gt;', '>')

    # Remove hashtag signs and @mentions.
    text = text.replace('#', '')
    text = re.sub(r'@\w+', '', text)

    # Remove non-ASCII characters.
    text = text.encode("ascii", errors="ignore").decode()

    # Remove punctuation except . ! ?  The hyphen is escaped so that it is a
    # literal: unescaped, `,-/` is a range that also contains `.`.
    text = re.sub(r'[:"#$%&*+,\-/;<=>@\\^_`{|}~]+', '', text)

    # Collapse runs of sentence punctuation into a single mark.
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\.+', '.', text)

    # Remove apostrophes and parentheses.
    text = re.sub(r"['()]", '', text)

    # Collapse all whitespace runs into single spaces and trim the ends.
    return " ".join(text.split())

In [None]:
def train(train_dataset,eval_dataset, test_indexes, name, model,
          results_dir='/content/drive/MyDrive/NAEP_Comp/DistillBert1/') :
  """Fine-tune `model`, write per-example predictions to CSV and return QWK.

  Args:
    train_dataset: Tokenized dataset used for training.
    eval_dataset: Tokenized dataset used for per-epoch evaluation and for the
      final predictions that are written out and scored.
    test_indexes: Identifiers of the rows of `eval_dataset`, in order; used as
      the index (and the 'indexes' column) of the results file.
    name: Base name (without extension) of the results CSV.
    model: Sequence-classification model to fine-tune.
    results_dir: Existing directory the results CSV is written to.

  Returns:
    Quadratic-weighted Cohen's kappa between true and predicted labels on
    `eval_dataset`.

  Raises:
    FileNotFoundError: If `results_dir` does not exist.
  """
  # Fail before training rather than after it: writing the CSV into a missing
  # directory would otherwise only be discovered once all epochs have run.
  if not os.path.isdir(results_dir):
    raise FileNotFoundError('Results directory does not exist: ' + str(results_dir))

  # AdamW Training
  # NOTE(review): no metric_for_best_model is given, so the "best" checkpoint
  # is picked by the Trainer's default criterion, not by the kappa metric
  # computed in compute_metrics — confirm this is intended.
  training_args = TrainingArguments(output_dir="test_trainer",
                                    logging_strategy="epoch",
                                    evaluation_strategy="epoch",
                                    per_device_train_batch_size=32,
                                    per_device_eval_batch_size=32,
                                    num_train_epochs=5,
                                    save_total_limit = 2,
                                    save_strategy = 'epoch',
                                    load_best_model_at_end=True
                                    )
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=eval_dataset,
      compute_metrics=compute_metrics,
  )
  trainer.train()

  # Predict on the evaluation set; the predicted class is the argmax over axis 1.
  pred, actual, _ = trainer.predict(eval_dataset)
  pred_labels = np.argmax(pred, axis=1)

  # Labels are shifted back up by one when written out (the caller stores them
  # as score - 1).
  results_df = pd.DataFrame(index=test_indexes)
  results_df['indexes'] = test_indexes
  results_df['True Labels'] = actual + 1
  results_df['PredictedValue'] = pred_labels + 1
  results_df.to_csv(os.path.join(results_dir, name + '.csv'))

  kappa_score = cohen_kappa_score(actual, pred_labels, weights='quadratic')
  return kappa_score




In [None]:
# Load the combined responses file (presumably the cleaned data for all
# accessions, judging by the file name and the 'accession' filter used next).
df_main = pd.read_csv('/content/drive/MyDrive/NAEP_Comp/df_cleaned.csv')

In [None]:
# Keep only the rows belonging to accession VH525628.
df_1 = df_main[df_main['accession'] == 'VH525628']

In [None]:
def train_model(df, name) :
  """Run the full pipeline for one accession and return its QWK score.

  Cleans the text, builds 0-based labels, makes a stratified 80/20
  train/validation split, augments minority classes of the training part,
  tokenizes both parts and fine-tunes a fresh MODEL checkpoint via `train`.

  Args:
    df: DataFrame with the columns "student_id", "text_blob" and
      "score_to_predict".
    name: Base name of the results file written by `train`.

  Returns:
    The quadratic-weighted Cohen's kappa returned by `train`.
  """
  # Include columns that are important (features, labels, student_id)
  df = df[["student_id", "text_blob", "score_to_predict"]].copy()

  # A row without a score cannot be used as a labelled example. Filling the
  # score with "" (as a frame-wide fillna does) makes `score - 1` raise.
  df = df.dropna(subset=["score_to_predict"])
  df['student_id'] = df['student_id'].fillna("")

  # Missing text becomes empty text, then gets the same cleaning as the rest.
  df['text_blob'] = df['text_blob'].fillna("").astype(str)
  df['text_blob'] = df['text_blob'].apply(preprocess)
  df['text_blob'] = df['text_blob'].apply(contractions.fix)

  # Integer, 0-based class ids. Assumes scores are whole numbers starting at 1
  # — TODO confirm; fractional scores would be truncated by the cast.
  df['labels'] = (df['score_to_predict'] - 1).astype(int)

  df_train, df_valid = train_test_split(df, test_size = 0.2, stratify = df['labels'], random_state=11 )
  df_balanced = augment_minority_class_text(df_train, 'text_blob','labels')
  # Keep the label column integer after the concatenation done during augmentation.
  df_balanced['labels'] = df_balanced['labels'].astype(int)

  # Convert to dataset format
  dataset_train = Dataset.from_pandas(df_balanced, preserve_index=False)
  dataset_valid = Dataset.from_pandas(df_valid, preserve_index=False)
  test_indexes = dataset_valid['student_id']

  # One output per possible class id (0..max). Counting distinct labels
  # instead would give too few outputs whenever a score level is absent.
  num_labels = int(df['labels'].max()) + 1
  model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
  model.resize_token_embeddings(len(tokenizer))

  # Tokenize with the tokenizer created above rather than with the notebook's
  # global `tokenize_function`, which is defined twice (for different columns)
  # and therefore depends on which cell ran last.
  def tokenize_batch(examples):
    return tokenizer(examples["text_blob"], padding="max_length", truncation=True)

  train_tokenized = dataset_train.map(tokenize_batch, batched=True)
  valid_tokenized = dataset_valid.map(tokenize_batch, batched=True)

  score = train(train_tokenized,valid_tokenized, test_indexes, name, model=model)

  return score



In [None]:
# Trial run of the pipeline on the single-accession frame; the results file is
# named 'test_distill1' and the returned score is displayed.
train_model(df_1, 'test_distill1')

In [None]:
# Train one model per accession and collect the QWK scores.
results = {}
# Iterate the dictionary directly: each key is the accession name that belongs
# to the dataframe, so no positional lookup in a second list is needed.
for name, df in dfs.items():
    print(name)
    score = train_model(df, name)
    results[name] = [score]  # Store score as a list
    print(score)

# Create a DataFrame from the results (one column per accession, one row)
results_df = pd.DataFrame(results)

# Save the DataFrame to a CSV file
# NOTE(review): the file name ends with the last accession processed although
# the file holds the scores of all accessions — kept as is; confirm intent.
results_df.to_csv('/content/drive/MyDrive/NAEP_Comp/DistillBert/Results_Cohen_' + str(name) + '.csv', index=False)

In [None]:
# Fine-tune the global `model` on the train split for two epochs, evaluating
# on the test split each epoch. No checkpoints are saved.
# (Original note: AdamW training.)
trainer_settings = dict(
    output_dir="test_trainer",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    save_total_limit=2,
    save_strategy='no',
    load_best_model_at_end=False,
)
training_args = TrainingArguments(**trainer_settings)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=None,
)
trainer.train()

In [None]:
# Get predictions for parts graded via model
pred, actual, _ = trainer.predict(tokenized_datasets['test'])

In [None]:
# Predicted class per example = index of the largest value along axis 1.
pred_labels = np.argmax(pred, axis=1)

In [None]:
# Display the predicted class indices.
pred_labels

In [None]:
# Compute the QWK using predictions on test data
cohen_kappa_score(actual, pred_labels, weights='quadratic')

In [None]:
# Save model
model_save_name = 'VH139380_b_distilbert.pth'
path = model_save_name
torch.save(model.state_dict(), path)