In [None]:
!pip install transformers
!pip install rsmtool

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer, BertModel, BertTokenizer, BertConfig
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.optim as optim
from tqdm import tqdm
from rsmtool.utils.metrics import quadratic_weighted_kappa, difference_of_standardized_means, standardized_mean_difference
from rsmtool.fairness_utils import get_fairness_analyses
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, mean_absolute_error
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')

In [None]:
class CustomLoss(nn.Module):
  """Mean-squared-error criterion over all elements of the two tensors."""

  def __init__(self):
    super().__init__()

  def forward(self, predictions, targets):
    """Return the mean of the squared element-wise difference between predictions and targets."""
    residual = predictions - targets
    return residual.pow(2).mean()

In [None]:
def load_data(path, num_prompts=12):
  """Load the per-prompt CSV files ``Prompt_1.csv`` .. ``Prompt_<num_prompts>.csv``.

  Args:
    path: Prefix prepended verbatim to every file name (plain string
      concatenation, so a directory must end with its path separator).
    num_prompts: How many consecutively numbered prompt files to read,
      starting at 1. Defaults to 12.

  Returns:
    A list of DataFrames ordered by prompt number.
  """
  return [
    pd.read_csv(path + 'Prompt_' + str(number) + '.csv')
    for number in range(1, num_prompts + 1)
  ]

def split_data(data, fold):
    """Partition ``data`` into ``fold`` unshuffled cross-validation folds.

    Returns a list with one ``(train_index, test_index)`` pair per fold, in the
    order ``KFold.split`` yields them.
    """
    splitter = KFold(n_splits=fold, shuffle=False)
    return [(train_idx, test_idx) for train_idx, test_idx in splitter.split(data)]

def get_features(texts, labels, range_min=1, range_max=6, max_length=512, model_name="bert-base-uncased"):
  """Tokenize essays and min-max normalize their scores.

  Args:
    texts: Iterable of essay strings.
    labels: Scores on the original scale (array-like); they are mapped to
      ``(label - range_min) / (range_max - range_min)``.
    range_min: Lowest score of the original scale. Defaults to 1.
    range_max: Highest score of the original scale. Defaults to 6.
    max_length: Length every sequence is padded/truncated to. Defaults to 512.
    model_name: Tokenizer checkpoint passed to ``AutoTokenizer.from_pretrained``.

  Returns:
    ``(input_ids, attention_masks, labels)`` where the first two are lists
    holding one tensor per text and ``labels`` is the normalized NumPy array.
  """
  # Convert first so plain Python lists are accepted as well as arrays.
  labels = (np.asarray(labels) - range_min) / (range_max - range_min)

  tokenizer = AutoTokenizer.from_pretrained(model_name)
  input_ids = []
  attention_masks = []
  for text in texts:
    encoding = tokenizer.encode_plus(
               text,
               max_length = max_length,
               add_special_tokens = True,
               padding = 'max_length',
               return_attention_mask = True,
               truncation = True,
               return_tensors = 'pt',
               )
    # With return_tensors='pt' the encoding is expected to carry a leading
    # batch axis of size 1; drop only that axis so the sequence axis survives
    # whatever max_length is.
    input_ids.append(encoding['input_ids'].squeeze(0))
    attention_masks.append(encoding['attention_mask'].squeeze(0))
  return input_ids, attention_masks, labels

def get_data_loader(input_ids, attention_masks, labels, batch_size):
  """Build a randomly sampled DataLoader over (input_ids, attention_mask, label) triples.

  ``input_ids`` and ``attention_masks`` are lists of equally shaped tensors that
  get stacked along a new first axis; ``labels`` is a NumPy array that is
  flattened to one dimension before being turned into a tensor.
  """
  dataset = TensorDataset(
    torch.stack(input_ids),
    torch.stack(attention_masks),
    torch.tensor(labels.reshape(-1)),
  )
  return DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

def accuracy_evaluation(y_pred, y_test):
    """Score predictions against the reference labels.

    Returns a ``(qwk, mae, pearson_score)`` tuple: quadratic weighted kappa,
    mean absolute error, and the ``statistic`` field of ``pearsonr``.
    """
    return (
        quadratic_weighted_kappa(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
        pearsonr(y_test, y_pred).statistic,
    )

def fairness_evaluation(y_pred, y_test, demo_attribute):
    """Compute fairness statistics of the predictions with respect to one binary attribute.

    Args:
        y_pred: NumPy array of predicted scores.
        y_test: NumPy array of reference scores, aligned with ``y_pred``.
        demo_attribute: NumPy array of group codes aligned with the scores;
            the entries equal to 0 and equal to 1 form the two groups.

    Returns:
        ``(results, scores)``. ``results`` is element 3 of ``.values()`` of the
        second object returned by ``get_fairness_analyses`` (presumably the
        fairness-metrics table — confirm against rsmtool). ``scores`` is a
        one-row DataFrame with columns ``SMD_0``, ``SMD_1`` and ``diff_mae``
        (group-1 MAE minus group-0 MAE).
    """
    frame = pd.DataFrame({"True_Score":y_test, "Prediction_Score":y_pred, "Demo":demo_attribute})
    analyses = get_fairness_analyses(frame, group="Demo", system_score_column="Prediction_Score", human_score_column="True_Score")
    results = analyses[1].values()[3]

    # Whole-population statistics used to standardize each group's means.
    true_sd, true_mn = np.std(y_test), np.mean(y_test)
    pred_sd, pred_mn = np.std(y_pred), np.mean(y_pred)

    smd_by_group = {}
    mae_by_group = {}
    for group_value in (0, 1):
        members = np.where(demo_attribute == group_value)
        group_true = y_test[members]
        group_pred = y_pred[members]
        smd_by_group[group_value] = difference_of_standardized_means(group_true, group_pred, true_mn, pred_mn, true_sd, pred_sd)
        mae_by_group[group_value] = mean_absolute_error(group_true, group_pred)

    scores = pd.DataFrame({
        "SMD_0": [smd_by_group[0]],
        "SMD_1": [smd_by_group[1]],
        "diff_mae": [mae_by_group[1] - mae_by_group[0]],
    })
    return results, scores

def covert_label(y_pred, range_min=1, range_max=6):
  """Map normalized scores back onto the original score scale.

  Each score ``s`` becomes ``s * (range_max - range_min) + range_min``.

  Args:
    y_pred: Iterable of normalized scores.
    range_min: Lowest score of the original scale. Defaults to 1.
    range_max: Highest score of the original scale. Defaults to 6.

  Returns:
    A list with one rescaled score per input element.
  """
  span = range_max - range_min
  return [score * span + range_min for score in y_pred]

def get_data_loader_for_test(input_ids, attention_masks, batch_size):
  """Build an order-preserving DataLoader over (input_ids, attention_mask) pairs.

  Both arguments are lists of equally shaped tensors that get stacked along a
  new first axis; batches are drawn sequentially so outputs line up with the
  input order.
  """
  dataset = TensorDataset(torch.stack(input_ids), torch.stack(attention_masks))
  return DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

def preprocessing(texts):
  """Remove English stop words from each text.

  Every text is tokenized with ``word_tokenize``; tokens whose lower-cased form
  is in NLTK's English stop-word list are dropped and the remaining tokens are
  re-joined with single spaces. Returns a NumPy array of the cleaned strings.
  """
  stop_words = set(stopwords.words('english'))

  def strip_stop_words(text):
    return ' '.join(token for token in word_tokenize(text) if token.lower() not in stop_words)

  return np.array([strip_stop_words(text) for text in texts])

In [None]:
class BertThreeLayer(nn.Module):
  """BERT regressor that scores from an intermediate hidden state.

  The first-token vector of ``hidden_states[3]`` is passed through a linear
  layer and a sigmoid, so the output lies in (0, 1).
  """

  def __init__(self):
    super(BertThreeLayer, self).__init__()
    # output_hidden_states=True makes the encoder return every layer's hidden state.
    self.bert = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
    self.linear = nn.Linear(768, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_ids, attention_mask):
    """Return one sigmoid-squashed score per sequence, shaped (batch, 1)."""
    encoder_out = self.bert(input_ids, attention_mask)
    # Index 3 of the hidden-state tuple — presumably the third encoder layer,
    # with index 0 being the embedding output (confirm against the
    # transformers docs). Position 0 is the first token of each sequence.
    first_token = encoder_out.hidden_states[3][:, 0, :]
    return self.sigmoid(self.linear(first_token))

In [None]:
def train(model, optimizer, criterion, train_dataloader, dev_dataloader, epochs, gpu, checkpoint_path=''):
  """Train ``model`` and checkpoint the weights with the best dev-set QWK.

  Args:
    model: Network called as ``model(input_ids, attention_masks)``.
    optimizer: Optimizer stepping the model's parameters.
    criterion: Loss called as ``criterion(predictions, labels)``.
    train_dataloader: Yields ``(input_ids, attention_masks, labels)`` batches.
    dev_dataloader: Batches of the same form, scored by ``evaluate`` after
      every epoch.
    epochs: Number of passes over ``train_dataloader``.
    gpu: CUDA device index the batches are moved to.
    checkpoint_path: Where the best ``state_dict`` is saved. Defaults to the
      empty string this notebook used originally.
      NOTE(review): that looks like a placeholder — set a real path and load
      from the same one.

  Returns:
    None. Progress is printed and the best weights are written to
    ``checkpoint_path``.
  """
  best_qwk = float('-inf')
  for epoch in range(epochs):
    running_loss = 0.0
    num_batches = 0
    model.train()
    print("Epoch " + str(epoch+1))
    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length
    # and show a progress bar with a total.
    for input_ids, attention_masks, labels in tqdm(train_dataloader):
      model.zero_grad()
      cuda_input_ids = input_ids.cuda(gpu)
      cuda_attention_masks = attention_masks.cuda(gpu)
      cuda_labels = labels.cuda(gpu)

      outputs = model(cuda_input_ids, cuda_attention_masks)

      # Drop only the trailing feature axis so a final batch of size 1 keeps
      # its batch dimension, matching evaluate() and predict().
      loss = criterion(outputs.squeeze(-1), cuda_labels)
      loss.backward()
      optimizer.step()
      running_loss += loss.item()
      num_batches += 1

    qwk = evaluate(model, criterion, dev_dataloader, gpu)
    if best_qwk < qwk:
        best_qwk = qwk
        torch.save(model.state_dict(), checkpoint_path)
    # An empty training loader reports NaN instead of raising ZeroDivisionError.
    mean_loss = running_loss / num_batches if num_batches else float('nan')
    print("Epoch {} complete, train loss: {}, dev qwk: {}".format(epoch+1, mean_loss, qwk))

In [None]:
def evaluate(model, criterion, dev_dataloader, gpu):
  """Return the quadratic weighted kappa of ``model`` on ``dev_dataloader``.

  Args:
    model: Network called as ``model(input_ids, attention_masks)``.
    criterion: Unused; kept so existing callers keep working.
    dev_dataloader: Yields ``(input_ids, attention_masks, labels)`` batches.
    gpu: CUDA device index the inputs are moved to.

  Returns:
    ``quadratic_weighted_kappa`` between the collected labels and the model
    outputs. Both are on whatever scale the dataloader and model use; no
    rescaling happens here.
  """
  model.eval()
  y_pred = []
  y_true = []
  with torch.no_grad():
    for input_ids, attention_masks, labels in dev_dataloader:
      outputs = model(input_ids.cuda(gpu), attention_masks.cuda(gpu))
      y_pred.extend(outputs.squeeze(-1).detach().cpu().numpy())
      # Labels are only compared on the host, so they are not copied to the
      # GPU and back.
      y_true.extend(labels.detach().cpu().numpy())
  return quadratic_weighted_kappa(y_true, y_pred)

In [None]:
def predict(model, dataloader, gpu):
  """Run ``model`` over ``dataloader`` and return all outputs as one NumPy array.

  The dataloader yields ``(input_ids, attention_masks)`` batches, which are
  moved to CUDA device ``gpu``. The trailing axis of each batch output is
  squeezed away and the per-example values are concatenated in iteration order.
  """
  model.eval()
  collected = []
  with torch.no_grad():
    for input_ids, attention_masks in dataloader:
      batch_out = model(input_ids.cuda(gpu), attention_masks.cuda(gpu))
      collected.extend(batch_out.squeeze(-1).detach().cpu().numpy())
  return np.array(collected)

In [None]:
# (suffix used in the result column names, demographic column read from the prompt data)
_FAIRNESS_ATTRIBUTES = (
  ("gender", "Gender"),
  ("Economically_disadvantaged", "Economically_disadvantaged"),
  ("Disability", "Disability"),
  ("English_Language_Learner", "English_Language_Learner"),
  ("Race", "Race_Binary"),
)


def _fairness_columns(suffix):
  """Return the nine result-column names for one demographic attribute, in table order."""
  return [
    "OSA_" + suffix, "OSA_" + suffix + "_p_value",
    "OSD_" + suffix, "OSD_" + suffix + "_p_value",
    "CSD_" + suffix, "CSD_" + suffix + "_p_value",
    "SMD_1_" + suffix, "SMD_0_" + suffix, "MAED_" + suffix,
  ]


def _fairness_row(suffix, part1, part2):
  """Flatten one fairness_evaluation() result pair into a {column name: value} dict."""
  values = [
    part1['Overall score accuracy']['R2'], part1['Overall score accuracy']['sig'],
    part1['Overall score difference']['R2'], part1['Overall score difference']['sig'],
    part1['Conditional score difference']['R2'], part1['Conditional score difference']['sig'],
    part2['SMD_1'][0], part2['SMD_0'][0], part2['diff_mae'][0],
  ]
  return dict(zip(_fairness_columns(suffix), values))


def run_experiment(seed):
  """Run 5-fold cross-validated training and evaluation on every prompt.

  For each prompt and fold, the training portion is split 75/25 into train and
  validation sets. A fresh ``BertThreeLayer`` is trained for 30 epochs, the
  checkpoint with the best validation QWK is reloaded, and the held-out fold is
  scored for accuracy (QWK, MAE, Pearson) and for fairness on five demographic
  columns.

  Args:
    seed: Seeds torch's RNG and the train/validation split.

  Returns:
    A DataFrame with one row per (prompt, fold). The same table is rewritten
    to CSV after each prompt.

  NOTE(review): the dataset prefix, the checkpoint path and the results CSV
  path are all empty strings here — presumably placeholders; fill them in
  before running.
  """
  columns = ["prompt", "fold", "quadratic_weighted_kappa", "mean_absolute_error", "pearson_correlation_coefficient"]
  for suffix, _ in _FAIRNESS_ATTRIBUTES:
    columns.extend(_fairness_columns(suffix))

  # Seed torch as well, so weight initialization and batch sampling follow
  # `seed` instead of only the train/validation split.
  torch.manual_seed(seed)

  gpu = 0
  criterion = CustomLoss()
  epochs = 30
  prompts = load_data("")

  # Rows are collected in a list and the frame is built from it:
  # DataFrame.append no longer exists in pandas >= 2.0.
  records = []
  df = pd.DataFrame(columns=columns)
  # Number prompts from 1 so the label matches the Prompt_<n>.csv file it came
  # from (the counter used to start at 7, labelling the first file as prompt 8).
  for prompt_number, prompt in enumerate(prompts, start=1):
    print("Prompt"+str(prompt_number))
    for fold_number, (train_index, test_index) in enumerate(split_data(prompt, 5), start=1):
      train_rows = prompt.iloc[train_index]
      test_info = prompt.iloc[test_index]

      X_train_all = preprocessing(train_rows['Text'].to_list())
      y_train_all = train_rows['Overall'].to_numpy()

      X_test = preprocessing(test_info['Text'].to_list())
      y_test = test_info['Overall'].to_numpy()

      X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.25, random_state=seed)

      input_ids_train, attention_masks_train, labels_train = get_features(X_train, y_train)
      input_ids_val, attention_masks_val, labels_val = get_features(X_val, y_val)
      # Test labels stay on the original scale (y_test), so the normalized copy is discarded.
      input_ids_test, attention_masks_test, _ = get_features(X_test, y_test)

      train_dataloader = get_data_loader(input_ids_train, attention_masks_train, labels_train, 16)
      dev_dataloader = get_data_loader(input_ids_val, attention_masks_val, labels_val, 16)
      test_dataloader = get_data_loader_for_test(input_ids_test, attention_masks_test, 16)

      model = BertThreeLayer()
      model.cuda(gpu)
      optimizer = optim.Adam(model.parameters(), lr = 5e-6)

      train(model, optimizer, criterion, train_dataloader, dev_dataloader, epochs, gpu)

      # Reload the checkpoint train() saved for the best validation QWK.
      best_model = BertThreeLayer()
      best_model.cuda(gpu)
      best_model.load_state_dict(torch.load(''))

      # Predictions are on the normalized scale; map them back before scoring.
      y_pred = np.array(covert_label(predict(best_model, test_dataloader, gpu)))
      qwk, mae, pearson_score = accuracy_evaluation(y_pred, y_test)
      print(str(qwk), str(mae), str(pearson_score))

      new_row = {"prompt": prompt_number, "fold": fold_number, "quadratic_weighted_kappa": qwk,
                 "mean_absolute_error": mae, "pearson_correlation_coefficient": pearson_score}
      for suffix, demo_column in _FAIRNESS_ATTRIBUTES:
        part1, part2 = fairness_evaluation(y_pred, y_test, test_info[demo_column].to_numpy())
        new_row.update(_fairness_row(suffix, part1, part2))
      records.append(new_row)

    # Rewrite the results after every prompt so partial progress is kept.
    df = pd.DataFrame(records, columns=columns)
    df.to_csv('', index=False)
  return df

In [None]:
# Run the full cross-validated experiment with seed 0 and keep the returned results table.
result = run_experiment(0)

In [None]:
# Display the results DataFrame returned by run_experiment.
result