In [None]:
!pip install rsmtool

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.utils.data as Data
from sklearn.model_selection import train_test_split, KFold
from tqdm import tqdm
from rsmtool.utils.metrics import quadratic_weighted_kappa, difference_of_standardized_means, standardized_mean_difference
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr
from rsmtool.fairness_utils import get_fairness_analyses
from nltk.data import load
from nltk.tag import pos_tag
from nltk.data import load
from sklearn.preprocessing import MinMaxScaler

# Fetch the NLTK resources this notebook relies on. The tagset help files must be
# present before the tag dictionary below can be loaded; the other two resources
# are presumably the models behind sent_tokenize/word_tokenize and pos_tag.
nltk.download('tagsets')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
tagdict = load('help/tagsets/upenn_tagset.pickle')

# Map every tag name in the dictionary to a 1-based integer id, in the dictionary's
# own key order. Id 0 is never assigned here; 0 is the value the padding code in
# this notebook fills with.
pos_ids = dict(zip(tagdict.keys(), range(1, len(tagdict) + 1)))
# '#' is pinned to id 46 explicitly.
# NOTE(review): presumably '#' is a tag the tagger can emit that is missing from the
# tag dictionary, and 46 is the first unused id — confirm against the loaded tagset.
pos_ids["#"] = 46

In [None]:
class CustomLoss(nn.Module):
  """Mean-squared-error criterion: the mean of the squared element-wise differences."""

  def __init__(self):
    super().__init__()

  def forward(self, predictions, targets):
    """Return the scalar mean of ``(predictions - targets) ** 2`` over all elements.

    The two tensors are subtracted element-wise (with the usual broadcasting), so
    callers are responsible for passing matching shapes.
    """
    squared_error = (predictions - targets).pow(2)
    return squared_error.mean()

In [None]:
class WordAttNet(nn.Module):
  """Encode one sentence position of a batch (a sequence of POS-tag ids) into a single vector.

  Pipeline: embedding (50-d) -> dropout -> 1-D convolution (kernel 5) -> additive
  attention pooling over the convolved positions.

  Args:
    pos_size: number of rows in the embedding table (largest tag id + 1).
    hidden_size: number of convolution channels and width of the attention layers,
      i.e. the size of the produced sentence vector. Defaults to 100.
  """

  def __init__(self, pos_size, hidden_size=100):
    super(WordAttNet, self).__init__()
    self.lookup = nn.Embedding(num_embeddings=pos_size, embedding_dim=50)
    # The layer widths follow `hidden_size`; they used to be hard-coded to 100, which
    # silently ignored the argument and broke any caller asking for another width.
    self.conv1 = nn.Conv1d(in_channels=50, out_channels=hidden_size, kernel_size=5)
    self.dropout = nn.Dropout(p=0.5)
    self.fc1 = nn.Linear(hidden_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, 1, bias=False)

  def forward(self, input):
    """Pool a batch of tag-id sequences into one vector each.

    Args:
      input: 2-D integer tensor laid out as (sequence_length, batch). The sequence
        length must be at least 5 because the convolution uses kernel size 5
        without padding.

    Returns:
      Tensor of shape (1, batch, hidden_size).
    """
    output = self.lookup(input)            # (length, batch, 50)
    output = self.dropout(output)
    output = output.permute(1, 2, 0)       # (batch, 50, length): channels-first for Conv1d
    f_output = self.conv1(output.float())  # (batch, hidden, length - 4)
    f_output = f_output.permute(2, 0, 1)   # (length - 4, batch, hidden)

    # Additive attention: score every position, normalise over positions (dim 0),
    # then take the attention-weighted sum of the convolved features.
    weight = torch.tanh(self.fc1(f_output))
    weight = self.fc2(weight)
    weight = F.softmax(weight, 0)
    weight = weight * f_output
    output = weight.sum(0).unsqueeze(0)
    return output

In [None]:
class SentAttNet(nn.Module):
  """Document-level scorer: LSTM over sentence vectors, attention pooling, then a
  sigmoid-activated linear layer over the pooled vector concatenated with extra features.

  Args:
    feature_size: number of extra per-document features appended before the final layer.
    sent_hidden_size: hidden size of the LSTM and width of the attention layers.
    word_hidden_size: size of each incoming sentence vector (the LSTM input size).
  """

  def __init__(self, feature_size, sent_hidden_size=100, word_hidden_size=100):
    super().__init__()
    self.LSTM = nn.LSTM(input_size=word_hidden_size, hidden_size=sent_hidden_size)
    self.fc = nn.Linear(sent_hidden_size + feature_size, 1)
    self.fc1 = nn.Linear(sent_hidden_size, sent_hidden_size)
    self.fc2 = nn.Linear(sent_hidden_size, 1, bias=False)

  def forward(self, input, feature):
    """Score a batch of documents.

    Args:
      input: tensor of shape (num_sentences, batch, word_hidden_size).
      feature: tensor of shape (batch, feature_size); it is cast to the dtype of the
        final layer's weights before concatenation.

    Returns:
      Tensor of shape (batch, 1) with values produced by a sigmoid.
    """
    hidden_states, _ = self.LSTM(input)
    # Attention scores per sentence, normalised across the sentence axis (dim 0).
    scores = self.fc2(torch.tanh(self.fc1(hidden_states)))
    attention = F.softmax(scores, 0)
    pooled = (attention * hidden_states).sum(0)
    extra = feature.to(self.fc.weight.dtype)
    combined = torch.cat((pooled, extra), dim=1)
    return torch.sigmoid(self.fc(combined))

In [None]:
class PAES(nn.Module):
  """Hierarchical essay scorer: a sentence encoder applied per sentence position,
  followed by a document-level scorer that also consumes extra per-essay features.

  Args:
    word_hidden_size: size of each sentence vector produced by the sentence encoder.
    sent_hidden_size: hidden size of the document-level network.
    batch_size: batch size used to size the stored hidden-state tensors.
    max_sent_length: stored on the instance; not used by ``forward``.
    max_word_length: stored on the instance; not used by ``forward``.
    feature_size: number of extra per-essay features.
    pos_size: number of rows in the tag embedding table.
  """

  def __init__(self, word_hidden_size, sent_hidden_size, batch_size, max_sent_length, max_word_length, feature_size, pos_size):
    super(PAES, self).__init__()
    self.batch_size = batch_size
    self.word_hidden_size = word_hidden_size
    self.sent_hidden_size = sent_hidden_size
    self.max_sent_length = max_sent_length
    self.max_word_length = max_word_length
    self.word_att_net = WordAttNet(pos_size, word_hidden_size)
    self.sent_att_net = SentAttNet(feature_size, sent_hidden_size, word_hidden_size)
    self._init_hidden_state()

  def _init_hidden_state(self, last_batch_size=None):
    """(Re)create zero-filled hidden-state tensors.

    Args:
      last_batch_size: batch size to use instead of ``self.batch_size`` (e.g. for a
        smaller final batch). Previously the tensors were only rebuilt when this
        argument was omitted, so passing it had no effect.

    Note: ``forward`` does not read these tensors.
    """
    batch_size = last_batch_size if last_batch_size else self.batch_size
    self.word_hidden_state = torch.zeros(2, batch_size, self.word_hidden_size)
    self.sent_hidden_state = torch.zeros(2, batch_size, self.sent_hidden_size)
    if torch.cuda.is_available():
      self.word_hidden_state = self.word_hidden_state.cuda()
      self.sent_hidden_state = self.sent_hidden_state.cuda()

  def forward(self, input, feature):
    """Score a batch of essays.

    Args:
      input: integer tensor of shape (batch, num_sentences, num_words) holding tag ids.
      feature: tensor of shape (batch, feature_size) with the extra per-essay features.

    Returns:
      Whatever the document-level network returns for the stacked sentence vectors
      (shape (batch, 1) with the networks defined in this notebook).

    Raises:
      RuntimeError: if ``input`` contains zero sentences (nothing to concatenate).
    """
    # Iterate over sentence positions: (batch, sent, word) -> (sent, batch, word).
    sentences = input.permute(1, 0, 2)
    # Each encoder call returns (1, batch, hidden). Collecting them in a list and
    # concatenating once keeps the result on whatever device the encoder runs on
    # (no hard-coded CUDA tensor) and avoids re-copying the buffer at every step.
    sentence_vectors = [self.word_att_net(sentence.permute(1, 0)) for sentence in sentences]
    output_list = torch.cat(sentence_vectors, dim=0)
    final_output = self.sent_att_net(output_list, feature)
    return final_output

In [None]:
def load_data(path):
  """Read the twelve prompt CSVs and their matching feature CSVs.

  Args:
    path: string prefix that is concatenated directly in front of each file name
      (so a directory must end with its path separator).

  Returns:
    A list of twelve ``(essays, features)`` DataFrame tuples, ordered by prompt
    number 1..12. ``essays`` comes from ``Prompt_<n>.csv`` and ``features`` from
    ``prompt_<n>_features_independent.csv``.
  """
  prompt_numbers = range(1, 13)
  # All essay files are read first, then all feature files.
  essays = [pd.read_csv(path + 'Prompt_' + str(number) + '.csv') for number in prompt_numbers]
  features = [pd.read_csv(path + 'prompt_' + str(number) + '_features_independent.csv') for number in prompt_numbers]
  return list(zip(essays, features))

def split_data(data, fold):
    """Split ``data`` into ``fold`` consecutive (unshuffled) folds.

    Returns:
        A list with one ``(train_index, test_index)`` tuple of positional index
        arrays per fold, in the order the splitter yields them.
    """
    splitter = KFold(n_splits=fold, shuffle=False)
    return [(train_index, test_index) for train_index, test_index in splitter.split(data)]

def accuracy_evaluation(y_pred, y_test):
    """Compute agreement metrics between predicted and reference scores.

    Args:
        y_pred: predicted scores.
        y_test: reference scores.

    Returns:
        Tuple ``(qwk, mae, pearson)``: quadratic weighted kappa, mean absolute
        error, and the Pearson correlation statistic.
    """
    kappa = quadratic_weighted_kappa(y_test, y_pred)
    abs_error = mean_absolute_error(y_test, y_pred)
    correlation = pearsonr(y_test, y_pred)
    return kappa, abs_error, correlation.statistic

def fairness_evaluation(y_pred, y_test, demo_attribute):
    """Compute fairness statistics of the predictions with respect to a binary attribute.

    Args:
        y_pred: array of predicted scores.
        y_test: array of reference scores, aligned with ``y_pred``.
        demo_attribute: array of group codes aligned with the scores; the values 0
            and 1 are the two groups compared here.

    Returns:
        Tuple ``(results, scores)``:
        * ``results`` is the fourth entry of the values of the second object returned
          by ``get_fairness_analyses``.
          NOTE(review): presumably the fairness-metrics table — confirm against rsmtool.
        * ``scores`` is a one-row DataFrame with ``SMD_0`` / ``SMD_1`` (difference of
          standardized means per group, standardized with the whole-sample mean and
          standard deviation) and ``diff_mae`` (MAE of group 1 minus MAE of group 0).
    """
    score_frame = pd.DataFrame({"True_Score":y_test, "Prediction_Score":y_pred, "Demo":demo_attribute})
    analyses = get_fairness_analyses(score_frame, group="Demo", system_score_column="Prediction_Score", human_score_column="True_Score")
    results = analyses[1].values()[3]

    # Whole-sample moments used to standardize both groups.
    population_true_sd = np.std(y_test)
    population_true_mn = np.mean(y_test)
    population_pred_sd = np.std(y_pred)
    population_pred_mn = np.mean(y_pred)

    # (reference, prediction) pairs restricted to each group.
    by_group = {}
    for group in (0, 1):
        members = np.where(demo_attribute==group)
        by_group[group] = (y_test[members], y_pred[members])

    smd = {}
    for group, (truth, predicted) in by_group.items():
        smd[group] = difference_of_standardized_means(truth, predicted, population_true_mn, population_pred_mn, population_true_sd, population_pred_sd)

    diff_mae = mean_absolute_error(*by_group[1]) - mean_absolute_error(*by_group[0])
    scores = pd.DataFrame({"SMD_0":[smd[0]], "SMD_1":[smd[1]], "diff_mae":[diff_mae]})
    return results, scores

def covert_label(y_pred):
  """Map scores from the unit interval back onto the 1..6 score scale.

  Each value ``s`` becomes ``s * 5 + 1``; values outside [0, 1] are not clipped.

  Returns:
    A new list with one rescaled value per input element.
  """
  low, high = 1, 6
  span = high - low
  rescaled = []
  for score in y_pred:
    rescaled.append(score * span + low)
  return rescaled

In [None]:
def get_features(texts, labels):
  """Turn raw essays into a padded 3-D array of POS-tag ids and scale the labels.

  Args:
    texts: iterable of essay strings.
    labels: numeric array of scores on the 1..6 scale.

  Returns:
    Tuple ``(padded_X, y, maxSenNum, maxSenLen)`` where ``padded_X`` has one
    (sentences x words) matrix of tag ids per essay, all padded with 0 to a common
    shape, ``y`` is ``labels`` rescaled to [0, 1], ``maxSenNum`` is the largest
    sentence count of any essay and ``maxSenLen`` the largest token count of any sentence.

  Raises:
    ValueError: if an essay yields no sentences (``max`` of an empty sequence).
    KeyError: if the tagger emits a tag that is missing from ``pos_ids``.
  """
  low, high = 1, 6
  y = (labels - low) / (high - low)

  documents = []
  maxSenNum = 0
  maxSenLen = 0
  for text in texts:
    sentences = sent_tokenize(text)
    word_tokens = []
    for sentence in sentences:
      tagged = pos_tag(word_tokenize(sentence), lang='eng')
      word_tokens.append([pos_ids[tag] for _, tag in tagged])
    documents.append(word_tokens)
    # Track the corpus-wide maxima that define the padded shape.
    maxSenNum = max(maxSenNum, len(sentences))
    maxSenLen = max(maxSenLen, max(map(len, word_tokens)))

  # Pad or truncate every sentence to maxSenLen tokens.
  X = [pad_sequences(document, maxlen=maxSenLen) for document in documents]
  max_shape = max(X, key=lambda matrix: matrix.shape).shape

  # Append all-zero rows (and columns, if any are missing) so every essay shares max_shape.
  padded = []
  for matrix in X:
    missing_rows = max_shape[0] - matrix.shape[0]
    missing_cols = max_shape[1] - matrix.shape[1]
    padded.append(np.pad(matrix, ((0, missing_rows), (0, missing_cols)), mode='constant', constant_values=0))
  padded_X = np.array(padded)
  return padded_X, y, maxSenNum, maxSenLen

In [None]:
def get_features_for_test(texts, labels, maxSenNum, maxSenLen):
  """Encode held-out essays with the padded shape fixed at training time.

  Args:
    texts: iterable of essay strings.
    labels: numeric array of scores on the 1..6 scale.
    maxSenNum: number of sentence rows every essay matrix must have.
    maxSenLen: number of token columns every essay matrix must have.

  Returns:
    Tuple ``(padded_X, y)``: an array with one (maxSenNum x maxSenLen) matrix of
    POS-tag ids per essay and ``labels`` rescaled to [0, 1].

  Essays with more than ``maxSenNum`` sentences keep only their first ``maxSenNum``
  sentences. Without this, the row padding width became negative and ``np.pad``
  raised, which is reachable because held-out essays are not part of the data the
  maxima were computed on.

  Raises:
    KeyError: if the tagger emits a tag that is missing from ``pos_ids``.
  """
  range_min = 1
  range_max = 6
  y = (labels-range_min)/(range_max-range_min)

  padded_documents = []
  for text in texts:
    sentences = sent_tokenize(text)
    word_tokens = [[pos_ids[tag] for _, tag in pos_tag(word_tokenize(sentence), lang='eng')] for sentence in sentences]
    # Pad or truncate every sentence to maxSenLen tokens.
    matrix = pad_sequences(word_tokens, maxlen=maxSenLen)
    # Cap the sentence count so the row padding below can never be negative.
    matrix = matrix[:maxSenNum]
    missing_rows = maxSenNum - matrix.shape[0]
    missing_cols = maxSenLen - matrix.shape[1]
    padded_documents.append(np.pad(matrix, ((0, missing_rows), (0, missing_cols)), mode='constant', constant_values=0))
  padded_X = np.array(padded_documents)
  return padded_X, y

In [None]:
def train(model, optimizer, criterion, train_dataloader, dev_dataloader, epochs, checkpoint_path=''):
  """Train ``model`` and checkpoint the weights with the best dev-set QWK.

  Args:
    model: network called as ``model(feature_1, feature_2)``.
    optimizer: optimizer already bound to the model's parameters.
    criterion: loss called as ``criterion(outputs, labels)``.
    train_dataloader: yields ``(feature_1, feature_2, label)`` batches for training.
    dev_dataloader: batches handed to ``evaluate`` after every epoch.
    epochs: number of passes over ``train_dataloader``.
    checkpoint_path: where the best ``state_dict`` is written. The default keeps the
      value that used to be hard-coded (an empty string).
      NOTE(review): an empty path cannot be opened for writing — pass a real file path.

  Batches are moved to the device the model's parameters live on, so the same loop
  runs on CPU or GPU instead of requiring CUDA.
  """
  device = next(model.parameters()).device
  best_qwk = float('-inf')
  for i in range(0, epochs):
    total_loss = 0
    count = 0
    model.train()
    print("Epoch " + str(i+1))
    # Wrapping the loader itself (not an enumerate object) lets tqdm show the total.
    for feature_1, feature_2, label in tqdm(train_dataloader):
      model.zero_grad()
      batch_feature_1 = feature_1.to(device)
      batch_feature_2 = feature_2.to(device)
      batch_labels = label.to(device)
      outputs = model(batch_feature_1, batch_feature_2)
      loss = criterion(outputs, batch_labels)
      loss.backward()
      optimizer.step()
      total_loss += loss.item()
      count += 1

    qwk = evaluate(model, criterion, dev_dataloader)
    if best_qwk < qwk:
        best_qwk = qwk
        torch.save(model.state_dict(), checkpoint_path)
    # Guard against an empty training loader instead of dividing by zero.
    mean_loss = total_loss/count if count else float('nan')
    print("Epoch {} complete, train loss: {}, dev qwk: {}".format(i+1, mean_loss, qwk))

In [None]:
def evaluate(model, criterion, dev_dataloader):
  """Return the quadratic weighted kappa of ``model`` on ``dev_dataloader``.

  Args:
    model: network called as ``model(feature_1, feature_2)``.
    criterion: accepted for signature compatibility; not used.
    dev_dataloader: yields ``(feature_1, feature_2, label)`` batches whose labels
      are on the scaled range that ``covert_label`` maps back to scores.

  Inputs are moved to the device of the model's parameters rather than
  unconditionally to CUDA; labels never need to leave the CPU.
  """
  device = next(model.parameters()).device
  model.eval()
  y_pred = []
  y_true = []
  with torch.no_grad():
    for feature_1, feature_2, label in dev_dataloader:
      outputs = model(feature_1.to(device), feature_2.to(device))
      # Drop the trailing singleton dimension and collect one value per essay.
      y_pred.extend(outputs.squeeze(-1).detach().cpu().numpy())
      y_true.extend(label.squeeze(-1).detach().cpu().numpy())
  # Compare on the original score scale.
  y_true = covert_label(y_true)
  y_pred = covert_label(y_pred)
  qwk = quadratic_weighted_kappa(y_true, y_pred)
  return qwk

In [None]:
def predict(model, dataloader):
  """Run ``model`` over ``dataloader`` and return predictions and labels as scores.

  Args:
    model: network called as ``model(feature_1, feature_2)``.
    dataloader: yields ``(feature_1, feature_2, label)`` batches.

  Returns:
    Tuple ``(y_pred, y_true)`` of NumPy arrays, both mapped back to the original
    score scale by ``covert_label``.

  Inputs are moved to the device of the model's parameters rather than
  unconditionally to CUDA; labels never need to leave the CPU.
  """
  device = next(model.parameters()).device
  model.eval()
  y_pred = []
  y_true = []
  with torch.no_grad():
    for feature_1, feature_2, label in dataloader:
      outputs = model(feature_1.to(device), feature_2.to(device))
      # Drop the trailing singleton dimension and collect one value per essay.
      y_pred.extend(outputs.squeeze(-1).detach().cpu().numpy())
      y_true.extend(label.squeeze(-1).detach().cpu().numpy())
  y_true = covert_label(y_true)
  y_pred = covert_label(y_pred)
  return np.array(y_pred), np.array(y_true)

In [None]:
# (column in the prompt table, suffix used in the result column names)
_FAIRNESS_ATTRIBUTES = (
  ("Gender", "gender"),
  ("Economically_disadvantaged", "Economically_disadvantaged"),
  ("Disability", "Disability"),
  ("English_Language_Learner", "English_Language_Learner"),
  ("Race_Binary", "Race"),
)

# (prefix used in the result column names, label looked up in the fairness table)
_FAIRNESS_TESTS = (
  ("OSA", 'Overall score accuracy'),
  ("OSD", 'Overall score difference'),
  ("CSD", 'Conditional score difference'),
)


def _result_columns():
  """Return the ordered column names of the results table."""
  columns = ["prompt", "fold", "quadratic_weighted_kappa", "mean_absolute_error", "pearson_correlation_coefficient"]
  for _, suffix in _FAIRNESS_ATTRIBUTES:
    for prefix, _ in _FAIRNESS_TESTS:
      columns += [prefix + "_" + suffix, prefix + "_" + suffix + "_p_value"]
    columns += ["SMD_1_" + suffix, "SMD_0_" + suffix, "MAED_" + suffix]
  return columns


def _fairness_record(suffix, metrics, scores):
  """Flatten one attribute's ``fairness_evaluation`` outputs into result-row entries."""
  record = {}
  for prefix, label in _FAIRNESS_TESTS:
    record[prefix + "_" + suffix] = metrics[label]['R2']
    record[prefix + "_" + suffix + "_p_value"] = metrics[label]['sig']
  record["SMD_1_" + suffix] = scores['SMD_1'][0]
  record["SMD_0_" + suffix] = scores['SMD_0'][0]
  record["MAED_" + suffix] = scores['diff_mae'][0]
  return record


def run_experiment(seed):
  """Leave-one-prompt-out experiment: train on all other prompts, test on 5 folds of the held-out one.

  For every prompt the model is trained on the remaining prompts (80/20
  train/validation split), the best checkpoint is reloaded, and accuracy and
  fairness metrics are computed on each of five folds of the held-out prompt.

  Args:
    seed: integer used to seed NumPy and PyTorch so runs are repeatable. (The
      argument used to be ignored.)

  Returns:
    DataFrame with one row per (prompt, fold) and the columns from ``_result_columns``.

  NOTE(review): the data directory, checkpoint path and CSV output path are all
  empty strings here — confirm the intended locations before running.
  """
  np.random.seed(seed)
  torch.manual_seed(seed)

  columns = _result_columns()
  rows = []
  df = pd.DataFrame(columns=columns)
  criterion = CustomLoss()
  epochs = 50
  prompts = load_data("")
  batch_size = 64
  # One full train/test cycle per held-out prompt. This loop header was missing,
  # which left the body mis-indented and `i` undefined.
  for i in range(len(prompts)):
    X_train_list = []
    X_train_add_list = []
    y_train_list = []
    for j in range(len(prompts)):
      if j != i:
        X_train_list.append(prompts[j][0]['Text'])
        X_train_add_list.append(prompts[j][1])
        y_train_list.append(prompts[j][0]['Overall'])
    kfolds = split_data(prompts[i][0], 5)

    scaler = MinMaxScaler()
    X_train_add = scaler.fit_transform(pd.concat(X_train_add_list))
    X, y, maxSenNum, maxSenLen = get_features(pd.concat(X_train_list).to_numpy(), pd.concat(y_train_list).to_numpy())
    X_train, X_val, X_add_train, X_add_val, y_train, y_val = train_test_split(X, X_train_add, y, test_size=0.2, random_state=0)
    print(maxSenNum, maxSenLen)
    train_data = Data.TensorDataset(torch.tensor(X_train), torch.tensor(X_add_train), torch.tensor(y_train.reshape(-1, 1)))
    val_data = Data.TensorDataset(torch.tensor(X_val), torch.tensor(X_add_val), torch.tensor(y_val.reshape(-1, 1)))
    train_dataloader = Data.DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
    dev_dataloader = Data.DataLoader(dataset=val_data, batch_size=batch_size, shuffle=False)

    # Derive the model sizes from the data instead of hard-coding 26 and 47.
    feature_size = X_train_add.shape[1]
    pos_size = max(pos_ids.values()) + 1
    model = PAES(100, 100, batch_size, maxSenNum, maxSenLen, feature_size, pos_size)
    model.cuda()
    optimizer = torch.optim.RMSprop(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001, alpha=0.9)
    model.word_att_net.lookup.weight.requires_grad = True

    train(model, optimizer, criterion, train_dataloader, dev_dataloader, epochs)

    # Reload the weights that scored best on the validation split.
    best_model = PAES(100, 100, batch_size, maxSenNum, maxSenLen, feature_size, pos_size)
    best_model.cuda()
    best_model.load_state_dict(torch.load(''))

    for k in range(len(kfolds)):
      print("Prompt "+str(i+1)+" Fold "+str(k+1)+":")
      test_index = kfolds[k][1]
      test_info = prompts[i][0].iloc[test_index]
      X_test = test_info['Text'].to_numpy()
      X_test_add = prompts[i][1].iloc[test_index]
      y_test = test_info['Overall'].to_numpy()

      X_test, y_test = get_features_for_test(X_test, y_test, maxSenNum, maxSenLen)
      # NOTE(review): this re-fits the scaler on each test fold instead of reusing the
      # training fit (`scaler.transform`). Left unchanged because it may be intentional
      # per-prompt normalisation — confirm.
      X_test_add = scaler.fit_transform(X_test_add)
      test_data = Data.TensorDataset(torch.tensor(X_test), torch.tensor(X_test_add), torch.tensor(y_test.reshape(-1, 1)))
      test_dataloader = Data.DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)
      y_pred, y_true = predict(best_model, test_dataloader)
      qwk, mae, pearson_score = accuracy_evaluation(y_pred, y_true)
      print(str(qwk), str(mae), str(pearson_score))

      new_row = {"prompt": i+1, "fold": k+1, "quadratic_weighted_kappa": qwk, "mean_absolute_error": mae, "pearson_correlation_coefficient": pearson_score}
      for column, suffix in _FAIRNESS_ATTRIBUTES:
        metrics, scores = fairness_evaluation(y_pred, y_true, test_info[column].to_numpy())
        new_row.update(_fairness_record(suffix, metrics, scores))
      rows.append(new_row)

    # Rebuild the table from the collected records (public API, no per-row append)
    # and persist the progress after every prompt.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv('', index=False)
  return df

In [None]:
# Keep the returned results table: the next cell reads it as `df`, and the bare
# expression below still renders it as this cell's output.
df = run_experiment(0)
df

In [None]:
# Write the results table `df` to CSV without the index column.
# NOTE(review): `df` must already exist in the notebook's global namespace — confirm an earlier cell binds it.
# NOTE(review): the target path is an empty string — confirm the intended output file.
df.to_csv('', index=False)