In [None]:
import warnings
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error
from rsmtool.utils.metrics import quadratic_weighted_kappa, difference_of_standardized_means, standardized_mean_difference
from scipy.stats import pearsonr
from collections import Counter
from sklearn.metrics import cohen_kappa_score
from rsmtool.fairness_utils import get_fairness_analyses
from sklearn.model_selection import KFold
import nltk
nltk.download('punkt')
nltk.download('tagsets')
nltk.download('universal_tagset')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import ngrams
from nltk.corpus import stopwords
import string
import spacy
import lftk

In [None]:
def load_data(path):
    """Read the twelve prompt CSVs together with their task-independent feature CSVs.

    Parameters
    ----------
    path : str
        Prefix that is prepended verbatim (plain string concatenation) to every
        file name, so a directory prefix must already end with its separator.

    Returns
    -------
    list of tuple
        Twelve ``(prompt_frame, feature_frame)`` pairs, ordered prompt 1 to 12.
    """
    prompt_numbers = range(1, 13)
    feature_prefix = path + 'Task-Independent Features for Automated Essay Grading/'

    # All prompt tables are read first, then all feature tables.
    prompt_frames = [pd.read_csv(path + 'Prompt_' + str(number) + '.csv') for number in prompt_numbers]
    feature_frames = [
        pd.read_csv(feature_prefix + 'prompt_' + str(number) + '_features_independent.csv')
        for number in prompt_numbers
    ]
    return list(zip(prompt_frames, feature_frames))

def accuracy_evaluation(model, X_test, y_test):
    """Score a fitted model's predictions on held-out data.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X_test)``.
    X_test : array-like
        Feature matrix handed to ``model.predict``.
    y_test : array-like
        Reference scores the predictions are compared against.

    Returns
    -------
    tuple
        ``(quadratic weighted kappa, mean absolute error, Pearson statistic)``.
    """
    predictions = model.predict(X_test)
    return (
        quadratic_weighted_kappa(y_test, predictions),
        mean_absolute_error(y_test, predictions),
        pearsonr(y_test, predictions).statistic,
    )

def fairness_evaluation(model, X_test, y_test, demo_attribute):
    """Compute fairness statistics for one binary demographic attribute.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X_test)``.
    X_test : array-like
        Feature matrix handed to ``model.predict``.
    y_test : array-like
        Reference scores; indexed with ``np.where`` results, so callers pass
        NumPy arrays.
    demo_attribute : array-like
        Group label per test row; rows equal to 0 and rows equal to 1 form the
        two groups that are compared.

    Returns
    -------
    tuple
        ``(results, scores)``. ``results`` is item 3 of ``.values()`` on the
        second object returned by ``get_fairness_analyses``. ``scores`` is a
        one-row DataFrame with columns ``SMD_0``, ``SMD_1`` and ``diff_mae``
        (group-1 MAE minus group-0 MAE).
    """
    y_pred = model.predict(X_test)

    score_table = pd.DataFrame({"True_Score": y_test, "Prediction_Score": y_pred, "Demo": demo_attribute})
    analyses = get_fairness_analyses(
        score_table,
        group="Demo",
        system_score_column="Prediction_Score",
        human_score_column="True_Score",
    )
    # NOTE(review): positional pick — presumably the fairness-metrics table, since the
    # caller indexes it by 'Overall score accuracy' / 'R2' / 'sig'; confirm against rsmtool.
    results = analyses[1].values()[3]

    # Whole-sample statistics used to standardise both groups (np.std default ddof).
    true_mean, true_sd = np.mean(y_test), np.std(y_test)
    pred_mean, pred_sd = np.mean(y_pred), np.std(y_pred)

    members = {label: np.where(demo_attribute == label) for label in (0, 1)}
    smd = {
        label: difference_of_standardized_means(
            y_test[rows], y_pred[rows], true_mean, pred_mean, true_sd, pred_sd
        )
        for label, rows in members.items()
    }
    group_1_mae = mean_absolute_error(y_test[members[1]], y_pred[members[1]])
    group_0_mae = mean_absolute_error(y_test[members[0]], y_pred[members[0]])

    scores = pd.DataFrame({"SMD_0": [smd[0]], "SMD_1": [smd[1]], "diff_mae": [group_1_mae - group_0_mae]})
    return results, scores

def split_data(data, fold):
    """Return the ``(train_index, test_index)`` pairs of an unshuffled K-fold split.

    Parameters
    ----------
    data : sequence or DataFrame
        Object whose rows are partitioned.
    fold : int
        Number of folds, passed to ``KFold`` as ``n_splits``.

    Returns
    -------
    list of tuple
        One ``(train_index, test_index)`` tuple per fold, in ``KFold`` order.
    """
    splitter = KFold(n_splits=fold, shuffle=False)
    return [(train_index, test_index) for train_index, test_index in splitter.split(data)]

def generate_feature(data):
    """Build essay-length features from the ``Text`` column.

    Parameters
    ----------
    data : DataFrame
        Must contain a ``Text`` column of strings.

    Returns
    -------
    DataFrame
        Columns ``num_words`` and ``num_sentences``: the NLTK word-token and
        sentence counts of each text, one row per input row, in input order.
    """
    # One (sentence count, word count) pair per essay, sentences tokenised first.
    length_pairs = [
        (len(nltk.sent_tokenize(essay)), len(nltk.word_tokenize(essay)))
        for essay in data['Text'].tolist()
    ]
    return pd.DataFrame({
        "num_words": [word_count for _, word_count in length_pairs],
        "num_sentences": [sentence_count for sentence_count, _ in length_pairs],
    })

def _word_ngram_counts(text, translator, stopword_list):
    """Count the word 1-, 2- and 3-grams of one text after cleaning it."""
    # Cleaning order: strip punctuation, lower-case, tokenise, drop stop words.
    tokens = [
        word
        for word in word_tokenize(text.translate(translator).lower())
        if word not in stopword_list
    ]
    counts = Counter()
    # Unigrams are counted before bigrams before trigrams so that ties in
    # ``most_common`` resolve in a stable first-seen order.
    for order in (1, 2, 3):
        counts.update(ngrams(tokens, order))
    return counts


def _word_ngram_rows(per_text_counts, vocabulary, width):
    """Turn per-text n-gram counters into fixed-width count rows (zero-padded)."""
    padding = [0] * (width - len(vocabulary))
    # Counter lookups return 0 for n-grams a text does not contain.
    return [[counts[gram] for gram in vocabulary] + padding for counts in per_text_counts]


def generate_word_n_gram(train, test, top_k=1000):
    """Build bag-of-n-gram count features over the most frequent training n-grams.

    Each text is stripped of punctuation, lower-cased, tokenised and filtered
    for English stop words; its 1-, 2- and 3-grams are then counted. The
    ``top_k`` n-grams with the highest total count in ``train`` define the
    feature columns, so ``test`` never influences the vocabulary.

    Parameters
    ----------
    train, test : DataFrame
        Frames with a ``Text`` column of strings.
    top_k : int, optional
        Number of feature columns (default 1000).

    Returns
    -------
    tuple of DataFrame
        ``(train_features, test_features)``, each with exactly ``top_k``
        columns named ``ngram_1`` .. ``ngram_<top_k>``. If the training data
        yields fewer than ``top_k`` distinct n-grams the trailing columns are
        all zero; a fixed-length header over a shorter data row would raise
        ``ValueError``.
    """
    stopword_list = set(stopwords.words('english'))
    translator = str.maketrans("", "", string.punctuation)

    # Each text is tokenised once; the training counters feed both the
    # vocabulary and the training feature rows.
    train_counts = [_word_ngram_counts(text, translator, stopword_list) for text in train['Text'].tolist()]
    test_counts = [_word_ngram_counts(text, translator, stopword_list) for text in test['Text'].tolist()]

    corpus_counts = Counter()
    for counts in train_counts:
        corpus_counts.update(counts)
    vocabulary = [gram for gram, _ in corpus_counts.most_common(top_k)]

    column_name = ['ngram_' + str(i + 1) for i in range(top_k)]
    train_frame = pd.DataFrame(_word_ngram_rows(train_counts, vocabulary, top_k), columns=column_name)
    test_frame = pd.DataFrame(_word_ngram_rows(test_counts, vocabulary, top_k), columns=column_name)
    return train_frame, test_frame

def _pos_ngram_counts(text):
    """Count the part-of-speech-tag 1-, 2- and 3-grams of one text."""
    # The text is lower-cased before tokenising and tagging.
    tagged = nltk.pos_tag(word_tokenize(text.lower()))
    tags = [pair[1] for pair in tagged]
    counts = Counter()
    # Unigrams are counted before bigrams before trigrams so that ties in
    # ``most_common`` resolve in a stable first-seen order.
    for order in (1, 2, 3):
        counts.update(ngrams(tags, order))
    return counts


def _pos_ngram_rows(per_text_counts, vocabulary, width):
    """Turn per-text tag n-gram counters into fixed-width count rows (zero-padded)."""
    padding = [0] * (width - len(vocabulary))
    # Counter lookups return 0 for n-grams a text does not contain.
    return [[counts[gram] for gram in vocabulary] + padding for counts in per_text_counts]


def generate_pos_n_gram(train, test, top_k=1000):
    """Build count features over the most frequent POS-tag n-grams of ``train``.

    Each text is lower-cased, tokenised and tagged with ``nltk.pos_tag``; the
    1-, 2- and 3-grams of the resulting tag sequence are counted. The ``top_k``
    tag n-grams with the highest total count in ``train`` define the feature
    columns, so ``test`` never influences the vocabulary.

    Parameters
    ----------
    train, test : DataFrame
        Frames with a ``Text`` column of strings.
    top_k : int, optional
        Number of feature columns (default 1000).

    Returns
    -------
    tuple of DataFrame
        ``(train_features, test_features)``, each with exactly ``top_k``
        columns named ``pos_ngram_1`` .. ``pos_ngram_<top_k>``. If the training
        data yields fewer than ``top_k`` distinct tag n-grams the trailing
        columns are all zero; a fixed-length header over a shorter data row
        would raise ``ValueError``.
    """
    # Tagging is the expensive step, so every text is tagged exactly once and
    # the training counters are reused for the training feature rows.
    train_counts = [_pos_ngram_counts(text) for text in train['Text'].tolist()]
    test_counts = [_pos_ngram_counts(text) for text in test['Text'].tolist()]

    corpus_counts = Counter()
    for counts in train_counts:
        corpus_counts.update(counts)
    vocabulary = [gram for gram, _ in corpus_counts.most_common(top_k)]

    column_name = ['pos_ngram_' + str(i + 1) for i in range(top_k)]
    train_frame = pd.DataFrame(_pos_ngram_rows(train_counts, vocabulary, top_k), columns=column_name)
    test_frame = pd.DataFrame(_pos_ngram_rows(test_counts, vocabulary, top_k), columns=column_name)
    return train_frame, test_frame

def _partition_ngram_counts(text, translator, stopword_list, n_parts):
    """Split one cleaned text into ``n_parts`` consecutive token runs and count 1-3-grams in each."""
    # Cleaning order: strip punctuation, lower-case, tokenise, drop stop words.
    tokens = [
        word
        for word in word_tokenize(text.translate(translator).lower())
        if word not in stopword_list
    ]
    part_size = len(tokens) // n_parts
    per_part = []
    for part in range(n_parts):
        start = part * part_size
        # The last part absorbs the remainder left by the integer division
        # (and therefore the whole text when it has fewer than n_parts tokens).
        stop = start + part_size if part < n_parts - 1 else None
        segment = tokens[start:stop]
        counts = Counter()
        # Unigrams before bigrams before trigrams keeps tie order stable.
        for order in (1, 2, 3):
            counts.update(ngrams(segment, order))
        per_part.append(counts)
    return per_part


def _partition_ngram_rows(per_text_counts, vocabularies, width):
    """Concatenate, per text, one zero-padded ``width``-wide count block per partition."""
    rows = []
    for per_part in per_text_counts:
        row = []
        for counts, vocabulary in zip(per_part, vocabularies):
            # Counter lookups return 0 for n-grams the partition does not contain.
            row.extend(counts[gram] for gram in vocabulary)
            # Padding keeps every partition's block at a fixed column offset.
            row.extend([0] * (width - len(vocabulary)))
        rows.append(row)
    return rows


def generate_partition_word_ngram(train, test, top_k=1000):
    """Build position-aware word n-gram count features.

    Each text is stripped of punctuation, lower-cased, tokenised and filtered
    for English stop words, then cut into five consecutive parts of
    ``len(tokens) // 5`` tokens (the fifth part also takes the remainder).
    For every part separately, the ``top_k`` most frequent 1-, 2- and 3-grams
    across ``train`` define that part's feature columns, so ``test`` never
    influences the vocabulary.

    Parameters
    ----------
    train, test : DataFrame
        Frames with a ``Text`` column of strings.
    top_k : int, optional
        Number of feature columns per part (default 1000).

    Returns
    -------
    tuple of DataFrame
        ``(train_features, test_features)``, each with exactly ``5 * top_k``
        columns named ``partition_pos_ngram_1`` .. ``partition_pos_ngram_<5*top_k>``;
        part ``p`` (0-based) occupies columns ``p*top_k`` to ``(p+1)*top_k - 1``.
        A part whose training vocabulary has fewer than ``top_k`` n-grams is
        zero-padded; a fixed-length header over a shorter data row would raise
        ``ValueError``.
    """
    # NOTE(review): the column prefix says "pos" although these are word n-grams;
    # it is kept unchanged so the output column names stay the same.
    n_parts = 5
    stopword_list = set(stopwords.words('english'))
    translator = str.maketrans("", "", string.punctuation)

    # Each text is tokenised and partitioned once; the training counters feed
    # both the vocabularies and the training feature rows.
    train_counts = [
        _partition_ngram_counts(text, translator, stopword_list, n_parts)
        for text in train['Text'].tolist()
    ]
    test_counts = [
        _partition_ngram_counts(text, translator, stopword_list, n_parts)
        for text in test['Text'].tolist()
    ]

    vocabularies = []
    for part in range(n_parts):
        corpus_counts = Counter()
        for per_part in train_counts:
            corpus_counts.update(per_part[part])
        vocabularies.append([gram for gram, _ in corpus_counts.most_common(top_k)])

    column_name = ['partition_pos_ngram_' + str(i + 1) for i in range(n_parts * top_k)]
    train_frame = pd.DataFrame(_partition_ngram_rows(train_counts, vocabularies, top_k), columns=column_name)
    test_frame = pd.DataFrame(_partition_ngram_rows(test_counts, vocabularies, top_k), columns=column_name)
    return train_frame, test_frame

In [None]:
def cross_validation(prompts_list):
    """Run 5-fold cross-validation per prompt and tabulate accuracy and fairness metrics.

    For every prompt and fold, word, POS and partitioned word n-gram features
    are built from that fold's training essays, joined with the essay-length
    and task-independent features, standardised with a ``StandardScaler``
    fitted on the training rows, and used to fit an ``SVR`` on the ``Overall``
    score. The fitted model is then evaluated on the held-out rows.

    Parameters
    ----------
    prompts_list : list of tuple
        ``(essays, independent_features)`` DataFrame pairs. ``essays`` needs
        the columns ``Text``, ``Overall``, ``Gender``,
        ``Economically_disadvantaged``, ``Disability``,
        ``English_Language_Learner`` and ``Race_Binary``.

    Returns
    -------
    DataFrame
        One row per (prompt, fold) with ``prompt``, ``fold``, the three
        accuracy metrics, and for each demographic attribute the
        OSA/OSD/CSD ``R2`` and ``sig`` values plus ``SMD_1``, ``SMD_0`` and
        ``MAED``.
    """
    # (suffix used in the result column names, column read from the essay frame)
    demographics = [
        ("gender", "Gender"),
        ("Economically_disadvantaged", "Economically_disadvantaged"),
        ("Disability", "Disability"),
        ("English_Language_Learner", "English_Language_Learner"),
        ("Race", "Race_Binary"),
    ]
    columns = ["prompt", "fold", "quadratic_weighted_kappa", "mean_absolute_error", "pearson_correlation_coefficient"]
    for suffix, _ in demographics:
        columns.extend([
            "OSA_" + suffix, "OSA_" + suffix + "_p_value",
            "OSD_" + suffix, "OSD_" + suffix + "_p_value",
            "CSD_" + suffix, "CSD_" + suffix + "_p_value",
            "SMD_1_" + suffix, "SMD_0_" + suffix, "MAED_" + suffix,
        ])

    # Rows are gathered in a list and the frame is built once at the end:
    # DataFrame.append no longer exists in pandas 2.x and re-copied the whole
    # frame on every call in earlier versions.
    rows = []
    for i in tqdm(range(len(prompts_list))):
        essays = prompts_list[i][0]
        independent_features = prompts_list[i][1]
        kfolds = split_data(essays, 5)
        # Length and task-independent features do not depend on the fold.
        features = pd.concat([generate_feature(essays), independent_features], axis=1)

        for k, (train_index, test_index) in enumerate(tqdm(kfolds)):
            train_essays = essays.iloc[train_index]
            test_essays = essays.iloc[test_index]

            # N-gram vocabularies are rebuilt per fold from the training essays only.
            train_features_1, test_features_1 = generate_word_n_gram(train_essays, test_essays)
            train_features_2, test_features_2 = generate_pos_n_gram(train_essays, test_essays)
            train_features_3, test_features_3 = generate_partition_word_ngram(train_essays, test_essays)

            # reset_index aligns the fold's slice with the 0-based n-gram frames.
            X_train = pd.concat([train_features_1, train_features_2, train_features_3,
                                 features.iloc[train_index].reset_index(drop=True)], axis=1)
            X_test = pd.concat([test_features_1, test_features_2, test_features_3,
                                features.iloc[test_index].reset_index(drop=True)], axis=1)
            y_train = train_essays['Overall'].to_numpy()
            y_test = test_essays['Overall'].to_numpy()

            ssc = StandardScaler()
            X_train = ssc.fit_transform(X_train)
            model = SVR()
            model.fit(X_train, y_train)
            X_test = ssc.transform(X_test)

            qwk, mae, pearson_score = accuracy_evaluation(model, X_test, y_test)
            new_row = {
                "prompt": i + 1,
                "fold": k + 1,
                "quadratic_weighted_kappa": qwk,
                "mean_absolute_error": mae,
                "pearson_correlation_coefficient": pearson_score,
            }
            for suffix, source_column in demographics:
                fairness_part1, fairness_part2 = fairness_evaluation(
                    model, X_test, y_test, test_essays[source_column].to_numpy()
                )
                new_row["OSA_" + suffix] = fairness_part1['Overall score accuracy']['R2']
                new_row["OSA_" + suffix + "_p_value"] = fairness_part1['Overall score accuracy']['sig']
                new_row["OSD_" + suffix] = fairness_part1['Overall score difference']['R2']
                new_row["OSD_" + suffix + "_p_value"] = fairness_part1['Overall score difference']['sig']
                new_row["CSD_" + suffix] = fairness_part1['Conditional score difference']['R2']
                new_row["CSD_" + suffix + "_p_value"] = fairness_part1['Conditional score difference']['sig']
                new_row["SMD_1_" + suffix] = fairness_part2['SMD_1'][0]
                new_row["SMD_0_" + suffix] = fairness_part2['SMD_0'][0]
                new_row["MAED_" + suffix] = fairness_part2['diff_mae'][0]
            rows.append(new_row)

    return pd.DataFrame(rows, columns=columns)
            
            

In [None]:
# Load the twelve (prompt, features) pairs. The path prefix is empty, so every
# file name is resolved relative to the current working directory.
prompts = load_data("")

In [None]:
# Run the per-prompt cross-validation; yields one row of accuracy and fairness
# metrics for every (prompt, fold) combination.
result = cross_validation(prompts)

In [None]:
# Write the metrics table to CSV without the index column.
# NOTE(review): the output path is an empty string — presumably a placeholder;
# supply a real file name before running this cell.
result.to_csv('', index=False)

In [None]:
# Show the metrics table as the cell's output.
result