This model feeds BERT tokens into an XGBoost regressor for every score column except grammar; the grammar column is instead predicted by a separate grammar model (XGBoost trained on grammar data preprocessed using BERT).

# Import Packages

In [1]:
import pandas as pd
import numpy as np

# Import Data

In [2]:
# Load the training table that holds the per-essay score columns.
train_dataset = pd.read_csv(filepath_or_buffer="./data/train.csv")

In [3]:
# Pull each of the six score columns out of the training frame as a Series.
cohesion, syntax, vocabulary, phraseology, grammar, conventions = (
    train_dataset[column]
    for column in ('cohesion', 'syntax', 'vocabulary',
                   'phraseology', 'grammar', 'conventions')
)

In [4]:
# Rebind every score column to its NumPy-array form.
syntax, cohesion, vocabulary, phraseology, grammar, conventions = map(
    np.asarray,
    (syntax, cohesion, vocabulary, phraseology, grammar, conventions),
)

In [5]:
# Load the precomputed BERT token features.
# NOTE(review): presumably one row per essay, in the same order as train.csv — confirm.
train_token = np.load(file="./data/train_BertToken.npy")

In [6]:
# Target matrix for the five non-grammar scores: the score arrays are stacked
# as rows and then transposed so that each score becomes a column.
# Grammar is deliberately left out; it is modelled separately further down.
y_train = np.asarray(
    [cohesion, syntax, vocabulary, phraseology, conventions]
).T

## MCRMSE of the Model

In [7]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.multioutput import MultiOutputRegressor



In [8]:
from sklearn.model_selection import train_test_split

In [9]:
import xgboost as xgb

## Training on cohesion, syntax, vocabulary, phraseology, conventions

In [10]:
# Split the BERT token features and the five non-grammar targets 80/20.
# The full target matrix is rebuilt here from the per-column arrays instead of
# being read from `y_train`: this cell rebinds `y_train` to its training slice,
# so re-running the cell against the old name would hand train_test_split a
# shortened target next to the full-length `train_token` and fail with
# inconsistent sample counts.
all_targets = np.transpose(np.asarray([cohesion, syntax, vocabulary, phraseology, conventions]))
train_train, train_test, y_train, y_test = train_test_split(train_token, all_targets,
                                                       shuffle = True,
                                                       random_state = 440,
                                                       test_size = .2)

In [11]:
# Fit a depth-2, 100-tree XGBoost regressor to each target column;
# MultiOutputRegressor wraps the single-output estimator for the
# multi-column y_train.
xgb_estimator = xgb.XGBRegressor(n_estimators=100,
                                max_depth = 2,
                                objective='reg:squarederror')
model = MultiOutputRegressor(xgb_estimator, n_jobs=3)

model.fit(train_train, y_train)
predicted = model.predict(train_test)

# Baseline: predict the training-set mean of each column for every test row
# (the (n, 1) column of ones broadcasts against the per-column means).
bsl = np.ones((len(y_test),1))*np.mean(y_train, axis=0)

# Per-column RMSE, taken as the square root of the per-column MSE.
# `squared=False` is avoided on purpose: it was deprecated in scikit-learn 1.4
# and removed in 1.6, whereas sqrt(MSE) gives the same values on every version.
rmses = np.sqrt(MSE(y_test, predicted, multioutput='raw_values'))

baseline = np.sqrt(MSE(y_test, bsl, multioutput='raw_values'))

In [12]:
# Report the mean of the per-column RMSEs for the five non-grammar scores.
print(
    'MCRMSE for XGBoost on cohesion, syntax, vocabulary, phraseology, conventions is',
    rmses.mean(),
)

MCRMSE for XGBoost on cohesion, syntax, vocabulary, phraseology, conventions is 0.6395944512416154


In [13]:
# Report the same metric for the predict-the-training-mean baseline.
print(
    'MCRMSE for Baseline on cohesion, syntax, vocabulary, phraseology, conventions is',
    baseline.mean(),
)

MCRMSE for Baseline on cohesion, syntax, vocabulary, phraseology, conventions is 0.6515055628623492


## Training on grammar using preprocessed grammar data

In [14]:
# Load the preprocessed grammar table; the first CSV column is used as the index.
train_grammar = pd.read_csv(
    filepath_or_buffer='./Grammar/grammar_train_comb.csv',
    index_col=0,
)

In [15]:
# Feature matrix for the grammar model: the two preprocessed grammar columns.
data_arr = np.asarray(
    train_grammar.loc[:, ['ratio_grammar_correct_sentences', 'sentence_number']]
)

In [16]:
# Grammar scores as a NumPy array (the regression target).
target_arr = np.asarray(train_grammar.loc[:, 'grammar_score'])

In [17]:
# Turn the target into an explicit single-column 2-D array of shape (n, 1).
target_arr = np.reshape(target_arr, (len(target_arr), 1))

In [18]:
# 80/20 shuffled split of the grammar features and scores (seed 440).
grammar_train, grammar_test, grammar_y_train, grammar_y_test = train_test_split(
    data_arr,
    target_arr,
    test_size=.2,
    random_state=440,
    shuffle=True,
)

In [19]:
# Fit a depth-2, 100-tree XGBoost regressor on the preprocessed grammar
# features. Note that this rebinds `model`, replacing whatever it held before.
model = xgb.XGBRegressor(n_estimators=100,
                            max_depth = 2,
                            objective='reg:squarederror')

model.fit(grammar_train, grammar_y_train)
predicted = model.predict(grammar_test)

# Baseline: predict the mean training grammar score for every test row.
bsl = np.ones((len(grammar_y_test),1))*np.mean(grammar_y_train)

# RMSE as the square root of the MSE; 'raw_values' keeps the result a
# length-1 array so it can be concatenated with the other per-column RMSEs.
# `squared=False` is avoided on purpose: it was deprecated in scikit-learn 1.4
# and removed in 1.6, whereas sqrt(MSE) gives the same values on every version.
rmse_grammar = np.sqrt(MSE(grammar_y_test, predicted, multioutput='raw_values'))

baseline_grammar = np.sqrt(MSE(grammar_y_test, bsl, multioutput='raw_values'))

In [20]:
# Report the grammar model's RMSE (the only entry of the raw-values array).
print(
    'RMSE for XGBoost on Grammar is',
    rmse_grammar[0],
)

RMSE for XGBoost on Grammar is 0.5185646392718115


In [21]:
# Report the RMSE of the mean-score baseline on the grammar test split.
print(
    'RMSE for Baseline on Grammar is',
    baseline_grammar[0],
)

RMSE for Baseline on Grammar is 0.6938332271124678


## Calculate final MCRMSE

In [22]:
# Average the five per-column RMSEs together with the grammar RMSE.
# NOTE(review): the two groups were scored on separate train/test splits of
# different inputs (train_token vs. grammar_train_comb.csv); confirm the test
# rows cover the same essays before treating this as a single MCRMSE.
print(
    'MCRMSE for 6 columns is',
    np.concatenate([rmses, rmse_grammar]).mean(),
)

MCRMSE for 6 columns is 0.6194228159133147


In [23]:
# Average the five per-column baseline RMSEs together with the grammar baseline.
# NOTE(review): the two groups come from separate train/test splits of
# different inputs; confirm the test rows are comparable.
print(
    'MCRMSE of Baseline is',
    np.concatenate([baseline, baseline_grammar]).mean(),
)

MCRMSE of Baseline is 0.6585601735707022
