<a href="https://colab.research.google.com/github/gabriel-ab/ufrpe-answer-analyser/blob/main/notebooks/open_question_scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
from google.colab import drive
drive.mount('/content/drive');

In [2]:
# Parameters
DATASET = '/content/drive/MyDrive/Research/Projeto Correção de questão/Data/TexasDataset/EXCEL/texas_dataset.xlsx'
GLOVE_S50 = "/content/drive/MyDrive/Colab Notebooks/Embeddings/GloVe/glove.6B.50d.txt"
FASTTEXT_S50 = "/content/drive/MyDrive/Colab Notebooks/Embeddings/FastText/cc.en.50.txt"

GET_FASTTEXT_EMB = False
GET_WORD2VEC_EMB = True

RANDOM_STATE = 42
TEST_SIZE = 0.20
VAL_SIZE = 0.20
EMBEDDING_DIM = 50
LSTM_UNITS = 128
BATCH_SIZE = 128
EPOCHS = 11
MAX_LENGTH = 30
MAX_TOKENS = 20000

In [3]:
%%capture
!pip install -U spacy fasttext

In [4]:
import random
import re
import numpy as np
import tensorflow as tf
import pandas as pd
import nltk
import tqdm
import keras
import fasttext
import fasttext.util
import joblib
import tensorflow as tf
from tensorflow import keras
from tqdm.notebook import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, classification_report
from gensim.test.utils import common_texts
from gensim.models import Word2Vec, FastText
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [5]:
%%capture
# Get FastText embedding for english language and reduce its dimension to 50

if GET_FASTTEXT_EMB:
  fasttext.util.download_model('en', if_exists='ignore')
  ft = fasttext.load_model("cc.en.300.bin")
  print(f"Current dimension: {ft.get_dimension()}")
  print("Reducing dimension...")
  fasttext.util.reduce_model(ft, 50)
  print("Done!")
  print(f"New dimension: {ft.get_dimension()}")
  ft.save_model("/content/drive/MyDrive/Embeddings/FastText/cc.en.50.bin")
  model = FastText.load_fasttext_format("/content/drive/MyDrive/Embeddings/FastText/cc.en.50.bin")
  model.wv.save_word2vec_format("/content/drive/MyDrive/Embeddings/FastText/cc.en.50.txt", binary=False)

In [6]:
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

In [7]:
tqdm_notebook.pandas()

# Preprocessing

In [8]:
columns_dict = {
  "Topic": "topic",
  "Human Evaluation": "human_evaluation",
  "Round_score": "human_evaluation",
  "Question": "question",
  "Student Answer": "answer",
  "Question_ID": "question_id",
  "Instructor answers": "intructor_answers",
  "Score": "score"
}
dataset = pd.read_excel(DATASET)
dataset.rename(columns=columns_dict, inplace=True)
dataset.head()

Unnamed: 0,ID,question_id,question,intructor_answers,answer,score,human_evaluation
0,1,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,High risk problems are address in the prototy...,3.5,4
1,2,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,To simulate portions of the desired final pro...,5.0,5
2,3,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,A prototype program simulates the behaviors o...,4.0,4
3,4,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,Defined in the Specification phase a prototyp...,5.0,5
4,5,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,It is used to let the users have a first idea...,3.0,3


In [9]:
dataset['answer'] = dataset['answer'].astype(str)
dataset['question'] = dataset['question'].astype(str)
dataset['question_id'] = dataset['question_id'].astype(str)
dataset.drop('score', inplace=True, axis=1)

## Text cleaning

In [10]:
nltk.download("stopwords")
stopwords = nltk.corpus.stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def clean_text(text: str) -> str:
  # Remove punctuations and numbers
  text = re.sub(r'[^a-zA-Z]', ' ', text)

  # Remove single characters
  text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

  # Remove multiple spaces
  text = re.sub(r'\s+', ' ', text)

  return text.strip().lower()

def remove_stopwords(text: str) -> str:
  return ' '.join([token for token in text.split() if token not in stopwords])

In [12]:
dataset["input"] = dataset.progress_apply(lambda row: row["question"] + " " + row["answer"], axis=1)
dataset["input"] = dataset["input"].progress_apply(lambda x: clean_text(remove_stopwords(x)))

  0%|          | 0/2442 [00:00<?, ?it/s]

  0%|          | 0/2442 [00:00<?, ?it/s]

In [13]:
dataset.head(3)

Unnamed: 0,ID,question_id,question,intructor_answers,answer,human_evaluation,input
0,1,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,High risk problems are address in the prototy...,4,what role prototype program problem solving hi...
1,2,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,To simulate portions of the desired final pro...,5,what role prototype program problem solving to...
2,3,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,A prototype program simulates the behaviors o...,4,what role prototype program problem solving pr...


## Data spliting

In [14]:
X = dataset.drop("human_evaluation", axis=1)
y = dataset["human_evaluation"]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, shuffle=True, random_state=RANDOM_STATE, stratify=y)
len(X_train), len(X_test), len(y_train), len(y_test)

(1953, 489, 1953, 489)

In [16]:
y_test.unique()

array([5, 4, 3, 1, 2, 0])

## Dealing with Unbalanced Data

In [17]:
dataset = pd.concat([X_train, y_train], axis=1)
dataset.head()

Unnamed: 0,ID,question_id,question,intructor_answers,answer,input,human_evaluation
962,963,2023-01-06,What is a pointer?,A variable that contains the address in memory...,It is like a variable however instead of holdi...,what pointer it like variable however instead ...,5
2194,2195,2023-02-12,What is the experimental approach for measurin...,Implement the algorithm and measure the physic...,Experimental means you would actually write a ...,what experimental approach measuring running t...,5
46,47,2023-02-01,What stages in the software life cycle are in...,The testing stage can influence both the codi...,"Depending on how the work is done, Testing is...",what stages software life cycle influenced tes...,2
1426,1427,2023-05-08,Which implementation (array-based vs. list-bas...,"Link-based, because they are dynamic (no size ...",Array-based prevents the push operation from ...,which implementation array based vs list based...,5
2441,2442,2023-10-12,How many steps does it take to search a node i...,The height of the tree.,it depends on the install search tree then fro...,how many steps take search node binary search ...,2


In [18]:
dataset["human_evaluation"].value_counts()

5    1239
4     329
3     232
2     113
1      21
0      19
Name: human_evaluation, dtype: int64

In [19]:
label_5_ds = dataset[dataset["human_evaluation"] == 5]
label_5_ds = pd.concat([label_5_ds] * 1, axis=0, ignore_index=True)
len(label_5_ds)

1239

In [20]:
label_4_ds = dataset[dataset["human_evaluation"] == 4]
label_4_ds = pd.concat([label_4_ds] * 4, axis=0, ignore_index=True)
len(label_4_ds)

1316

In [21]:
label_3_ds = dataset[dataset["human_evaluation"] == 3]
label_3_ds = pd.concat([label_3_ds] * 5, axis=0, ignore_index=True)
len(label_3_ds)

1160

In [22]:
label_2_ds = dataset[dataset["human_evaluation"] == 2]
label_2_ds = pd.concat([label_2_ds] * 10, axis=0, ignore_index=True)
len(label_2_ds)

1130

In [23]:
label_1_ds = dataset[dataset["human_evaluation"] == 1]
label_1_ds = pd.concat([label_1_ds] * 50, axis=0, ignore_index=True)
len(label_1_ds)

1050

In [24]:
label_0_ds = dataset[dataset["human_evaluation"] == 0]
label_0_ds = pd.concat([label_0_ds] * 53, axis=0, ignore_index=True)
len(label_0_ds)

1007

In [25]:
dataset = pd.concat([label_0_ds, label_1_ds, label_2_ds, label_3_ds, label_4_ds, label_5_ds], axis=0)
len(dataset)

6902

In [26]:
dataset = dataset.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
dataset.head()

Unnamed: 0,ID,question_id,question,intructor_answers,answer,input,human_evaluation
0,1223,2023-04-07,How are linked lists passed as arguments to a ...,By reference.,not answered,how linked lists passed arguments function ans...,0
1,1777,2023-04-10,What is a binary tree?,A tree for which the maximum number of childre...,A binary search tree is a tree that also has t...,what binary tree binary search tree tree also ...,5
2,2240,2023-03-12,Order the following functions by their running...,log(log n); 2^(log n) ; n^2 ; n^3; n!,longest to shortest:<br>n^3; n!; n^2; 2^(log n...,order following functions running time log log...,3
3,690,2023-02-04,What is the main difference between strings de...,The strings declared using an array of charact...,array it is the collection of similar data ty...,what main difference strings declared using ty...,3
4,2422,2023-10-12,How many steps does it take to search a node i...,The height of the tree.,2^n where n is the # of levels the binary tree...,how many steps take search node binary search ...,2


In [27]:
X_train = dataset.drop("human_evaluation", axis=1)
y_train = dataset["human_evaluation"]

# GloVe

## Tokenization and Embedding

In [28]:
embeddings_index = {}

with open(GLOVE_S50, encoding='utf8') as glove_file:
  for line in glove_file:
    try:
      records = line.split()
      word = records[0]
      vector_dimensions = np.asarray(records[1:], dtype='float32')
      embeddings_index[word] = vector_dimensions
    except:
      print(records)

In [29]:
text_vectorization = keras.layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_mode='int',
    output_sequence_length=MAX_LENGTH
)

text_vectorization.adapt(X_train["input"])

In [30]:
text_vectorization_layer_model = tf.keras.models.Sequential()
text_vectorization_layer_model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
text_vectorization_layer_model.add(text_vectorization)
text_vectorization_layer_model.summary()
text_vectorization_layer_model.save("glove_text_vec_layer_model.keras")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVe  (None, 30)                0         
 ctorization)                                                    
                                                                 
Total params: 0 (0.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [31]:
vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

embedding_matrix = np.zeros((MAX_TOKENS, EMBEDDING_DIM))
for word, i in word_index.items():
  if i < MAX_TOKENS:
    embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None and embedding_vector.shape[0] != 51:
    embedding_matrix[i] = embedding_vector

## Training

In [32]:
def build_model():
    embedding_layer = keras.layers.Embedding(
        MAX_TOKENS,
        EMBEDDING_DIM,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
        mask_zero=True
    )
    inputs = keras.Input(shape=(MAX_LENGTH,), dtype='int32')
    embedded = embedding_layer(inputs)
    x = keras.layers.Bidirectional(keras.layers.LSTM(32))(embedded)
    x = keras.layers.Dropout(0.2)(x)
    x = keras.layers.Dense(64, activation='relu')(x)
    x = keras.layers.Dropout(0.2)(x)
    outputs = keras.layers.Dense(6, activation='softmax')(x)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    model.summary()
    return model

In [33]:
callbacks = [
  keras.callbacks.ModelCheckpoint("glove_oq_scorer_model.keras", save_best_only=True)
]

int_train_ds = text_vectorization(X_train['input'])
train_targets = tf.convert_to_tensor(y_train)

int_test_ds = text_vectorization(X_test['input'])
test_targets = tf.convert_to_tensor(y_test)

model = build_model()
model.fit(x=int_train_ds, y=train_targets, validation_split=VAL_SIZE, batch_size=BATCH_SIZE, epochs=EPOCHS, callbacks=callbacks)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 30)]              0         
                                                                 
 embedding (Embedding)       (None, 30, 50)            1000000   
                                                                 
 bidirectional (Bidirection  (None, 64)                21248     
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                             

<keras.src.callbacks.History at 0x79e0c41a3040>

In [34]:
y_pred = model.predict(int_test_ds)
y_pred = np.argmax(y_pred, axis=1)
cohen_kappa_score(y_test, y_pred)



0.2708715445008908

In [35]:
y_pred = model.predict(int_test_ds)
y_pred = np.argmax(y_pred, axis=1)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.71      1.00      0.83         5
           1       1.00      0.40      0.57         5
           2       0.26      0.34      0.30        29
           3       0.27      0.45      0.34        58
           4       0.27      0.39      0.32        82
           5       0.82      0.61      0.70       310

    accuracy                           0.54       489
   macro avg       0.56      0.53      0.51       489
weighted avg       0.63      0.54      0.57       489



# FastText

## Tokenization and Embedding

In [36]:
embeddings_index = {}

with open(FASTTEXT_S50, encoding='utf8') as glove_file:
  for line in glove_file:
    try:
      records = line.split()
      word = records[0]
      vector_dimensions = np.asarray(records[1:], dtype='float32')
      embeddings_index[word] = vector_dimensions
    except:
      print(records)

In [37]:
text_vectorization = keras.layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_mode='int',
    output_sequence_length=MAX_LENGTH
)

text_vectorization.adapt(X_train["input"])

In [38]:
text_vectorization_layer_model = tf.keras.models.Sequential()
text_vectorization_layer_model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
text_vectorization_layer_model.add(text_vectorization)
text_vectorization_layer_model.summary()
text_vectorization_layer_model.save("fasttext_text_vec_layer_model.keras")

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (Text  (None, 30)                0         
 Vectorization)                                                  
                                                                 
Total params: 0 (0.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [39]:
vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

embedding_matrix = np.zeros((MAX_TOKENS, EMBEDDING_DIM))
for word, i in word_index.items():
  if i < MAX_TOKENS:
    embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None and embedding_vector.shape[0] != 51:
    embedding_matrix[i] = embedding_vector

## Training

In [40]:
callbacks = [
  keras.callbacks.ModelCheckpoint("fasttext_oq_scorer_model.keras", save_best_only=True)
]

int_train_ds = text_vectorization(X_train['input'])
train_targets = tf.convert_to_tensor(y_train)

int_test_ds = text_vectorization(X_test['input'])
test_targets = tf.convert_to_tensor(y_test)

model = build_model()
model.fit(x=int_train_ds, y=train_targets, validation_split=VAL_SIZE, batch_size=BATCH_SIZE, epochs=EPOCHS, callbacks=callbacks)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 30)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 30, 50)            1000000   
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                21248     
 onal)                                                           
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                           

<keras.src.callbacks.History at 0x79e0b33c58d0>

In [41]:
y_pred = model.predict(int_test_ds)
y_pred = np.argmax(y_pred, axis=1)
cohen_kappa_score(y_test, y_pred)



0.07330002242080658

In [42]:
y_pred = model.predict(int_test_ds)
y_pred = np.argmax(y_pred, axis=1)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.33      0.80      0.47         5
           1       0.15      0.40      0.22         5
           2       0.13      0.48      0.20        29
           3       0.14      0.17      0.15        58
           4       0.19      0.43      0.26        82
           5       0.74      0.22      0.34       310

    accuracy                           0.27       489
   macro avg       0.28      0.42      0.27       489
weighted avg       0.53      0.27      0.30       489

