# INSTALL LIBRARIES & IMPORT DEPENDENCIES

In [None]:
import importlib

def install_package(package_name, import_name=None):
    """Install ``package_name`` with pip unless it can already be imported.

    Args:
        package_name: Distribution name handed to ``pip install``.
        import_name: Module name used for the import probe. Defaults to
            ``package_name``; pass it when the two differ (e.g. the
            ``pyspellchecker`` distribution is imported as ``spellchecker``).

    Returns:
        None. Progress is reported on standard output.
    """
    import subprocess
    import sys

    module_name = import_name or package_name
    try:
        importlib.import_module(module_name)
    except ImportError:
        # Call pip through the running interpreter so the package lands in the
        # active environment; this also works outside IPython, unlike `!pip`.
        completed = subprocess.run(
            [sys.executable, "-m", "pip", "install", package_name],
            check=False,
        )
        if completed.returncode != 0:
            # Best effort, like a shell escape: report the failure, do not raise.
            print(f"pip could not install {package_name} (exit code {completed.returncode}).")
    else:
        print(f"{package_name} is already installed.")

# Third-party packages the notebook relies on, checked/installed one by one.
# NOTE(review): 'pyspellchecker' is imported below as `spellchecker`, so probing
# it under its distribution name presumably never succeeds and pip is invoked on
# every run -- confirm and adjust if needed.
for required_package in (
    "textstat",
    "langdetect",
    'pyspellchecker',
    "scikeras",
    "transformers",
    'tensorflow_hub',
    'tensorflow_text',
    'wget',
):
    install_package(required_package)

In [None]:
# Import Data Handling Libraries
import pandas as pd
import numpy as np

# Import Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import Natural Language Processing Libraries
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Import Text Analysis and Preprocessing Libraries
import re
from collections import defaultdict
import textstat
from spellchecker import SpellChecker

# Import Machine Learning and Deep Learning Libraries
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, GRU, Flatten, Dense,
    BatchNormalization, Dropout, Concatenate, Lambda, Bidirectional
)
from tensorflow.keras.utils import to_categorical
from keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier


# Import Machine Learning Libraries for Text Processing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, confusion_matrix
from scipy.sparse import csr_matrix, save_npz, load_npz

import os
import wget

# IMPORT DATASET

In [None]:
# DATASET: IELTS Writing Scored Essays Dataset
# LINK: https://www.kaggle.com/datasets/mazlumi/ielts-writing-scored-essays-dataset
data = pd.read_csv('https://raw.githubusercontent.com/andrealolli13/Text-Mining-and-Natural-Language-Processing/main/ielts_writing_dataset.csv')

# display dataset
print('IELTS Writing Scored Essays Dataset:\n')
display(data.head())

# display dataset infos
print('\nDataset Informations:\n')
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display(): that only added a stray "None" to the cell output.
data.info()

# print percentage missing values
print('\nPercentage Missing Values per Column:\n')
for col, val in data.isna().sum().items():
    print(f"Missing values in {col} = {round((val/data.shape[0])*100,2)}%")

# remove the examiner comment and the per-criterion score columns; drop() without
# inplace returns a new frame, so `data` itself is left untouched
df = data.drop(columns=['Examiner_Commen', 'Task_Response', 'Coherence_Cohesion', 'Lexical_Resource', 'Range_Accuracy'])

# display new dataset
print('\nIELTS Writing Scored Essays Dataset NEW:\n')
display(df.head())

# DATA EXPLORATION

In [None]:
# Pull the four columns used throughout the notebook out of `df` as Series.
TASK_TYPE, QUESTIONS, ESSAYS, OVERALL = (
    df[column_name] for column_name in ('Task_Type', 'Question', 'Essay', 'Overall')
)

## CATEGORICAL DATA:

### TASK TYPE

In [None]:
# How often each distinct "Task_Type" value occurs.
task_type_counts = TASK_TYPE.value_counts()

# Wedge labels/sizes come straight from the counts; the first wedge is pulled out
# slightly via `explode` (which lists one offset per wedge, two in total).
labels = task_type_counts.index.tolist()
sizes = task_type_counts.values.tolist()
colors = sns.color_palette('Blues')
explode = (.1, 0)

fig, ax = plt.subplots(figsize=(3, 3))
ax.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%.1f %%',
    explode=explode,
    shadow=True,
    startangle=140,
)
ax.set_title('Task Type Distribution')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

### OVERALL

In [None]:
# Bar chart with the number of essays per "Overall" band score.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Overall', data=df, color='skyblue', ax=ax)

ax.set_xlabel('OVERALL')
ax.set_ylabel('Count')
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.set_title('Distribution of OVERALL Classes')

plt.show()

In [None]:
from IPython.display import Image
# Remote image shown as this cell's output; judging by the file name it is
# presumably the IELTS/CEFR comparison chart -- confirm the URL is still served.
image_url = 'https://www.ielts.org/-/media/images/resources/cefr-ielts-300px.ashx?la=en&hash=B8E1C54B853D375FD4E4E1EF6FF8867002477A51'
Image(url=image_url)

In [None]:
# Convert IELTS overall scores to CEFR (Common European Framework of Reference) levels.
# Bands are listed from highest to lowest; a score receives the first band whose
# lower bound it reaches.
_CEFR_LOWER_BOUNDS = ((8.0, 'C2'), (6.5, 'C1'), (5.0, 'B2'), (4.0, 'B1'))

CEFR_OVERALL = []

for point in OVERALL:
    for lower_bound, level in _CEFR_LOWER_BOUNDS:
        if point >= lower_bound:
            CEFR_OVERALL.append(level)
            break
    else:
        # Below every bound: basic user (A1 or A2). The comparison is kept explicit
        # so that a score that compares False everywhere (NaN) gets no label at all.
        if point < 4.0:
            CEFR_OVERALL.append('A')

CEFR_OVERALL = np.array(CEFR_OVERALL)
CEFR_OVERALL

In [None]:
# Bar chart with the number of essays per CEFR level, in ascending level order.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    x=CEFR_OVERALL,
    color='skyblue',
    order=['A', 'B1', 'B2', 'C1', 'C2'],
    ax=ax,
)

ax.set_xlabel('OVERALL')
ax.set_ylabel('Count')
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.set_title('Distribution of OVERALL Classes')

plt.show()

# NEW FEATURES (ESSAY-BASED)

In [None]:
# Punctuation and other symbols must not be counted as (parts of) words, so the
# feature extractors blank them out first. The pattern is compiled once here.
_SYMBOLS_PATTERN = re.compile(r'[!@#$%^&*()_+{}\[\]:;<>,.?/~\\|-]')


def remove_symbols(input_string):
    """Return ``input_string`` with every listed symbol replaced by one space.

    Quotes, apostrophes, digits and ``=`` are not in the symbol set and are kept.
    """
    return _SYMBOLS_PATTERN.sub(' ', input_string)

## NUMBER OF MISSING WORDS TO MEET LENGTH REQUIREMENTS
Test-takers are expected to write an essay of at least 250 words.
Writing fewer than 250 words may result in a score penalty.

In [None]:
MINIMUM_WORDS = 250  # required essay length, in words

# Number of words each essay falls short of MINIMUM_WORDS (0 when it is long
# enough). Words are the whitespace-separated chunks left after symbols are
# blanked out. The threshold is read from MINIMUM_WORDS only, so changing the
# constant changes the comparison and the subtraction together.
MISSING_WORDS = np.array([
    max(0, MINIMUM_WORDS - len(remove_symbols(essay).split()))
    for essay in ESSAYS
])
MISSING_WORDS

## MEAN NUMBER OF WORDS PER SENTENCE

In [None]:
MEAN_SENTENCE_LENGTH = []

for essay in ESSAYS:

    # A "sentence" is whatever lies between two full stops; fragments that hold
    # no words once symbols are blanked out (e.g. trailing whitespace) are skipped.
    sentences_length = [
        len(words)
        for words in (remove_symbols(sentence).split() for sentence in essay.split('.'))
        if words
    ]

    if sentences_length:
        # mean length truncated to an integer
        sentences_mean_length = int(sum(sentences_length) / len(sentences_length))
    else:
        # An essay without a single word has no sentences: record 0 instead of
        # failing with ZeroDivisionError.
        sentences_mean_length = 0

    MEAN_SENTENCE_LENGTH.append(sentences_mean_length)

MEAN_SENTENCE_LENGTH = np.array(MEAN_SENTENCE_LENGTH)
MEAN_SENTENCE_LENGTH

## VOCABULARY RICHNESS
Count how many unique words are used. (We apply the following preprocessing techniques: Lowercasing, Symbols Removal, Stemming)

In [None]:
stemmer = PorterStemmer()

# Per essay: lowercase, blank out symbols, split on whitespace, stem every word
# and count the distinct stems.
UNIQUE_WORDS = np.array([
    len({stemmer.stem(token) for token in remove_symbols(essay.lower()).split()})
    for essay in ESSAYS
])
UNIQUE_WORDS

## READABILITY SCORES
**Flesch-Kincaid Grade Level:**
<code>0.39 * (average sentence length) + 11.8 * (average syllables per word) - 15.59
</code>
- Assesses text readability considering sentence length and word syllables.
- Scores are represented as U.S. school grades, with 8.0 indicating eighth-grade readability.
- Widely used for educational materials to ensure comprehension by specific grade levels.
- Range between: (0, 20.0)

**Gunning Fog Index:**
<code>0.4 * [(average sentence length) + (percentage of complex words)]
</code>
- Measures text complexity by examining sentence length and the presence of complex words (those with three or more syllables).
- Like Flesch-Kincaid, it reports scores in U.S. school grades.
- Especially useful for evaluating technical, legal, or scientific documents, as it focuses on vocabulary complexity alongside sentence structure.
- Range between: (0, 20.0)


**Key Differences:**
- Complex Words: The primary difference is that the Gunning Fog Index explicitly considers complex words (those with three or more syllables), while the Flesch-Kincaid Grade Level does not directly account for word complexity.
- Formulas: The formulas used for calculation are different, although they both rely on sentence length as a factor.
- Applications: Both metrics are suitable for assessing readability, but the choice between them may depend on the specific context and the type of text being analyzed. The Gunning Fog Index may be more suitable when you want to pay particular attention to complex vocabulary.



In [None]:
# Readability of every raw essay as reported by textstat: Flesch-Kincaid grade
# level and Gunning fog index, one value per essay.
FK_GRADE_LEVEL = np.array([textstat.flesch_kincaid_grade(essay) for essay in ESSAYS])
GUNNING_FOG_INDEX = np.array([textstat.gunning_fog(essay) for essay in ESSAYS])

FK_GRADE_LEVEL, GUNNING_FOG_INDEX

## USE OF TRANSITIONAL WORDS

In [None]:
# Lower-case transition words/phrases, grouped into four categories. They are
# looked up in the lower-cased, symbol-stripped essays to build one feature per
# category.
# NOTE(review): the categories overlap -- 'on the other hand' is listed as both
# additive and adversative, and 'so' / 'thus' as both causal and sequential --
# so one occurrence can contribute to two features.

# Additive Transitions
additive_transitions = ['in all honesty', 'as well as this', 'much less', 'indeed', 'nor',
       'on the other hand', 'to tell the truth', 'to say nothing of',
       'furthermore', 'and', 'besides this', 'in addition to this',
       'alternatively', 'either', 'in the first place', 'actually', 'or',
       'let alone', 'additionally', 'not only this but also that as well',
       'too', 'as a matter of fact', 'in fact', 'moreover', 'further',
       'not to mention this', 'what is more']

# Adversative Transitions
adversative_transitions = ['whatever happens', 'yet', 'though', 'in either case',
       'on the other hand', 'nevertheless', 'above all', 'but',
       'at least', 'even more', 'whichever happens', 'in contrast',
       'but even so', 'still', 'although', 'while', 'in either event',
       'however', 'conversely', 'either way', 'whereas']

# Causal Transitions
causal_transitions = ['in the event', 'and so', 'as a result', 'with this intention',
       'as a consequence', 'for this reason', 'with this in mind',
       'that being the case', 'so', 'in consequence', 'then', 'therefore',
       'because', 'so much that', 'under those circumstances',
       'consequently', 'hence', 'for the purpose of', 'thus',
       'accordingly', 'on the condition', 'granting']

# Sequential Transitions
sequential_transitions = ['by the way', 'initially', 'anyhow', 'in sum', 'in the place',
       'next', 'to conclude with', 'so', 'in short', 'subsequently',
       'to start with', 'to change the topic', 'to begin with',
       'afterward', 'after this', 'secondly', 'as was previously stated',
       'before', 'as a final point', 'last but not least', 'finally',
       'thus', 'to get back to the point', 'to resume', 'incidentally']

In [None]:
def _count_transitions(padded_text, transitions):
    """Return how many entries of ``transitions`` occur in ``padded_text`` as whole words.

    ``padded_text`` must be single-space separated and start and end with a
    space, so that looking for ``' phrase '`` only matches complete words.
    """
    return sum(f' {phrase} ' in padded_text for phrase in transitions)


ADDITIVE_TRAN = []
ADVERSATIVE_TRAN = []
CAUSAL_TRAN = []
SEQUENTIAL_TRAN = []

for essay in ESSAYS:

    # Lowercase, blank out symbols, collapse runs of whitespace and pad both
    # ends. A plain substring test would also find 'or' inside 'for', 'so'
    # inside 'also' or 'and' inside 'understand', inflating every count.
    padded_essay = ' ' + ' '.join(remove_symbols(essay.lower()).split()) + ' '

    # Each feature is the number of distinct transitions of a category that
    # appear in the essay (presence, not frequency).
    ADDITIVE_TRAN.append(_count_transitions(padded_essay, additive_transitions))
    ADVERSATIVE_TRAN.append(_count_transitions(padded_essay, adversative_transitions))
    CAUSAL_TRAN.append(_count_transitions(padded_essay, causal_transitions))
    SEQUENTIAL_TRAN.append(_count_transitions(padded_essay, sequential_transitions))

ADDITIVE_TRAN = np.array(ADDITIVE_TRAN)
ADVERSATIVE_TRAN = np.array(ADVERSATIVE_TRAN)
CAUSAL_TRAN = np.array(CAUSAL_TRAN)
SEQUENTIAL_TRAN = np.array(SEQUENTIAL_TRAN)

ADDITIVE_TRAN, ADVERSATIVE_TRAN, CAUSAL_TRAN, SEQUENTIAL_TRAN

## GRAMMAR AND SPELLING ERRORS

In [None]:
# Spell checker created with language='en'.
spell_checker_gb = SpellChecker(language='en')

# Per essay: lowercase, blank out symbols, split on whitespace, hand the words to
# the checker and store the size of the collection of unknown words it returns.
GRAMMAR_SPELLING_ERRORS = np.array([
    len(spell_checker_gb.unknown(remove_symbols(essay.lower()).split()))
    for essay in ESSAYS
])
GRAMMAR_SPELLING_ERRORS

# NEW DATASET

In [None]:
# Gather the raw columns, the engineered features and the CEFR label in one table.
# Note that `data` is rebound here: from now on it is this column mapping.
data = dict(
    Task_Type=TASK_TYPE,
    Question=QUESTIONS,
    Essay=ESSAYS,
    Missing_Words=MISSING_WORDS,
    Mean_Sentence_Length=MEAN_SENTENCE_LENGTH,
    Vocabulary_Richness=UNIQUE_WORDS,
    FK_Grade_Level=FK_GRADE_LEVEL,
    Gunning_Fog_Index=GUNNING_FOG_INDEX,
    Additive_Transitions=ADDITIVE_TRAN,
    Adversative_Transitions=ADVERSATIVE_TRAN,
    Causal_Transitions=CAUSAL_TRAN,
    Sequential_Transitions=SEQUENTIAL_TRAN,
    Grammar_Spelling_Errors=GRAMMAR_SPELLING_ERRORS,
    CEFR_Overall=CEFR_OVERALL,
)

new_df = pd.DataFrame(data)
new_df.head()

# DATA PREPROCESSING

## NUMERICAL DATA

In [None]:
def standardize_array(arr):
    """Return ``arr`` shifted to zero mean and scaled to unit standard deviation.

    The population standard deviation (``np.std`` default) is used.

    Args:
        arr: Numeric array-like to standardize.

    Returns:
        The standardized values. When every element is identical (standard
        deviation 0) the centred values -- all zeros -- are returned instead of
        dividing by zero.
    """
    centered = arr - np.mean(arr)
    std_dev = np.std(arr)

    if std_dev == 0:
        # Constant input: scaling would yield NaN (0 / 0) for every element.
        return centered

    return centered / std_dev

In [None]:
# Standardize the numeric feature arrays (missing words, sentence length, unique
# words, Flesch-Kincaid grade level, Gunning fog index, spelling errors).
(
    STD_MISSING_WORDS,
    STD_MEAN_SENTENCE_LENGTH,
    STD_UNIQUE_WORDS,
    STD_FK_GRADE_LEVEL,
    STD_GUNNING_FOG_INDEX,
    STD_GRAMMAR_SPELLING_ERRORS,
) = [
    standardize_array(feature)
    for feature in (
        MISSING_WORDS,
        MEAN_SENTENCE_LENGTH,
        UNIQUE_WORDS,
        FK_GRADE_LEVEL,
        GUNNING_FOG_INDEX,
        GRAMMAR_SPELLING_ERRORS,
    )
]

## TEXTUAL DATA

In [None]:
# One document per sample: the question followed by its essay, space-separated.
CORPUS = [question + ' ' + essay for question, essay in zip(QUESTIONS, ESSAYS)]

In [None]:
# Resources shared by text_preprocessing: English stopword set and lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def text_preprocessing(data):
    """Turn each text of ``data`` into a list of cleaned, lemmatized tokens.

    Steps per text: replace everything but ASCII letters and whitespace with a
    space, lowercase, tokenize, drop stopwords, lemmatize, and finally drop
    one-character tokens.

    Args:
        data: Iterable of strings.

    Returns:
        A list with one token list per input text, in input order.
    """
    def _clean(document):
        letters_only = re.sub(r'[^a-zA-Z\s]', ' ', document).lower()
        kept = (token for token in word_tokenize(letters_only) if token not in stop_words)
        lemmas = (lemmatizer.lemmatize(token) for token in kept)
        return [lemma for lemma in lemmas if len(lemma) > 1]

    return [_clean(document) for document in data]

In [None]:
PREP_CORPUS = text_preprocessing(CORPUS)

# Show the first document before and after preprocessing, with a divider between.
print(
    f"BEFORE PREPROCESSING.\n\nCORPUS:\n{CORPUS[0]}",
    "\n- - - - - - - - \n",
    f"AFTER PREPROCESSING.\n\nCORPUS:\n{PREP_CORPUS[0]}",
    sep="\n",
)

In [None]:
# Document lengths before preprocessing (whitespace-separated words) and after
# preprocessing (tokens).
raw_lengths = [len(document.split()) for document in CORPUS]
prep_lengths = [len(tokens) for tokens in PREP_CORPUS]

MAX_LENGTH_CORPUS = max(raw_lengths)
MEAN_LENGTH_CORPUS = sum(raw_lengths) / len(raw_lengths)

MAX_LENGTH_PREP_CORPUS = max(prep_lengths)
MENA_LENGTH_PREP_CORPUS = sum(prep_lengths) / len(prep_lengths)

# Two bar groups (max, mean), each with one bar per corpus version.
labels = ['Max Length', 'Mean Length']
corpus_values = [MAX_LENGTH_CORPUS, MEAN_LENGTH_CORPUS]
prep_corpus_values = [MAX_LENGTH_PREP_CORPUS, MENA_LENGTH_PREP_CORPUS]

x = range(len(labels))
bar_width = 0.35

fig, ax = plt.subplots()
bar1 = ax.bar(x, corpus_values, width=bar_width, label='CORPUS')
bar2 = ax.bar([i + bar_width for i in x], prep_corpus_values, width=bar_width, label='PREP_CORPUS')

ax.set_xlabel('Statistics')
ax.set_ylabel('Length')
ax.set_title('Max and Mean Lengths of CORPUS and PREP_CORPUS')
ax.set_xticks([i + bar_width / 2 for i in x])  # tick centred between the two bars
ax.set_xticklabels(labels)
ax.legend()

# Write each bar's value (one decimal place) just above it.
for bar in (*bar1, *bar2):
    height = bar.get_height()
    ax.annotate(
        f'{height:.1f}',
        xy=(bar.get_x() + bar.get_width() / 2, height),
        xytext=(0, 3),  # 3 points vertical offset
        textcoords='offset points',
        ha='center',
        va='bottom',
    )

plt.show()

In [None]:
# Distinct tokens of the preprocessed corpus. They are sorted because the
# iteration order of a set of strings can change between interpreter sessions,
# which would give every word a different index on each run.
VOCABULARY = np.array(sorted({word for prep_essay in PREP_CORPUS for word in prep_essay}))

word_to_int = {w: i for i, w in enumerate(VOCABULARY)}
# The padding/unknown token takes the first free index. A hard-coded index is
# only right for one exact vocabulary size: with any other size it either
# collides with a real word or falls outside len(word_to_int).
word_to_int['<UNK>'] = len(VOCABULARY)
int_to_word = {i: w for w, i in word_to_int.items()}

In [None]:
# Right-pad every preprocessed document with '<UNK>' up to 314 tokens; documents
# that are already that long (or longer) are copied unchanged.
EQUAL_LENGTH_CORP = [
    list(prep_essay) + ['<UNK>'] * (314 - len(prep_essay))
    for prep_essay in PREP_CORPUS
]

# VECTOR REPRESENTATION

## PPMI MATRIX

In [None]:
def compute_ppmi_matrix(tokens, vocab=word_to_int):
    """Compute the positive pointwise mutual information (PPMI) matrix of a corpus.

    Co-occurrences are counted inside a symmetric window of 3 tokens on each
    side of every target token. For each pair seen at least once,
    ``PMI = log(count * total / (row_sum * col_sum))`` and negative values are
    clipped to 0; pairs never seen stay 0.

    Args:
        tokens: Iterable of token lists (one per document).
        vocab: Mapping from token to row/column index.

    Returns:
        Dense ``float64`` array of shape ``(len(vocab), len(vocab))``.

    Raises:
        KeyError: If a token is missing from ``vocab``.
    """
    from collections import Counter

    window_size = 3
    vocab_size = len(vocab)

    # Count (target, context) index pairs sparsely rather than in a dense
    # vocab x vocab matrix: only a small fraction of pairs ever co-occur.
    pair_counts = Counter()
    for sentence in tokens:
        indices = [vocab[word] for word in sentence]
        for i, target_idx in enumerate(indices):
            start = max(0, i - window_size)
            end = min(len(indices), i + window_size + 1)
            for j in range(start, end):
                if j != i:
                    pair_counts[target_idx, indices[j]] += 1

    # The dense result is the only vocab x vocab allocation made here.
    pmi_matrix = np.zeros((vocab_size, vocab_size), dtype=np.float64)
    if not pair_counts:
        return pmi_matrix

    pairs = np.array(list(pair_counts.keys()), dtype=np.intp)
    counts = np.array(list(pair_counts.values()), dtype=np.float64)
    rows, cols = pairs[:, 0], pairs[:, 1]

    # Marginal totals per target (row) and per context (column).
    sum_rows = np.bincount(rows, weights=counts, minlength=vocab_size)
    sum_cols = np.bincount(cols, weights=counts, minlength=vocab_size)
    total_sum = counts.sum()

    # Only observed pairs are evaluated, so every denominator is non-zero.
    pmi = np.log((counts * total_sum) / (sum_rows[rows] * sum_cols[cols]))
    pmi_matrix[rows, cols] = np.maximum(pmi, 0)  # apply PPMI transformation

    return pmi_matrix


# PPMI matrix of the preprocessed corpus, printed together with its shape.
ppmi_matrix = compute_ppmi_matrix(PREP_CORPUS)
print(ppmi_matrix, ppmi_matrix.shape, sep="\n")

## TF-IDF MATRIX

In [None]:
# TfidfVectorizer expects strings, so the token lists are re-joined with spaces.
CORP = [' '.join(c) for c in PREP_CORPUS]

# An integer min_df has to be at least 1 (recent scikit-learn releases reject 0
# during parameter validation). min_df=1 keeps every term, which is what
# min_df=0 was asking for.
vectorizer = TfidfVectorizer(stop_words=None, min_df=1, max_df=1.0)
tfidf_matrix = vectorizer.fit_transform(CORP) # Compute TF-IDF matrix
feature_names = vectorizer.get_feature_names_out() # Get the vocabulary (unique words) as feature names
dense_tfidf_matrix = tfidf_matrix.toarray()

print("TF-IDF Matrix:")
print(dense_tfidf_matrix)
print(dense_tfidf_matrix.shape)

print("Feature Names (Vocabulary):")
print(feature_names)

In [None]:
# Append an all-zero column for the '<UNK>' token to the dense TF-IDF matrix.
num_rows = dense_tfidf_matrix.shape[0]
zeros_column = np.zeros((num_rows, 1))
dense_tfidf_matrix_with_zeros = np.concatenate((dense_tfidf_matrix, zeros_column), axis=1)

# The shape is reported for the transposed matrix.
print("Updated TF-IDF Matrix Shape:", dense_tfidf_matrix_with_zeros.T.shape, sep="\n")

## GLOVE EMBEDDINGS

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
def load_glove_embeddings(file_path):
    """Load word vectors from a GloVe-style text file.

    Every non-blank line is split on whitespace: the first field is the word,
    the remaining fields are the vector components.

    Args:
        file_path: Path of the UTF-8 encoded text file.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array.
    """
    word_embeddings = {}

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word, *components = values
            word_embeddings[word] = np.array(components, dtype='float32')

    return word_embeddings

# Load the GloVe vectors from "glove.6B.300d.txt" in the working directory; the
# file must already be there, this cell does not download it.
# Google Colab alternative (after mounting Drive):
# glove_embeddings_300d = load_glove_embeddings("/content/drive/MyDrive/Colab Notebooks/glove.6B.300d.txt")
glove_embeddings_300d = load_glove_embeddings("glove.6B.300d.txt")

In [None]:
def create_embedding_matrix(glove_embeddings, embedding_dim, vocab_size = len(word_to_int)):
    """Build the embedding matrix for the words of the global ``word_to_int``.

    Row ``i`` holds the GloVe vector of the word mapped to index ``i``; words
    without a GloVe vector keep an all-zero row.

    Args:
        glove_embeddings: Mapping from word to vector.
        embedding_dim: Length of each vector.
        vocab_size: Number of rows; the default is taken from ``word_to_int``
            when this function is defined.

    Returns:
        Array of shape ``(vocab_size, embedding_dim)``.
    """
    matrix = np.zeros((vocab_size, embedding_dim))

    for token, row in word_to_int.items():
        vector = glove_embeddings.get(token)
        if vector is None:
            continue  # word unknown to GloVe: leave its row at zero
        matrix[row] = vector

    return matrix

glove_emb_300d = create_embedding_matrix(glove_embeddings_300d, 300)

# FINAL DATASET

In [None]:
# Replace every token of the padded corpus by its vocabulary index.
CORP_TO_INT = [
    [word_to_int[token] for token in document]
    for document in EQUAL_LENGTH_CORP
]

data = CORP_TO_INT

# Stack the numerical features as rows, then transpose so that each sample
# becomes a row and each feature a column.
feature_rows = (
    TASK_TYPE.values,
    STD_MISSING_WORDS,
    STD_UNIQUE_WORDS,
    STD_FK_GRADE_LEVEL,
    STD_GUNNING_FOG_INDEX,
    ADDITIVE_TRAN,
    ADVERSATIVE_TRAN,
    CAUSAL_TRAN,
    SEQUENTIAL_TRAN,
    STD_GRAMMAR_SPELLING_ERRORS,
)
numerical = np.vstack(feature_rows).T
print(f"Numerical Dataset shape:{numerical.shape}")

In [None]:
# join textual and numerical data
# Fresh rows are built from CORP_TO_INT instead of appending to its lists in
# place: `data` is an alias of CORP_TO_INT, so an in-place append adds the
# numerical features again every time this cell is re-run, widening each row.
if len(CORP_TO_INT) != len(numerical):
    raise ValueError(
        f"{len(CORP_TO_INT)} token sequences but {len(numerical)} numerical feature rows"
    )
data = [
    list(token_ids) + list(features)
    for token_ids, features in zip(CORP_TO_INT, numerical)
]

In [None]:
# Encode the CEFR labels as integers, then one-hot encode them; the feature rows
# become one 2-D array.
label_encoder = LabelEncoder()
num_classes = len(np.unique(CEFR_OVERALL))
encoded_labels = label_encoder.fit_transform(CEFR_OVERALL)

y = to_categorical(encoded_labels, num_classes)
X = np.array(data)

X.shape, y.shape

## TRAIN, TEST & VALIDATION SETS

In [None]:
# Both splits hold out 10% and use the same seed.
split_options = dict(test_size=0.1, random_state=42)

# First split: training vs. test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, **split_options)

# Second split: the remaining training data vs. validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

# MODEL

In [None]:
def create_model(input_shape=None,input_length=None,output_shape=None, embedding_matrix=None,
                learning_rate=10**-1, loss='categorical_crossentropy', optimizer='adam',
                gru_1_units=15, gru_2_units=15, hid_1_units=32, hid_2_units=64,
                gru_activation='tanh', hid_activation='relu', out_activation='softmax',
                kernel_initializer='glorot_uniform', kernel_regularizer='l1', dropout_rate=0.1):
    """Build and compile the essay classifier.

    Each input row holds ``input_length`` token ids followed by the numerical
    features. The token ids pass through a frozen embedding layer and two
    bidirectional GRU layers; the flattened result is concatenated with the
    numerical features and fed to two dense blocks and a final dense layer.

    Args:
        input_shape: Shape of one input row, e.g. ``(324,)``.
        input_length: Number of leading token-id columns. When None, 314 (the
            padded document length used in this notebook) is assumed.
        output_shape: Number of output units. When None, 5 is used.
        embedding_matrix: Pre-trained embedding weights, shape (vocab, dim).
        learning_rate: Learning rate applied when ``optimizer`` is given by
            name. The default of 0.1 is large for adaptive optimizers, so pass
            an explicit value when tuning other hyper-parameters.
        loss: Loss passed to ``compile``.
        optimizer: Optimizer name, or a ready-made optimizer instance (used
            unchanged, ``learning_rate`` is then ignored).
        gru_1_units, gru_2_units: Units of the two GRU layers.
        hid_1_units, hid_2_units: Units of the two hidden dense layers.
        gru_activation, hid_activation, out_activation: Layer activations.
        kernel_initializer, kernel_regularizer: Applied to GRU and hidden dense
            kernels.
        dropout_rate: Dropout used inside the GRUs and after each dense block.

    Returns:
        The compiled Keras ``Model`` (metric: accuracy).
    """
    text_length = 314 if input_length is None else input_length
    n_outputs = 5 if output_shape is None else output_shape

    input_layer = Input(shape=input_shape)

    # Split exactly at the end of the token ids. A fixed cut one column later
    # would send the first numerical feature through the embedding and drop it
    # from the numerical branch.
    text_input = Lambda(lambda x: x[:, :text_length])(input_layer)
    numerical_input = Lambda(lambda x: x[:, text_length:])(input_layer)

    embedding = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=input_length,
        trainable=False)(text_input)

    bi_gru_1 = Bidirectional(GRU(
        units=gru_1_units,
        activation=gru_activation,
        kernel_initializer=kernel_initializer,
        kernel_regularizer=kernel_regularizer,
        dropout=dropout_rate,
        return_sequences=True))(embedding)

    bi_gru_2 = Bidirectional(GRU(
        units=gru_2_units,
        activation=gru_activation,
        kernel_initializer=kernel_initializer,
        kernel_regularizer=kernel_regularizer,
        dropout=dropout_rate,
        return_sequences=True))(bi_gru_1)

    flatten = Flatten()(bi_gru_2)

    concat = Concatenate()([flatten, numerical_input])

    dense_1 = Dense(
        units=hid_1_units,
        activation=hid_activation,
        kernel_initializer=kernel_initializer,
        kernel_regularizer=kernel_regularizer)(concat)

    bn_1 = BatchNormalization()(dense_1)
    dropout_1 = Dropout(dropout_rate)(bn_1)

    dense_2 = Dense(
        units=hid_2_units,
        activation=hid_activation,
        kernel_initializer=kernel_initializer,
        kernel_regularizer=kernel_regularizer)(dropout_1)

    bn_2 = BatchNormalization()(dense_2)
    dropout_2 = Dropout(dropout_rate)(bn_2)

    output_layer = Dense(
        units = n_outputs,
        activation = out_activation
    )(dropout_2)

    model = Model(inputs=input_layer, outputs=output_layer)

    if isinstance(optimizer, str):
        # Compiling with a bare name would train at that optimizer's built-in
        # default rate and silently ignore `learning_rate`; instantiate the
        # named optimizer class with the requested rate instead.
        optimizer_class = type(tf.keras.optimizers.get(optimizer))
        optimizer = optimizer_class(learning_rate=learning_rate)

    model.compile(
        loss=loss,
        optimizer=optimizer,
        metrics=['accuracy']
    )

    return model

# HYPER-PARAMETER TUNING

## RNN ACTIVATION FUNCTION

In [None]:
# Grid search over the activation of the recurrent (GRU) layers.
model = KerasClassifier(
    model=create_model,
    input_shape=(324,),
    input_length=314,
    embedding_matrix=glove_emb_300d,
    epochs=3,
)

activation_functions = ['relu', 'sigmoid', 'tanh']
param_grid = {'model__gru_activation': activation_functions}

GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=2,
    verbose=1,
)
grid_result = GS.fit(X_train, y_train)

# Best configuration first, then the mean test score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print(f"{mean} \t with: {param}")

## NN ACTIVATION FUNCTION

In [None]:
# Grid search over the activation of the hidden dense layers.
model = KerasClassifier(
    model=create_model,
    input_shape=(324,),
    input_length=314,
    embedding_matrix=glove_emb_300d,
    gru_activation='tanh',
    epochs=3,
)

activation_functions = ['relu', 'sigmoid', 'tanh']
param_grid = {'model__hid_activation': activation_functions}

GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=2,
    verbose=1,
)
grid_result = GS.fit(X_train, y_train)

# Best configuration first, then the mean test score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print(f"{mean} \t with: {param}")

## RNN UNITS

In [None]:
# Grid search over the number of units of the two GRU layers.
model = KerasClassifier(
    model=create_model,
    input_shape=(324,),
    input_length=314,
    embedding_matrix=glove_emb_300d,
    gru_activation='tanh',
    hid_activation='tanh',
    epochs=2,
)

gru_1_units = [16, 32]
gru_2_units = [16, 32]
param_grid = {
    'model__gru_1_units': gru_1_units,
    'model__gru_2_units': gru_2_units,
}

GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=2,
    verbose=1,
)
grid_result = GS.fit(X_train, y_train)

# Best configuration first, then the mean test score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_,
                             grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print(f"{mean} \t with: {param}")

## NN UNITS

In [None]:
# Grid search over the number of units of the two hidden dense layers.
model = KerasClassifier(
    model=create_model,
    input_shape=(324,),
    input_length=314,
    embedding_matrix=glove_emb_300d,
    gru_activation='tanh',
    hid_activation='tanh',
    gru_1_units=32,
    gru_2_units=32,
    epochs=2,
)

hid_1_units = [32, 64]
hid_2_units = [32, 64]
param_grid = {
    'model__hid_1_units': hid_1_units,
    'model__hid_2_units': hid_2_units,
}

GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=2,
    verbose=1,
)
grid_result = GS.fit(X_train, y_train)

# Best configuration first, then the mean test score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print(f"{mean} \t with: {param}")

## LEARNING RATE

In [None]:
# Grid search over the learning rate.
model = KerasClassifier(
    model=create_model,
    input_shape=(324,),
    input_length=314,
    embedding_matrix=glove_emb_300d,
    gru_activation='tanh',
    hid_activation='tanh',
    gru_1_units=32,
    gru_2_units=32,
    hid_1_units=64,
    hid_2_units=32,
    epochs=3,
)

lr = [10**-3, 10**-2, 10**-1]
param_grid = {'model__learning_rate': lr}

GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=2,
    verbose=1,
)
grid_result = GS.fit(X_train, y_train)

# Best configuration first, then the mean test score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print(f"{mean} \t with: {param}")

## DROPOUT RATE

In [None]:
# Grid search over the dropout rate.
model = KerasClassifier(
    model=create_model,
    input_shape=(324,),
    input_length=314,
    embedding_matrix=glove_emb_300d,
    gru_activation='tanh',
    hid_activation='tanh',
    gru_1_units=32,
    gru_2_units=32,
    hid_1_units=64,
    hid_2_units=32,
    learning_rate=10**-2,
    epochs=3,
)

dropout_rates = [0.0, 0.1 ,0.2]
param_grid = {'model__dropout_rate': dropout_rates}

GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=2,
    verbose=1,
)
grid_result = GS.fit(X_train, y_train)

# Best configuration first, then the mean test score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print(f"{mean} \t with: {param}")

## KERNEL REGULARIZER

In [None]:
# Grid search over the kernel regularizer (None means no regularization).
model = KerasClassifier(
    model=create_model,
    input_shape=(324,),
    input_length=314,
    embedding_matrix=glove_emb_300d,
    gru_activation='tanh',
    hid_activation='tanh',
    gru_1_units=32,
    gru_2_units=32,
    hid_1_units=64,
    hid_2_units=32,
    learning_rate=10**-2,
    dropout_rate=0.2,
    epochs=3,
)

kernel_regularizer = [None, 'l1', 'l2', 'l1_l2']
param_grid = {'model__kernel_regularizer': kernel_regularizer}

GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=2,
    verbose=1,
)
grid_result = GS.fit(X_train, y_train)

# Best configuration first, then the mean test score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print(f"{mean} \t with: {param}")

## KERNEL INITIALIZER

In [None]:
# Grid search over the kernel initializer.
model = KerasClassifier(
    model=create_model,
    input_shape=(324,),
    input_length=314,
    embedding_matrix=glove_emb_300d,
    gru_activation='tanh',
    hid_activation='tanh',
    gru_1_units=32,
    gru_2_units=32,
    hid_1_units=64,
    hid_2_units=32,
    learning_rate=10**-2,
    dropout_rate=0.2,
    kernel_regularizer=None,
    epochs=3,
)

kernel_initializers = ['glorot_uniform', 'he_normal']
param_grid = {'model__kernel_initializer': kernel_initializers}

GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=2,
    verbose=1,
)
grid_result = GS.fit(X_train, y_train)

# Best configuration first, then the mean test score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print(f"{mean} \t with: {param}")

## OPTIMIZER

In [None]:
# Grid search over the optimizer.
# NOTE(review): this cell fixes hid_activation='relu' while the neighbouring
# tuning cells fix 'tanh' -- confirm which value is intended.
model = KerasClassifier(
    model=create_model,
    input_shape=(324,),
    input_length=314,
    embedding_matrix=glove_emb_300d,
    gru_activation='tanh',
    hid_activation='relu',
    gru_1_units=32,
    gru_2_units=32,
    hid_1_units=64,
    hid_2_units=32,
    learning_rate=10**-2,
    dropout_rate=0.2,
    kernel_regularizer=None,
    kernel_initializer='glorot_uniform',
    epochs=3,
)

optimizers = ['adam', 'sgd']
param_grid = {'model__optimizer': optimizers}

GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=2,
    verbose=1,
)
grid_result = GS.fit(X_train, y_train)

# Best configuration first, then the mean test score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print(f"{mean} \t with: {param}")

## BATCH SIZE

In [None]:
# Wrap `create_model` in a scikeras classifier so it can serve as the
# GridSearchCV estimator. Everything except the batch size is pinned.
model = KerasClassifier(
    model=create_model, input_shape=(324,), input_length=314, embedding_matrix=glove_emb_300d,
    gru_activation='tanh', hid_activation='tanh', gru_1_units=32, gru_2_units=32, hid_1_units=64, hid_2_units=32,
    learning_rate=10**-2, dropout_rate=0.2, kernel_regularizer=None, kernel_initializer='glorot_uniform',
    optimizer='adam',
    epochs=2)

# Batch sizes to compare. `None` was removed from the grid: Keras falls back
# to a batch size of 32 when none is given, so `None` and `32` trained the
# exact same configuration twice and only added cross-validation time.
batch_size = [16, 32, 64]
# `batch_size` is a parameter of the KerasClassifier wrapper itself, so it
# takes no `model__` prefix.
param_grid = dict(batch_size=batch_size)

# 2-fold cross-validated search scored by accuracy on all available cores.
GS = GridSearchCV(
    estimator = model, param_grid = param_grid, scoring = 'accuracy',
    n_jobs=-1, cv = 2, verbose=1)

grid_result = GS.fit(X_train, y_train)

# best result
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
# printing results for all combinations
for mean, param in zip(means, params):
    print(f"{mean} \t with: {param}")

# MODEL TRAINING

In [None]:
# Final model built on the GloVe embedding matrix.
GLOVE_model = create_model(
    input_shape=(324,),
    input_length=314,
    embedding_matrix=glove_emb_300d,
    gru_activation='tanh',
    hid_activation='relu',
    gru_1_units=16,
    gru_2_units=16,
    hid_1_units=32,
    hid_2_units=32,
    learning_rate=10**-3,
    dropout_rate=0.2,
    kernel_regularizer='l2',
    kernel_initializer='glorot_uniform',
    optimizer='adam',
)

# Train for 15 epochs, tracking metrics on the validation split each epoch.
GLOVE_history = GLOVE_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=15,
    shuffle=True,
)

In [None]:
# Final model built on the transposed dense TF-IDF matrix as embedding matrix.
TF_IDF_model = create_model(
    input_shape=(324,),
    input_length=314,
    embedding_matrix=dense_tfidf_matrix_with_zeros.T,
    gru_activation='tanh',
    hid_activation='tanh',
    gru_1_units=16,
    gru_2_units=16,
    hid_1_units=32,
    hid_2_units=32,
    learning_rate=10**-3,
    dropout_rate=0.2,
    kernel_regularizer='l2',
    kernel_initializer='glorot_uniform',
    optimizer='adam',
)

# Train for 15 epochs, tracking metrics on the validation split each epoch.
TF_IDF_history = TF_IDF_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=15,
    shuffle=True,
)

In [None]:
# Final model built on the PPMI matrix as embedding matrix.
PPMI_model = create_model(
    input_shape=(324,),
    input_length=314,
    embedding_matrix=ppmi_matrix,
    gru_activation='tanh',
    hid_activation='tanh',
    gru_1_units=16,
    gru_2_units=16,
    hid_1_units=32,
    hid_2_units=32,
    learning_rate=10**-3,
    dropout_rate=0.2,
    kernel_regularizer='l2',
    kernel_initializer='glorot_uniform',
    optimizer='adam',
)

# Train for 15 epochs, tracking metrics on the validation split each epoch.
PPMI_history = PPMI_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=15,
    shuffle=True,
)

# MODEL EVALUATION

In [None]:
# Test-set predictions of the three trained models, computed in the order
# GloVe, TF-IDF, PPMI.
glove_y_pred, tf_idf_y_pred, ppmi_y_pred = (
    trained.predict(X_test)
    for trained in (GLOVE_model, TF_IDF_model, PPMI_model)
)

# Test-set loss and accuracy of each model, in the same order.
(
    (glove_loss, glove_accuracy),
    (tf_idf_loss, tf_idf_accuracy),
    (ppmi_loss, ppmi_accuracy),
) = (
    trained.evaluate(X_test, y_test)
    for trained in (GLOVE_model, TF_IDF_model, PPMI_model)
)

In [None]:
# Per-epoch training and validation accuracy recorded while fitting each model.
glove_training_accuracy = GLOVE_history.history['accuracy']
glove_validation_accuracy = GLOVE_history.history['val_accuracy']
tf_idf_training_accuracy = TF_IDF_history.history['accuracy']
tf_idf_validation_accuracy = TF_IDF_history.history['val_accuracy']
ppmi_training_accuracy = PPMI_history.history['accuracy']
ppmi_validation_accuracy = PPMI_history.history['val_accuracy']

# Shared x-axis: 1-based epoch numbers, sized from the GloVe history.
epochs = range(1, len(glove_validation_accuracy) + 1)

plt.figure(figsize=(12, 6))

# One colour per model; solid line = training, dashed line = validation.
accuracy_curves = (
    ('GLOVE Train Acc', 'GLOVE Val Acc', 'green',
     glove_training_accuracy, glove_validation_accuracy),
    ('PPMI Train Acc', 'PPMI Val Acc', 'red',
     ppmi_training_accuracy, ppmi_validation_accuracy),
    ('TF-IDF Train Acc', 'TF-IDF Val Acc', 'purple',
     tf_idf_training_accuracy, tf_idf_validation_accuracy),
)
for train_label, val_label, curve_color, train_acc, val_acc in accuracy_curves:
    plt.plot(epochs, train_acc, linestyle='-', linewidth=2, label=train_label, color=curve_color)
    plt.plot(epochs, val_acc, linestyle='--', linewidth=2, label=val_label, color=curve_color)

plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.legend(loc='upper left')
plt.show()

In [None]:
# Test-set metrics of the three models, aligned with `model_names`.
model_names = ['GloVe', 'TF-IDF', 'PPMI']
loss_values = [glove_loss, tf_idf_loss, ppmi_loss]
accuracy_values = [glove_accuracy, tf_idf_accuracy, ppmi_accuracy]

plt.figure(figsize=(12, 6))

# Two side-by-side bar charts: loss on the left, accuracy on the right.
panels = (
    (loss_values, 'skyblue', 'Loss on Test Data', 'Loss'),
    (accuracy_values, 'lightgreen', 'Accuracy on Test Data', 'Accuracy'),
)
for position, (values, bar_color, panel_title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.bar(model_names, values, color=bar_color)
    plt.title(panel_title)
    plt.xlabel('Models')
    plt.ylabel(y_label)

plt.tight_layout()
plt.show()


In [None]:
# Display names of the classes; position i is the label shown for class index i.
class_labels = ['A', 'B1', 'B2', 'C1', 'C2']

# Collapse per-class score vectors (targets and predictions) to class indices.
y_true_indices = np.argmax(y_test, axis=1)
y_pred_indices = np.argmax(glove_y_pred, axis=1)

# Create the confusion matrix. Passing `labels` pins it to one row/column per
# class even when a class is absent from both the test targets and the
# predictions; without it the matrix shrinks and no longer lines up with the
# tick labels below.
cm = confusion_matrix(
    y_true_indices, y_pred_indices, labels=list(range(len(class_labels)))
)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix GloVe')
plt.show()

# BERT

In [None]:
# Inputs are taken from CORPUS; targets are CEFR_OVERALL encoded by
# `label_encoder` (defined earlier in the notebook).
X = np.array(CORPUS)
y = label_encoder.fit_transform(CEFR_OVERALL)

# Hold out 10% of the data as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

# ... then hold out 10% of what remains as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
)

In [None]:
# TensorFlow Hub handle of the text preprocessing model for uncased English BERT.
preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
# TensorFlow Hub handle of the matching BERT encoder (L-12, H-768, A-12).
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

In [None]:
# Load both TensorFlow Hub models as Keras layers.
# The name `bert_perprocess_model` (sic) is kept as-is because another cell
# refers to it.
bert_perprocess_model = hub.KerasLayer(
    preprocess_url,
    name='preprocessing',
)
bert_model = hub.KerasLayer(
    encoder_url,
    name='BERT_encoder',
)

In [None]:
# Scalar string input: one raw text per example.
text_input = tf.keras.layers.Input(
    shape=(),
    dtype=tf.string,
    name='text',
)

# Run the raw text through the preprocessing layer, then the encoder, and keep
# only the encoder's 'pooled_output' entry.
preprocessed_text = bert_perprocess_model(text_input)
encoder_outputs = bert_model(preprocessed_text)
bert_output = encoder_outputs['pooled_output']

In [None]:
# Classification head on top of the BERT output: dropout (rate 0.2) followed
# by a 5-way softmax layer. Despite its name, `input_layer` holds the dropout
# output.
input_layer = tf.keras.layers.Dropout(rate=0.2, name='dropout')(bert_output)
output_layer = tf.keras.layers.Dense(
    units=5,
    activation='softmax',
    name='output',
)(input_layer)

In [None]:
# Assemble the end-to-end model: raw text in, class probabilities out.
model = tf.keras.Model(
    inputs=[text_input],
    outputs=[output_layer],
)
model.summary()

# AdamW optimizer with an explicit learning rate and epsilon.
opt = tf.keras.optimizers.AdamW(learning_rate=0.008, epsilon=1e-07)

# The sparse loss takes the targets as integer class indices.
model.compile(
    optimizer=opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 15 epochs, tracking metrics on the validation split each epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=32,
    validation_data=(X_val, y_val),
)

# BERT EVALUATION

In [None]:
# Loss and accuracy of the BERT model on the held-out test set.
bert_loss, bert_accuracy = model.evaluate(x=X_test, y=y_test)

# Per-class scores predicted for every test example.
bert_y_pred = model.predict(x=X_test)

In [None]:
fig, ax = plt.subplots(figsize=(12, 5))

# One bar per test metric: (tick label, bar height, colour, legend entry).
bert_bars = (
    ('BERT - Loss', bert_loss, 'blue', 'Loss'),
    ('BERT - Accuracy', bert_accuracy, 'green', 'Accuracy'),
)
for tick_label, bar_height, bar_color, legend_entry in bert_bars:
    ax.bar(tick_label, bar_height, color=bar_color, alpha=0.6, label=legend_entry)

ax.set_ylabel('Loss / Accuracy Value')
ax.set_title('Loss and Accuracy for BERT')
ax.legend()

# Render the figure.
plt.tight_layout()
plt.show()

In [None]:
# Display names of the classes; position i is the label shown for class index i.
class_labels = ['A', 'B1', 'B2', 'C1', 'C2']

# Test targets are used directly as class indices; predictions are per-class
# score vectors and are collapsed with argmax.
y_true_indices = y_test
y_pred_indices = np.argmax(bert_y_pred, axis=1)

# Passing `labels` pins the matrix to one row/column per class even when a
# class is absent from both the test targets and the predictions; without it
# the matrix shrinks and no longer lines up with the tick labels below.
cm = confusion_matrix(
    y_true_indices, y_pred_indices, labels=list(range(len(class_labels)))
)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Authors: VALENTINA BITETTO 508285, ANA SUAREZ 503162, ANDREA LOLLI 503035