Load Data
https://www.kaggle.com/datasets/amontgomerie/cefr-levelled-english-texts

In [1]:
import pandas as pd
cefr_texts = pd.read_csv("./cefr_leveled_texts.csv")
cefr_texts.head(5)

Unnamed: 0,text,label
0,Hi!\nI've been meaning to write for ages and f...,B2
1,﻿It was not so much how hard people found the ...,B2
2,Keith recently came back from a trip to Chicag...,B2
3,"The Griffith Observatory is a planetarium, and...",B2
4,-LRB- The Hollywood Reporter -RRB- It's offici...,B2


Data preprocessing

In [2]:
import re

def preprocessing(text: str) -> str:
    text = text.lower()
    text = re.sub(r"-[a-z]+-","", text)
    text = text.replace("\n", " ")
    text = re.sub(r"\s+"," ", text)
    return text

cefr_texts.text = cefr_texts.text.apply(lambda x: preprocessing(x))
cefr_texts.head(5)

Unnamed: 0,text,label
0,hi! i've been meaning to write for ages and fi...,B2
1,﻿it was not so much how hard people found the ...,B2
2,keith recently came back from a trip to chicag...,B2
3,"the griffith observatory is a planetarium, and...",B2
4,the hollywood reporter it's official: amc's t...,B2


In [3]:
cefr_texts.label.value_counts()/len(cefr_texts)

label
A1    0.192771
B2    0.191432
A2    0.182062
C1    0.161312
B1    0.137216
C2    0.135207
Name: count, dtype: float64

In [4]:
import os
import sys
base_dir = os.path.abspath(os.path.join(os.getcwd(), '../../src'))
os.chdir(base_dir)
sys.path.append(base_dir)

In [5]:
from res.wordsDegree.WordsDegree import get_degrees
from res.materials_VOC import CEFR_DICTIONARY_DF
from collections import Counter
from preprocessing.voc import voc_1_preprocessed_text

Start making general dict


[nltk_data] Downloading package wordnet to /home/qw1n/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/qw1n/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


VOC 1 Func definition

In [6]:
import tqdm

def voc_1(text: str) -> tuple[Counter, float, int]:
    preprocessed_text, words_count = voc_1_preprocessed_text(text=text, stemming=False)
    cerf_dict = CEFR_DICTIONARY_DF
    cerf_levels, undefined = get_degrees(preprocessed_text)
    for token in undefined:
        if token in cerf_dict.headword.values:
            cerf_levels.append(cerf_dict[cerf_dict.headword == token].CEFR.values[0])
        else:
            cerf_levels.append("A1")
    cerf_counter = Counter(cerf_levels)
    return cerf_counter, len(preprocessed_text) / words_count, len(preprocessed_text)

features = pd.DataFrame(columns=["uniq", "words_a1", "words_a2", "words_b1", "words_b2", "words_c1", "words_c2", "text_cefr"])

for i in tqdm.trange(len(cefr_texts)):
    counter, uniq, uniq_count = voc_1(cefr_texts.text.values[i])
    row = {
        "uniq": uniq, 
        "words_a1": counter["A1"]/uniq_count, 
        "words_a2": counter["A2"]/uniq_count, 
        "words_b1": counter["B1"]/uniq_count, 
        "words_b2": counter["B2"]/uniq_count, 
        "words_c1": counter["C1"]/uniq_count, 
        "words_c2": counter["C2"]/uniq_count, 
        "text_cefr": cefr_texts.label.values[i]
    }
    features.loc[len(features)] = row
features.head(5)

100%|██████████| 1494/1494 [12:14<00:00,  2.04it/s]


Unnamed: 0,uniq,words_a1,words_a2,words_b1,words_b2,words_c1,words_c2,text_cefr
0,0.353982,0.60625,0.2,0.10625,0.075,0.00625,0.00625,B2
1,0.352584,0.37931,0.228448,0.150862,0.185345,0.051724,0.00431,B2
2,0.395745,0.408602,0.193548,0.225806,0.16129,0.010753,0.0,B2
3,0.393103,0.429825,0.166667,0.175439,0.157895,0.061404,0.008772,B2
4,0.413374,0.573529,0.139706,0.102941,0.139706,0.036765,0.007353,B2


Features columns:
uniq - unique words proportion;
words_* - proportion of * level words from unique;
text_cefr - CEFR level of text;

CEFR Level as 1..6 values

In [7]:
features_cefr_as_num = features.copy()
cefr_to_num = {
    'A1': 1,
    'A2': 2,
    'B1': 3,
    'B2': 4,
    'C1': 5,
    'C2': 6
}

features_cefr_as_num['text_cefr'] = features['text_cefr'].map(cefr_to_num)
features_cefr_as_num.corr()

Unnamed: 0,uniq,words_a1,words_a2,words_b1,words_b2,words_c1,words_c2,text_cefr
uniq,1.0,-0.205286,-0.01843,0.082227,0.260154,0.096546,0.112037,0.346841
words_a1,-0.205286,1.0,-0.369729,-0.577213,-0.762104,-0.511128,-0.251865,-0.699266
words_a2,-0.01843,-0.369729,1.0,-0.045488,-0.047032,-0.143426,-0.06819,-0.005509
words_b1,0.082227,-0.577213,-0.045488,1.0,0.232315,0.156899,0.108819,0.342485
words_b2,0.260154,-0.762104,-0.047032,0.232315,1.0,0.378525,0.253723,0.693839
words_c1,0.096546,-0.511128,-0.143426,0.156899,0.378525,1.0,0.22189,0.531615
words_c2,0.112037,-0.251865,-0.06819,0.108819,0.253723,0.22189,1.0,0.334225
text_cefr,0.346841,-0.699266,-0.005509,0.342485,0.693839,0.531615,0.334225,1.0


CEFR Levels as onehot values

In [8]:
features_cefr_as_num = features.copy()
cefr_to_num = {
    'A1': 1,
    'A2': 0,
    'B1': 0,
    'B2': 0,
    'C1': 0,
    'C2': 0
}

features_cefr_as_num['text_cefr'] = features['text_cefr'].map(cefr_to_num)
features_cefr_as_num.corr()

Unnamed: 0,uniq,words_a1,words_a2,words_b1,words_b2,words_c1,words_c2,text_cefr
uniq,1.0,-0.205286,-0.01843,0.082227,0.260154,0.096546,0.112037,-0.223974
words_a1,-0.205286,1.0,-0.369729,-0.577213,-0.762104,-0.511128,-0.251865,0.48407
words_a2,-0.01843,-0.369729,1.0,-0.045488,-0.047032,-0.143426,-0.06819,-0.038494
words_b1,0.082227,-0.577213,-0.045488,1.0,0.232315,0.156899,0.108819,-0.301166
words_b2,0.260154,-0.762104,-0.047032,0.232315,1.0,0.378525,0.253723,-0.461041
words_c1,0.096546,-0.511128,-0.143426,0.156899,0.378525,1.0,0.22189,-0.252806
words_c2,0.112037,-0.251865,-0.06819,0.108819,0.253723,0.22189,1.0,-0.168215
text_cefr,-0.223974,0.48407,-0.038494,-0.301166,-0.461041,-0.252806,-0.168215,1.0


In [9]:
features_cefr_as_num = features.copy()
cefr_to_num = {
    'A1': 0,
    'A2': 1,
    'B1': 0,
    'B2': 0,
    'C1': 0,
    'C2': 0
}

features_cefr_as_num['text_cefr'] = features['text_cefr'].map(cefr_to_num)
features_cefr_as_num.corr()

Unnamed: 0,uniq,words_a1,words_a2,words_b1,words_b2,words_c1,words_c2,text_cefr
uniq,1.0,-0.205286,-0.01843,0.082227,0.260154,0.096546,0.112037,-0.23199
words_a1,-0.205286,1.0,-0.369729,-0.577213,-0.762104,-0.511128,-0.251865,0.268869
words_a2,-0.01843,-0.369729,1.0,-0.045488,-0.047032,-0.143426,-0.06819,0.034151
words_b1,0.082227,-0.577213,-0.045488,1.0,0.232315,0.156899,0.108819,-0.073834
words_b2,0.260154,-0.762104,-0.047032,0.232315,1.0,0.378525,0.253723,-0.30706
words_c1,0.096546,-0.511128,-0.143426,0.156899,0.378525,1.0,0.22189,-0.25664
words_c2,0.112037,-0.251865,-0.06819,0.108819,0.253723,0.22189,1.0,-0.154955
text_cefr,-0.23199,0.268869,0.034151,-0.073834,-0.30706,-0.25664,-0.154955,1.0


In [10]:
features_cefr_as_num = features.copy()
cefr_to_num = {
    'A1': 0,
    'A2': 0,
    'B1': 1,
    'B2': 0,
    'C1': 0,
    'C2': 0
}

features_cefr_as_num['text_cefr'] = features['text_cefr'].map(cefr_to_num)
features_cefr_as_num.corr()

Unnamed: 0,uniq,words_a1,words_a2,words_b1,words_b2,words_c1,words_c2,text_cefr
uniq,1.0,-0.205286,-0.01843,0.082227,0.260154,0.096546,0.112037,0.008868
words_a1,-0.205286,1.0,-0.369729,-0.577213,-0.762104,-0.511128,-0.251865,0.035059
words_a2,-0.01843,-0.369729,1.0,-0.045488,-0.047032,-0.143426,-0.06819,0.011315
words_b1,0.082227,-0.577213,-0.045488,1.0,0.232315,0.156899,0.108819,0.015137
words_b2,0.260154,-0.762104,-0.047032,0.232315,1.0,0.378525,0.253723,-0.014605
words_c1,0.096546,-0.511128,-0.143426,0.156899,0.378525,1.0,0.22189,-0.126277
words_c2,0.112037,-0.251865,-0.06819,0.108819,0.253723,0.22189,1.0,-0.058849
text_cefr,0.008868,0.035059,0.011315,0.015137,-0.014605,-0.126277,-0.058849,1.0


In [11]:
features_cefr_as_num = features.copy()
cefr_to_num = {
    'A1': 0,
    'A2': 0,
    'B1': 0,
    'B2': 1,
    'C1': 0,
    'C2': 0
}

features_cefr_as_num['text_cefr'] = features['text_cefr'].map(cefr_to_num)
features_cefr_as_num.corr()

Unnamed: 0,uniq,words_a1,words_a2,words_b1,words_b2,words_c1,words_c2,text_cefr
uniq,1.0,-0.205286,-0.01843,0.082227,0.260154,0.096546,0.112037,0.183207
words_a1,-0.205286,1.0,-0.369729,-0.577213,-0.762104,-0.511128,-0.251865,-0.137783
words_a2,-0.01843,-0.369729,1.0,-0.045488,-0.047032,-0.143426,-0.06819,0.004358
words_b1,0.082227,-0.577213,-0.045488,1.0,0.232315,0.156899,0.108819,0.077763
words_b2,0.260154,-0.762104,-0.047032,0.232315,1.0,0.378525,0.253723,0.149536
words_c1,0.096546,-0.511128,-0.143426,0.156899,0.378525,1.0,0.22189,0.061007
words_c2,0.112037,-0.251865,-0.06819,0.108819,0.253723,0.22189,1.0,0.016247
text_cefr,0.183207,-0.137783,0.004358,0.077763,0.149536,0.061007,0.016247,1.0


In [12]:
features_cefr_as_num = features.copy()
cefr_to_num = {
    'A1': 0,
    'A2': 0,
    'B1': 0,
    'B2': 0,
    'C1': 1,
    'C2': 0
}

features_cefr_as_num['text_cefr'] = features['text_cefr'].map(cefr_to_num)
features_cefr_as_num.corr()

Unnamed: 0,uniq,words_a1,words_a2,words_b1,words_b2,words_c1,words_c2,text_cefr
uniq,1.0,-0.205286,-0.01843,0.082227,0.260154,0.096546,0.112037,0.156753
words_a1,-0.205286,1.0,-0.369729,-0.577213,-0.762104,-0.511128,-0.251865,-0.299883
words_a2,-0.01843,-0.369729,1.0,-0.045488,-0.047032,-0.143426,-0.06819,0.047451
words_b1,0.082227,-0.577213,-0.045488,1.0,0.232315,0.156899,0.108819,0.135051
words_b2,0.260154,-0.762104,-0.047032,0.232315,1.0,0.378525,0.253723,0.279176
words_c1,0.096546,-0.511128,-0.143426,0.156899,0.378525,1.0,0.22189,0.20256
words_c2,0.112037,-0.251865,-0.06819,0.108819,0.253723,0.22189,1.0,0.139911
text_cefr,0.156753,-0.299883,0.047451,0.135051,0.279176,0.20256,0.139911,1.0


In [13]:
features_cefr_as_num = features.copy()
cefr_to_num = {
    'A1': 0,
    'A2': 0,
    'B1': 0,
    'B2': 0,
    'C1': 0,
    'C2': 1
}

features_cefr_as_num['text_cefr'] = features['text_cefr'].map(cefr_to_num)
features_cefr_as_num.corr()

Unnamed: 0,uniq,words_a1,words_a2,words_b1,words_b2,words_c1,words_c2,text_cefr
uniq,1.0,-0.205286,-0.01843,0.082227,0.260154,0.096546,0.112037,0.131861
words_a1,-0.205286,1.0,-0.369729,-0.577213,-0.762104,-0.511128,-0.251865,-0.416035
words_a2,-0.01843,-0.369729,1.0,-0.045488,-0.047032,-0.143426,-0.06819,-0.061575
words_b1,0.082227,-0.577213,-0.045488,1.0,0.232315,0.156899,0.108819,0.180783
words_b2,0.260154,-0.762104,-0.047032,0.232315,1.0,0.378525,0.253723,0.420738
words_c1,0.096546,-0.511128,-0.143426,0.156899,0.378525,1.0,0.22189,0.420252
words_c2,0.112037,-0.251865,-0.06819,0.108819,0.253723,0.22189,1.0,0.258953
text_cefr,0.131861,-0.416035,-0.061575,0.180783,0.420738,0.420252,0.258953,1.0


In [14]:
features_cefr_as_num

Unnamed: 0,uniq,words_a1,words_a2,words_b1,words_b2,words_c1,words_c2,text_cefr
0,0.353982,0.606250,0.200000,0.106250,0.075000,0.006250,0.006250,0
1,0.352584,0.379310,0.228448,0.150862,0.185345,0.051724,0.004310,0
2,0.395745,0.408602,0.193548,0.225806,0.161290,0.010753,0.000000,0
3,0.393103,0.429825,0.166667,0.175439,0.157895,0.061404,0.008772,0
4,0.413374,0.573529,0.139706,0.102941,0.139706,0.036765,0.007353,0
...,...,...,...,...,...,...,...,...
1489,0.354108,0.376000,0.192000,0.128000,0.240000,0.048000,0.016000,1
1490,0.439338,0.393305,0.205021,0.121339,0.225941,0.050209,0.004184,1
1491,0.369748,0.458333,0.155303,0.162879,0.132576,0.087121,0.003788,1
1492,0.423729,0.410909,0.167273,0.163636,0.185455,0.065455,0.007273,1


Model

In [16]:
import tensorflow as tf

In [55]:
model_features = features_cefr_as_num.drop(columns=["text_cefr"]).to_numpy()
model_targets = features_cefr_as_num.text_cefr.to_numpy()

dataset = tf.data.Dataset.from_tensor_slices((model_features, model_targets))

train_prop = 0.8

dataset = dataset.shuffle(buffer_size=len(model_features))

dataset_train = dataset.take(int(train_prop * len(model_features)))
dataset_test = dataset.skip(int(train_prop * len(model_features)))

In [56]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MSE
from tensorflow.keras.metrics import MAE
from tensorflow.keras.callbacks import EarlyStopping

In [57]:
EPOCHS = 1000
LEARNING_RATE = 0.05
BATCH = 32

dataset_train = dataset_train.batch(BATCH)
dataset_test = dataset_test.batch(BATCH)

compile_params={
    "optimizer": Adam(learning_rate=LEARNING_RATE),
    "loss": MSE,
    "metrics": [MAE],

}

In [58]:
from tensorflow.keras.layers import Dense, InputLayer

In [59]:
linear_regression = Sequential([
    InputLayer(shape=(7,)),
    Dense(units=1, activation="linear")
])

linear_regression.compile(**compile_params)

In [61]:
history = linear_regression.fit(
    dataset_train, 
    validation_data= dataset_test,
    epochs = EPOCHS,
    )

Epoch 1/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 975us/step - loss: 1.9395 - mean_absolute_error: 1.2073 - val_loss: 1.8331 - val_mean_absolute_error: 1.1641
Epoch 2/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.8050 - mean_absolute_error: 1.1567 - val_loss: 1.6291 - val_mean_absolute_error: 1.0930
Epoch 3/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.6513 - mean_absolute_error: 1.0859 - val_loss: 1.4956 - val_mean_absolute_error: 1.0356
Epoch 4/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 946us/step - loss: 1.5236 - mean_absolute_error: 1.0340 - val_loss: 1.3869 - val_mean_absolute_error: 1.0036
Epoch 5/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.3871 - mean_absolute_error: 0.9629 - val_loss: 1.2904 - val_mean_absolute_error: 0.9171
Epoch 6/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m