# Feedback Prize - English Language Learning

In [2]:
# Standard python libraries
import os
import time

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import torch
import matplotlib.pyplot as plt

## Exploratory Data Analysis (EDA)

In [5]:
import pandas as pd

In [6]:
# Load the labelled training essays and preview the first five rows.
train_df = pd.read_csv(filepath_or_buffer="data/train.csv")
train_df.head(n=5)

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [7]:
# Summarise the training frame: column names, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3911 entries, 0 to 3910
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   text_id      3911 non-null   object 
 1   full_text    3911 non-null   object 
 2   cohesion     3911 non-null   float64
 3   syntax       3911 non-null   float64
 4   vocabulary   3911 non-null   float64
 5   phraseology  3911 non-null   float64
 6   grammar      3911 non-null   float64
 7   conventions  3911 non-null   float64
dtypes: float64(6), object(2)
memory usage: 244.6+ KB


In [8]:
# Load the sample submission file; its columns define the expected output layout.
sample_submission = pd.read_csv(filepath_or_buffer="data/sample_submission.csv")
sample_submission.head(n=5)

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


In [9]:
# Every column after the first one (`text_id`) is a score to be predicted.
targets = [column for column in sample_submission.columns[1:]]
print(targets)

['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']


In [10]:
# Load the unlabelled competition test essays and preview them.
test_df = pd.read_csv(filepath_or_buffer="data/test.csv")
test_df.head(n=5)

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [11]:
# Summarise the test frame: column names, non-null counts and dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text_id    3 non-null      object
 1   full_text  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [12]:
# (rows, columns) of the train, test and sample-submission frames, in that order.
tuple(frame.shape for frame in (train_df, test_df, sample_submission))

((3911, 8), (3, 2), (3, 7))

## LightAutoML (LAML)

In [21]:
import warnings
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

# Silence every warning for the rest of the session (no category or module
# filter is given, so this applies globally).
warnings.filterwarnings("ignore")

In [16]:
# Run configuration for the AutoML experiment.
# CPU threads: handed to torch and used as the AutoML `cpu_limit`.
N_THREADS = 4
# Seed used for numpy and for the train / hold-out split.
RANDOM_STATE = 42
# Intended fraction of the training rows held out for the metric check.
TEST_SIZE = 0.2
# Time budget in seconds passed as `timeout` to each AutoML run (6 hours).
TIMEOUT = 6 * 60 * 60

In [17]:
# Fix the random state of every RNG this notebook configures, so reruns are
# comparable. torch is seeded as well as numpy: it is imported and tuned
# here, and leaving its generator unseeded would make torch-based steps
# vary between runs.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
# Cap torch's intra-op CPU threads. Kept last so the cell produces no output
# (`torch.manual_seed` returns a Generator that would otherwise be displayed).
torch.set_num_threads(N_THREADS)

In [18]:
# Split the labelled data into a fitting part and a hold-out part for the
# metric check. The hold-out share comes from TEST_SIZE instead of a separate
# hard-coded `train_size`, so the config constant is the single source of truth.
# Note: `test_data` is this labelled hold-out, not the competition `test_df`.
train_data, test_data = train_test_split(
    train_df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE
)

In [24]:
%%time

# Train one regression AutoML model per target score, report its MAPE on the
# hold-out split, and store its predictions for the competition test essays
# in `submission`.
task = Task('reg')
submission = test_df.copy()

for target in targets:

    # Column roles for this run: the current score is the label and the essay
    # is treated as text. The frames passed below only contain `full_text`
    # and the target column; the `text_id` drop role is kept as configured.
    roles = {
        'target': target, 
        'text': ['full_text'],
        'drop': ['text_id']
    }

    # A fresh model is built for every target, so each run receives its own
    # TIMEOUT budget.
    # NOTE(review): `use_algos` lists only linear_l2 and lgb - confirm whether
    # `nn_params` has any effect with this configuration.
    automl = TabularNLPAutoML(
        task = task, 
        timeout = TIMEOUT,
        cpu_limit = N_THREADS,
        reader_params = {'cv': 5},
        general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb']]},
        text_params = {'lang': 'en'},
        nn_params = {
            'lang': 'en', 
            'bert_name': 'vinai/bertweet-base', 
            'opt_params': { 'lr': 1e-5},
            'max_length': 300, 'bs': 13,
            'n_epoch': 5
        },
    )

    oof_pred = automl.fit_predict(train_data[['full_text', target]], roles = roles)

    # Evaluate on the labelled hold-out split.
    test_pred = automl.predict(test_data[["full_text"]])
    y_predicted = test_pred.data[:]
    y_test = test_data[target]

    mape = mean_absolute_percentage_error(y_pred=y_predicted, y_true=y_test)
    print(f"PARAM: {target}, MAPE: {mape}")

    # `automl` is overwritten on the next iteration, so the competition test
    # predictions for this target must be stored now; otherwise only the last
    # target's model would be left to fill `submission`.
    # The first column of the prediction array is taken to get a 1-D vector.
    submission[target] = automl.predict(test_df[["full_text"]]).data[:, 0]

100%|██████████| 3128/3128 [00:33<00:00, 94.20it/s] 


PARAM: cohesion, MAPE: 0.16166100214291765


100%|██████████| 3128/3128 [01:01<00:00, 50.58it/s] 


PARAM: syntax, MAPE: 0.16284551905112524


100%|██████████| 3128/3128 [00:33<00:00, 92.22it/s] 


PARAM: vocabulary, MAPE: 0.1293790193217789
