In [1]:
# Run pip install to get libraries:
%pip install editdistance unidecode

# Import
import numpy as np
import pandas as pd
#from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
import sklearn.metrics as metrics
#from sklearn.model_selection import train_test_split
import lightgbm as lgb

import editdistance
from unidecode import unidecode

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Locations of the train / dev / test parquet files, relative to the working directory
PARQUET_TRAIN, PARQUET_DEV, PARQUET_TEST = (
    "data/processed/train_slam_with_features.parquet",
    "data/processed/dev_slam_with_features.parquet",
    "data/processed/test_slam_with_features.parquet",
)

In [3]:
# Read the training and development splits into DataFrames (train first, then dev)
train, dev = (pd.read_parquet(path) for path in (PARQUET_TRAIN, PARQUET_DEV))

In [4]:
# Load Word Comparisons File
trans = pd.read_csv("data/processed/language_translation_table.csv")


def _lemma_lookup(table, column):
    """Build a ``uni_lemma -> word`` dict from one language column of ``table``.

    Rows where either the lemma or the word is missing are dropped *before*
    the string cast. Casting first would turn a missing cell into the literal
    text "nan", which would then be treated as a real word downstream.
    When a lemma occurs more than once, the last remaining row wins.
    """
    known = table[["uni_lemma", column]].dropna()
    return dict(zip(known["uni_lemma"], known[column].astype(str)))


# maps from translation table (lemmas without a translation are simply absent)
en_map = _lemma_lookup(trans, "English")
fr_map = _lemma_lookup(trans, "French")
es_map = _lemma_lookup(trans, "Spanish")

def norm(s):
    """Reduce a word or short phrase to a bare, ASCII, lower-case form.

    Missing values come back as ``np.nan``. Otherwise the text is
    transliterated with ``unidecode``, lower-cased and stripped, then:

    * a two-word phrase that opens with one of a fixed set of function words
      ("to", "the", "i", ...) keeps only its second word;
    * a phrase of three or more words whose second word is "will" keeps
      everything after "will";
    * anything else is returned whole.
    """
    if pd.isna(s):
        return np.nan

    leading_words = {"to", "i", "we", "you", "they", "he", "she", "the", "i'll"}
    text = unidecode(str(s)).lower().strip()
    words = text.split()

    if len(words) == 2:
        # Two-word phrase: drop a leading function word, otherwise keep as is.
        return words[1] if words[0] in leading_words else text
    if len(words) > 2 and words[1] == "will":
        return " ".join(words[2:])
    return text
 
def add_edit_distance_features(df):
    """Add English-vs-L2 edit-distance columns to ``df`` and return it.

    For every row the ``uni_lemma`` is looked up in ``en_map`` and in the map
    matching the row's ``l2`` ("fr" -> ``fr_map``, "es" -> ``es_map``). Both
    words are normalised with ``norm`` and compared with ``editdistance.eval``.

    Columns written (``df`` is modified in place):
        edit_l2       -- edit distance between the two normalised words
        edit_l2_frac  -- that distance divided by the longer word's length
                         (a length of 0 is treated as 1)

    Rows with no lemma, no translation on either side, or an ``l2`` other
    than "fr"/"es" get NaN in both columns.
    """
    lemma = df["uni_lemma"]

    # english and target by l2
    eng = lemma.map(en_map)
    # np.where returns a bare ndarray; put it back on df's index so that every
    # Series below lines up row-for-row even when df has a non-default index.
    tgt = pd.Series(
        np.where(df["l2"].eq("fr"), lemma.map(fr_map),
                 np.where(df["l2"].eq("es"), lemma.map(es_map), np.nan)),
        index=df.index,
    )

    eng_n = eng.map(norm)
    tgt_n = tgt.map(norm)

    # Positional mask of rows where both words are available.
    both = (eng_n.notna() & tgt_n.notna()).to_numpy()
    pairs = list(zip(eng_n[both], tgt_n[both]))

    dist = np.full(len(df), np.nan, dtype=float)
    longest = np.full(len(df), np.nan, dtype=float)
    dist[both] = [editdistance.eval(a, b) for a, b in pairs]
    # Lengths are taken from the paired strings directly (not via .str), which
    # also works when a column is entirely NaN; the floor of 1 avoids 0/0.
    longest[both] = [max(len(a), len(b), 1) for a, b in pairs]

    df["edit_l2"] = dist
    df["edit_l2_frac"] = dist / longest

    return df

In [5]:
# Attach the edit-distance columns to both splits (train first, then dev)
train, dev = (add_edit_distance_features(split) for split in (train, dev))

In [6]:
# Preview the first rows, including the newly added edit_l2 / edit_l2_frac columns
train.head()

Unnamed: 0,token_id,token,token_pos,token_morph,token_dep_label,token_edges,token_wrong,block_id,prompt,user,...,format,time,l2,l1,uni_lemma,category,growth_rate,median_aoa,edit_l2,edit_l2_frac
0,8XTyQUAl0101,Le,DET,Definite=Def|Gender=Masc|Number=Sing|fPOS=DET++,det,2,0,1,The boy,YjS/mQOx,...,reverse_translate,14.0,fr,en,,,,,,
1,8XTyQUAl0102,garçon,NOUN,Gender=Masc|Number=Sing|fPOS=NOUN++,ROOT,0,0,1,The boy,YjS/mQOx,...,reverse_translate,14.0,fr,en,boy,people,0.226911,24.350613,5.0,0.833333
2,8XTyQUAl0201,Je,PRON,Number=Sing|Person=1|PronType=Prs|fPOS=PRON++,nsubj,4,0,2,I am a woman.,YjS/mQOx,...,reverse_translate,14.0,fr,en,,,,,,
3,8XTyQUAl0202,suis,VERB,Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbF...,cop,4,0,2,I am a woman.,YjS/mQOx,...,reverse_translate,14.0,fr,en,am,helping_verbs,0.294798,29.568209,4.0,1.0
4,8XTyQUAl0203,une,DET,Definite=Ind|Gender=Fem|Number=Sing|PronType=D...,det,4,0,2,I am a woman.,YjS/mQOx,...,reverse_translate,14.0,fr,en,a,quantifiers,0.239488,27.849093,5.0,0.833333


In [7]:
# List every column name available in the training frame
train.columns.values

array(['token_id', 'token', 'token_pos', 'token_morph', 'token_dep_label',
       'token_edges', 'token_wrong', 'block_id', 'prompt', 'user',
       'countries', 'days', 'client', 'session', 'format', 'time', 'l2',
       'l1', 'uni_lemma', 'category', 'growth_rate', 'median_aoa',
       'edit_l2', 'edit_l2_frac'], dtype=object)

In [8]:
# Show the dtype of each column (plain-text output)
print(train.dtypes)

token_id            object
token               object
token_pos           object
token_morph         object
token_dep_label     object
token_edges         object
token_wrong          int64
block_id             int64
prompt              object
user                object
countries           object
days               float64
client              object
session             object
format              object
time               float64
l2                  object
l1                  object
uni_lemma           object
category            object
growth_rate        float64
median_aoa         float64
edit_l2            float64
edit_l2_frac       float64
dtype: object


In [9]:
# Select predictors and dependent variable
predictors = ['median_aoa', 'edit_l2_frac', 'user', 'days', 'growth_rate']
target = 'token_wrong'


# Encode categoricals
# The categories are derived from the training split and the same dtype is
# reused for dev, so a given user maps to the same category code in both
# frames. Users that appear only in dev become NaN (missing) rather than
# silently taking a code that means a different user in train.
categorical_features = ['user']

for col in categorical_features:
    train[col] = train[col].astype('category')
    dev[col] = dev[col].astype(train[col].dtype)

In [10]:
# Create LightGBM datasets
lgb_train = lgb.Dataset(
    train[predictors],
    label=train[target],
    categorical_feature=categorical_features,
)
# The validation set is tied to the training set via `reference`, as LightGBM
# asks for validation data, instead of being built as an independent Dataset.
lgb_val = lgb.Dataset(
    dev[predictors],
    label=dev[target],
    categorical_feature=categorical_features,
    reference=lgb_train,
)

In [11]:
# LightGBM training configuration: binary objective, evaluated with AUC
params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    learning_rate=0.05,
    num_leaves=512,
    min_data_in_leaf=100,
    cat_smooth=200,
    feature_fraction=0.7,
)

In [12]:
# Train model
# Both sets are passed as valid_sets so the log shows train and val AUC side
# by side; training halts once the metric has not improved for 50 rounds.
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    num_boost_round=500,
    callbacks=[
        lgb.log_evaluation(period=50),           # print every 50 iters
        lgb.early_stopping(stopping_rounds=50),  # set best_iteration
    ]
)

# Check parameters settings
print("Trees trained:", model.num_trees())
print("Current iteration:", model.current_iteration())
print("Best iteration:", model.best_iteration)
print("Eval results:", model.best_score)

# Predict and evaluate on the dev split, using the best iteration found above
y_true = dev[target]
y_pred = model.predict(dev[predictors], num_iteration=model.best_iteration)
# Hard labels use a fixed 0.5 cut-off on the predicted probability
y_pred_label = (y_pred > 0.5).astype(int)

# Probability-based metrics
auc = metrics.roc_auc_score(y_true, y_pred)
logloss = metrics.log_loss(y_true, y_pred)
# Label-based metrics (depend on the 0.5 cut-off)
accuracy = metrics.accuracy_score(y_true, y_pred_label)
precision = metrics.precision_score(y_true, y_pred_label)
recall = metrics.recall_score(y_true, y_pred_label)
f1 = metrics.f1_score(y_true, y_pred_label)
conf_matrix = metrics.confusion_matrix(y_true, y_pred_label)

print("===== Evaluation Metrics =====")
print(f"AUC: {auc:.4f}")
print(f"Log Loss:   {logloss:.4f}")
print(f"Accuracy:   {accuracy:.4f}")
print(f"Precision:  {precision:.4f}")
print(f"Recall:     {recall:.4f}")
print(f"F1 Score:   {f1:.4f}")
# The confusion matrix was computed but never shown; report it with the rest
print("Confusion matrix (rows = true, cols = predicted):")
print(conf_matrix)

[LightGBM] [Info] Number of positive: 758625, number of negative: 4764534
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.167258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6887
[LightGBM] [Info] Number of data points in the train set: 5523159, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.137353 -> initscore=-1.837447
[LightGBM] [Info] Start training from score -1.837447
Training until validation scores don't improve for 50 rounds
[50]	train's auc: 0.690381	val's auc: 0.651017
[100]	train's auc: 0.700399	val's auc: 0.653975
[150]	train's auc: 0.706569	val's auc: 0.65466
[200]	train's auc: 0.710284	val's auc: 0.654699
[250]	train's auc: 0.712855	val's auc: 0.654634
Early stopping, best iteration is:
[224]	train's auc: 0.711558	val's auc: 0.654739
Trees trained: 224
Current iteration: 224
Best iteration