In [None]:
# Run pip install to get libraries:
%pip install editdistance unidecode

# Import
import numpy as np
import pandas as pd
#from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
import sklearn.metrics as metrics
#from sklearn.model_selection import train_test_split
import lightgbm as lgb

import editdistance
from unidecode import unidecode

In [None]:
# File paths of the processed parquet feature tables, one per split
# (named train / dev / test). They are relative, so they resolve against the
# notebook's working directory.
# NOTE(review): PARQUET_TEST is not read by any of the cells shown here.
PARQUET_TRAIN = "data/processed/train_slam_with_features.parquet"
PARQUET_DEV   = "data/processed/dev_slam_with_features.parquet"
PARQUET_TEST  = "data/processed/test_slam_with_features.parquet"

In [None]:
# Read the train and dev feature tables into DataFrames
train = pd.read_parquet(path=PARQUET_TRAIN)
dev = pd.read_parquet(path=PARQUET_DEV)

In [None]:
# Load the word-comparison (translation) table
trans = pd.read_csv("data/processed/language_translation_table.csv")


def _lemma_to_word(table, column):
    """Map each `uni_lemma` in `table` to its word in `column`.

    Rows whose `column` cell is missing are skipped, so a lemma without a
    translation is simply absent from the returned dict (and `Series.map`
    yields NaN for it). Missing cells must be dropped *before* the cast:
    `astype(str)` turns NaN into the literal string "nan", which would then be
    handled as a real word instead of as a missing translation.

    If a lemma appears more than once, the last row with a non-missing word wins.
    """
    present = table[column].notna()
    return dict(zip(table.loc[present, "uni_lemma"],
                    table.loc[present, column].astype(str)))


# Lookup maps from the translation table: uni_lemma -> word in each language
en_map = _lemma_to_word(trans, "English")
fr_map = _lemma_to_word(trans, "French")
es_map = _lemma_to_word(trans, "Spanish")

def norm(s):
    """Normalise a word or short phrase so that two spellings can be compared.

    Missing input (anything `pd.isna` reports as missing) yields `np.nan`.
    Otherwise the value is converted to `str`, passed through `unidecode`,
    lower-cased and stripped of surrounding whitespace. One of two leading
    patterns is then removed:

    * exactly two tokens, the first being one of
      to / i / we / you / they / he / she / the / i'll  -> the second token;
    * three or more tokens, the second being "will"     -> the tokens from the
      third onward, joined by single spaces.

    Any other input is returned as the cleaned string.
    """
    if pd.isna(s):
        return np.nan

    cleaned = unidecode(str(s)).lower().strip()
    tokens = cleaned.split()
    count = len(tokens)

    if count == 2:
        # "<particle/pronoun/article> <word>" -> keep only the word
        if tokens[0] in {"to","i","we","you","they","he","she","the","i'll"}:
            return tokens[1]
        return cleaned

    if count > 2 and tokens[1] == "will":
        # "<subject> will <rest...>" -> keep only the rest
        return " ".join(tokens[2:])

    return cleaned
 
def add_edit_distance_features(df):
    """Add English-vs-L2 edit-distance columns to `df` and return it.

    For every row, `uni_lemma` is looked up in the module-level `en_map` and,
    depending on `l2`, in `fr_map` ("fr") or `es_map` ("es"). Both words are
    cleaned with `norm` and compared with `editdistance.eval`.

    Columns written (`df` is modified in place and is also the return value):
        edit_l2       -- edit distance between the two normalised words
        edit_l2_frac  -- that distance divided by the length of the longer
                         word (a length of 0 is treated as 1)

    Rows where either word is missing (lemma absent from the map, or `l2`
    neither "fr" nor "es") get NaN in both columns.

    All work is done by row position rather than by index label, so the
    result is correct for any index on `df` (non-default, filtered, or with
    duplicates) and for frames that are empty or have no usable row.
    """
    key = df["uni_lemma"]

    # English word, and the target-language word chosen by each row's l2
    eng = key.map(en_map)
    tgt = np.where(df["l2"].eq("fr"), key.map(fr_map),
        np.where(df["l2"].eq("es"), key.map(es_map), np.nan))

    n_rows = len(df)
    dist = np.full(n_rows, np.nan, dtype=float)
    frac = np.full(n_rows, np.nan, dtype=float)

    # `tgt` is a plain array, so pair it with `eng` positionally; going through
    # index-aligned Series operations would mis-pair rows whenever df's index
    # is not the default 0..n-1 range.
    for pos, (raw_eng, raw_tgt) in enumerate(zip(eng.to_numpy(), tgt)):
        eng_word, tgt_word = norm(raw_eng), norm(raw_tgt)
        if pd.isna(eng_word) or pd.isna(tgt_word):
            continue  # leave NaN where either side has no word
        distance = editdistance.eval(eng_word, tgt_word)
        dist[pos] = distance
        # the trailing 1 guards against 0/0 when both words are empty
        frac[pos] = distance / max(len(eng_word), len(tgt_word), 1)

    df["edit_l2"] = dist
    df["edit_l2_frac"] = frac

    return df

In [None]:
# Attach the edit-distance columns to both splits
train, dev = add_edit_distance_features(train), add_edit_distance_features(dev)

In [None]:
# Preview the first rows of train
train.head()

In [None]:
# List the column names available in train
train.columns.values

In [None]:
# Show the dtype of every column in train (plain-text output)
print(train.dtypes)

In [None]:
# Label column to predict and the feature columns fed to the model
target = 'token_wrong'
predictors = ['median_aoa', 'edit_l2_frac', 'user', 'days', 'growth_rate']

# Columns to be passed to LightGBM as categorical features
categorical_features = ['user']

# Cast 'user' to pandas' category dtype in both splits
train['user'], dev['user'] = (
    train['user'].astype('category'),
    dev['user'].astype('category'),
)

In [None]:
# Create the LightGBM datasets.
# The validation set names the training set as its `reference`, which is how
# LightGBM expects validation data to be declared so that it is constructed
# consistently with the training data. Setting it here keeps lgb_val correct
# even if it is constructed or used outside of lgb.train.
lgb_train = lgb.Dataset(
    train[predictors],
    label=train[target],
    categorical_feature=categorical_features,
)
lgb_val = lgb.Dataset(
    dev[predictors],
    label=dev[target],
    reference=lgb_train,
    categorical_feature=categorical_features,
)

In [None]:
# Hyper-parameters handed to lgb.train
params = dict(
    # task and evaluation metric
    objective='binary',
    metric='auc',
    # boosting setup
    boosting_type='gbdt',
    learning_rate=0.05,
    # tree shape / regularisation
    num_leaves=512,
    min_data_in_leaf=100,
    cat_smooth=200,
    feature_fraction=0.7,
)

In [None]:
# Fit the booster for at most 500 rounds, scoring both the training and the
# validation set; metrics are logged every 50 rounds and the early-stopping
# callback (patience of 50 rounds) records the best iteration.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    callbacks=[
        lgb.log_evaluation(period=50),
        lgb.early_stopping(stopping_rounds=50),
    ],
)

# Report how far training ran and which iteration scored best
print("Trees trained:", model.num_trees())
print("Current iteration:", model.current_iteration())
print("Best iteration:", model.best_iteration)
print("Eval results:", model.best_score)

# Predict on the dev split with the best iteration and evaluate
y_true = dev[target]
y_pred = model.predict(dev[predictors], num_iteration=model.best_iteration)
# Hard labels: predicted probability above 0.5 counts as the positive class
y_pred_label = (y_pred > 0.5).astype(int)

# Probability-based metrics
auc = metrics.roc_auc_score(y_true, y_pred)
logloss = metrics.log_loss(y_true, y_pred)

# Label-based metrics at the 0.5 threshold
accuracy = metrics.accuracy_score(y_true, y_pred_label)
precision = metrics.precision_score(y_true, y_pred_label)
recall = metrics.recall_score(y_true, y_pred_label)
f1 = metrics.f1_score(y_true, y_pred_label)
conf_matrix = metrics.confusion_matrix(y_true, y_pred_label)

print("===== Evaluation Metrics =====")
print(f"AUC: {auc:.4f}")
print(f"Log Loss:   {logloss:.4f}")
print(f"Accuracy:   {accuracy:.4f}")
print(f"Precision:  {precision:.4f}")
print(f"Recall:     {recall:.4f}")
print(f"F1 Score:   {f1:.4f}")
# The confusion matrix was computed but never shown; report it with the rest
print("Confusion matrix (rows = actual, columns = predicted):")
print(conf_matrix)