In [1]:
# --- IMPORTS ---
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [None]:
# --- CONFIG ---
# Input files (relative paths), one per data split.
PARQUET_TRAIN = "../00_data/clean/train_slam_with_features.parquet"
PARQUET_DEV = "../00_data/clean/dev_slam_with_features.parquet"
PARQUET_TEST = "../00_data/clean/test_slam_with_features.parquet"

# Columns holding the row identifier and the binary target.
ID_COL, LABEL_COL = 'token_id', 'token_wrong'

# Run scope.
# LANG: one of "en_es", "fr_en", "es_en", or "all".
LANG = "en_es"
# N_USERS: an integer (e.g. 200) for a quick smoke test, or None to keep all users.
N_USERS = None

In [None]:
# --- NYU PARAMETERS ---
# One LightGBM parameter dict per language-pair key, plus 'all'.
# The dicts differ only in tree size, number of boosting rounds and (for 'all')
# an extra 'max_cat_threshold' entry, so they are assembled by one helper.
def _nyu_params(num_leaves, num_boost_round, **extras):
    """Return one parameter dict; `extras` are inserted just before 'feature_fraction'."""
    cfg = {
        'application': 'binary',
        'metric': 'auc',
        'learning_rate': .05,
        'num_leaves': num_leaves,
        'min_data_in_leaf': 100,
        'num_boost_round': num_boost_round,
        'cat_smooth': 200,
    }
    cfg.update(extras)
    cfg['feature_fraction'] = .7
    return cfg


params = {
    'fr_en': _nyu_params(256, 750),
    'en_es': _nyu_params(512, 650),
    'es_en': _nyu_params(512, 600),
    'all': _nyu_params(1024, 750, max_cat_threshold=64),
}

In [4]:
# --- LOAD THE PARQUETS ---
# Read the train split first, then the dev split; the test file is not loaded here.
train, dev = (pd.read_parquet(path) for path in (PARQUET_TRAIN, PARQUET_DEV))

In [5]:

# Optional: restrict both splits to one language pair (mimics the NYU 'lang' flag).
if LANG != "all":
    # LANG is "<l2>_<l1>", e.g. 'fr_en' -> l2 'fr', l1 'en'.
    l2_val, l1_val = LANG.split('_', 1)
    train_pair_mask = train['l2'].eq(l2_val) & train['l1'].eq(l1_val)
    dev_pair_mask = dev['l2'].eq(l2_val) & dev['l1'].eq(l1_val)
    train = train.loc[train_pair_mask]
    dev = dev.loc[dev_pair_mask]

# Optional: keep only the first N distinct users, in order of first appearance
# in train (matches the NYU "users" argument for quick tests).
if N_USERS is not None:
    keep_users = train['user'].drop_duplicates().head(N_USERS)
    train = train.loc[train['user'].isin(keep_users)]
    dev = dev.loc[dev['user'].isin(keep_users)]


In [6]:
# Target vectors for both splits, plus the identifier column of the dev rows.
y_train, y_dev = train[LABEL_COL], dev[LABEL_COL]
ids_dev = dev[ID_COL]


In [7]:
# --- BUILD FEATURE DICTS TO MATCH NYU STYLE ---
def with_lang(x, lang_prefix):
    """Return `x` as lower-cased text with `_<lang_prefix>` appended.

    NYU appends `_{lang}` to token/root values; this helper does the same.
    A null `x` (as judged by `pd.notnull`) yields `'_none__<lang_prefix>'`.
    """
    suffix = '_' + lang_prefix
    if pd.notnull(x):
        return str(x).lower() + suffix
    return '_none_' + suffix

def row_to_feat(row, lang_prefix):
    """Build one NYU-style feature dict from a single data row.

    Parameters
    ----------
    row : object supporting ``.get(key[, default])`` and ``row[key]``
        For example a ``pd.Series`` produced by ``DataFrame.iterrows`` or a dict.
    lang_prefix : str
        Language tag appended to the token/root values via ``with_lang``.

    Returns
    -------
    dict
        Always contains the string-valued keys ``'token'``, ``'root'`` and
        ``'user'``. For every non-null source field it additionally contains
        indicator keys (value ``1.0``) of the form ``'format:<v>'``,
        ``'session:<v>'``, ``'client:<v>'``, ``'part_of_speech:<v>'``,
        ``'dependency_label:<v>'``, ``'morphological_feature:<k>_<v>'``, and
        the float features ``'days'`` / ``'time'``.
    """
    d = {}

    # Core categoricals that NYU treats as categorical ids (later int-coded):
    # 'token', 'root', 'user'
    token_src = row.get('token', '')
    # 'uni_lemma' is the stand-in for 'root'. Fall back to the token both when
    # the key is absent AND when the lemma is null: `.get(key, default)` only
    # covers the absent-key case, so a NaN lemma used to become '_none__<lang>'.
    root_src = row.get('uni_lemma')
    if not pd.notnull(root_src):
        root_src = token_src
    d['token'] = with_lang(token_src, lang_prefix)
    d['root']  = with_lang(root_src, lang_prefix)
    d['user']  = str(row.get('user', ''))

    # Exercise-level "one-hot string" fields:
    # NYU uses 'format:xxx', 'session:xxx', 'client:xxx' keys
    if pd.notnull(row.get('format')):
        d[f'format:{row["format"]}'] = 1.0
    if pd.notnull(row.get('session')):
        d[f'session:{row["session"]}'] = 1.0
    if pd.notnull(row.get('client')):
        d[f'client:{row["client"]}'] = 1.0

    # Numeric exercise-level features
    if pd.notnull(row.get('days')):
        d['days'] = float(row['days'])
    if pd.notnull(row.get('time')):
        try:
            d['time'] = float(row['time'])
        except (TypeError, ValueError):
            # Best effort: a non-null value that cannot be converted to float
            # is skipped. Other, unexpected errors are no longer hidden.
            pass

    # Token-level: NYU adds part_of_speech, dependency_label one-hots
    if pd.notnull(row.get('token_pos')):
        d[f'part_of_speech:{row["token_pos"]}'] = 1.0
    if pd.notnull(row.get('token_dep_label')):
        d[f'dependency_label:{row["token_dep_label"]}'] = 1.0

    # Morphology: expand "key=value|key=value" -> "morphological_feature:key_value"
    morph = row.get('token_morph')
    if pd.notnull(morph):
        for kv in str(morph).split('|'):
            # Pieces without '=' carry no key/value pair and are ignored.
            if '=' in kv:
                k, v = kv.split('=', 1)
                d[f'morphological_feature:{k}_{v}'] = 1.0

    # (If you later add prev/next/parseroot tokens, include as:
    # d['prev_token']=..., d['next_token']=..., d['parseroot_token']=... )
    return d


In [9]:
# Show the column names available in the training frame.
train.columns.to_numpy()

array(['token_id', 'token', 'token_pos', 'token_morph', 'token_dep_label',
       'token_edges', 'token_wrong', 'block_id', 'prompt', 'user',
       'countries', 'days', 'client', 'session', 'format', 'time', 'l2',
       'l1', 'uni_lemma', 'category', 'growth_rate', 'median_aoa'],
      dtype=object)

In [17]:
# Language tag for the feature builders: the first two characters of LANG,
# or 'xx' when all language pairs are used together.
if LANG == 'all':
    lang_prefix = 'xx'
else:
    lang_prefix = LANG[:2]  # NYU uses 2-letter code

# The triple-quoted string below is never executed: it holds the disabled
# DictVectorizer-based pipeline as an inert string literal.
'''train_dicts = [row_to_feat(r, lang_prefix) for _, r in train.iterrows()]
dev_dicts   = [row_to_feat(r, lang_prefix) for _, r in dev.iterrows()]

# --- MAP BIG CATEGORICALS TO INTEGER IDS (NYU DOES THIS BEFORE DictVectorizer) ---
cat_keys = ['token', 'root', 'user']  # add 'prev_token','next_token','parseroot_token' if you later include them
value_maps = {k: {} for k in cat_keys}
for k in cat_keys:
    next_id = 0
    # combine train+dev to share the same id map
    for d in (train_dicts + dev_dicts):
        if k in d:
            v = d[k]
            if v not in value_maps[k]:
                value_maps[k][v] = next_id
                next_id += 1
            d[k] = value_maps[k][v]

# --- DICTVECTORIZER → SPARSE MATRICES (exactly like NYU) ---
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_dev   = dv.transform(dev_dicts)
feature_names = dv.feature_names_'''

# --- TRAIN LIGHTGBM (mirrors lightgbm_dev.py) ---
# Pick the parameter set for LANG; any LANG without its own entry uses 'all'.
if LANG in params:
    p = params[LANG]
else:
    p = params['all']
#d_train = lgb.Dataset(X_train, label=y_train)
#d_valid = lgb.Dataset(X_dev,   label=y_dev)

# Inert string: a wider candidate feature list that is not in use.
'''feature_cols = [
    # include categorical and numeric columns you actually want
    'token', 'root', 'user', 'prev_token', 'next_token', 'parseroot_token',
]'''

# Columns actually fed to the model.
feature_cols = ['token', 'block_id', 'user']

X_train, X_dev = train[feature_cols], dev[feature_cols]


# --- Encode categoricals as aligned integer codes for LightGBM ---
# Work on copies so the column assignments below never write into a view.
X_train = X_train.copy()
X_dev   = X_dev.copy()

# Both frames are built from the same feature list; fail loudly if that changes,
# because the per-column alignment below relies on it.
assert list(X_train.columns) == list(X_dev.columns), "train/dev feature columns differ"

# A column counts as categorical when it is object- or category-typed in EITHER
# split. The same set drives both the encoding loop and the list handed to
# LightGBM. Previously the list came from train only (object + category) while
# the loop covered object columns only, so the two could disagree.
cat_dtypes = ['object', 'category']
cat_like = (
    set(X_train.select_dtypes(include=cat_dtypes).columns)
    | set(X_dev.select_dtypes(include=cat_dtypes).columns)
)
obj_cols = X_train.columns[X_train.columns.isin(cat_like)]  # keeps feature order
categorical_cols = obj_cols.tolist()

for c in obj_cols:
    train_cat = X_train[c].astype('category')
    dev_cat   = X_dev[c].astype('category')
    # Joint category set: the same value gets the same integer code in both splits.
    cats = train_cat.cat.categories.union(dev_cat.cat.categories)
    # `.cat.codes` is -1 for missing values.
    X_train[c] = train_cat.cat.set_categories(cats).cat.codes.astype('int32')
    X_dev[c]   = dev_cat.cat.set_categories(cats).cat.codes.astype('int32')

In [18]:
# Wrap each split as a LightGBM Dataset; both declare the same categorical columns.
d_train, d_valid = (
    lgb.Dataset(features, label=target, categorical_feature=categorical_cols)
    for features, target in ((X_train, y_train), (X_dev, y_dev))
)

In [19]:
# Inert string: an earlier form of the training call, not executed.
'''bst = lgb.train(
    p, d_train, valid_sets=[d_train, d_valid],
    valid_names=['train','valid'],
    categorical_feature=[k for k in cat_keys if any(fn==k for fn in feature_names)],
    num_boost_round=p['num_boost_round'],
    verbose_eval=25
)'''

# `p` carries 'num_boost_round' as a dict entry. Handing it to LightGBM inside
# the params dict AND as the keyword argument is redundant and makes LightGBM
# warn that the dict value overrides the argument. Pass the round count once,
# through the argument, and give LightGBM a copy of the dict without that key.
# `p` itself is left untouched.
num_rounds = p['num_boost_round']
train_params = {k: v for k, v in p.items() if k != 'num_boost_round'}

bst = lgb.train(
    train_params,
    d_train,
    valid_sets=[d_train, d_valid],
    valid_names=['train', 'valid'],
    num_boost_round=num_rounds,
    callbacks=[lgb.log_evaluation(25)],  # log the metric every 25 rounds
)

# --- EVALUATE (AUC on dev) ---
dev_pred = bst.predict(X_dev)
auc = roc_auc_score(y_dev, dev_pred)
print(f"DEV AUC ({LANG}): {auc:.6f}")

[LightGBM] [Info] Number of positive: 330788, number of negative: 2292169
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013296 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3708
[LightGBM] [Info] Number of data points in the train set: 2622957, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.126113 -> initscore=-1.935776
[LightGBM] [Info] Start training from score -1.935776
[25]	train's auc: 0.745084	valid's auc: 0.71695
[50]	train's auc: 0.75404	valid's auc: 0.726334
[75]	train's auc: 0.755564	valid's auc: 0.727364
[100]	train's auc: 0.763055	valid's auc: 0.736583
[125]	train's auc: 0.764746	valid's auc: 0.738608
[150]	train's auc: 0.766615	valid's auc: 0.740992
[175]	train's auc: 0.76721	valid's auc: 0.741571
[200]	train's auc: 0.768135	valid's auc: 0.742442
[225]	train's auc: 0.768746	valid's auc: 0.7