In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# NOTE: this whole cell is a bare string literal, so none of the commands below run.
# It keeps the pinned-toolchain install recipe around for reference only.
'''
# 1. Install all three core libraries together, forcing the desired stable versions.
!pip install numpy==1.26.4 scipy==1.15.3 scikit-learn==1.7.2 --upgrade --force-reinstall

print("\nüîÑ Step 2: Installing XGBoost without touching core dependencies...")

# 2. Install XGBoost, but critically, use --no-deps to prevent it from replacing NumPy.
!pip install xgboost --upgrade --force-reinstall --no-deps

# We also need to reinstall your TabPFN dependency since it might have been uninstalled
# by one of the previous commands (and it was the original source of the sklearn error).
# Note: You may need to replace this path if you changed it earlier.
!pip install /kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl

print("\n‚úÖ Stable toolchain installation complete. **RESTART THE NOTEBOOK KERNEL NOW!**")
'''

# 1. Environment Setup

## Import Packages

In [None]:
import warnings
warnings.filterwarnings('ignore')
import gc
import ctypes
import os
import itertools
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import random
import pprint
import time
import copy
import lightgbm as lgb
import torch
import polars as pl
import optuna

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression,Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, PowerTransformer, RobustScaler, FunctionTransformer
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn import metrics
%matplotlib inline
from random import choice, choices
from functools import reduce, partial
from tqdm import tqdm
from itertools import cycle
from collections import Counter
from scipy import stats
from scipy.stats import skew, kurtosis
from transformers import BertTokenizer
from collections import Counter, defaultdict
from tqdm.autonotebook import tqdm
from math import sqrt
from sklearn import model_selection

def clean_memory():
    """Free memory held by Python, the C allocator and the CUDA caching allocator.

    Runs a full garbage collection, asks glibc to return freed heap pages to the
    OS, and empties the PyTorch CUDA cache. Returns None.
    """
    gc.collect()
    try:
        # malloc_trim is a glibc extension: on platforms without "libc.so.6"
        # (macOS, Windows, musl) loading or resolving it fails, and the trim is
        # only an optimisation, so it is skipped rather than aborting the cell.
        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except (OSError, AttributeError):
        pass
    torch.cuda.empty_cache()
clean_memory()

## Creating Dataframes of Kaggle Data

In [None]:
# Locations of the competition CSV files inside the Kaggle input directory.
TRAIN_LOGS   = "/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv"
TRAIN_SCORES = "/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv"
TEST_LOGS    = "/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv"
SAMPLE_SUB   = "/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv"

# Read each file into its own DataFrame, in the same order as the paths above.
df_train_logs, df_train_scores, df_test_logs, df_sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_LOGS, TRAIN_SCORES, TEST_LOGS, SAMPLE_SUB)
)

# Echo the paths that were loaded, one per line.
print(
    f"Train logs: {TRAIN_LOGS}",
    f"Train scores: {TRAIN_SCORES}",
    f"Test logs: {TEST_LOGS}",
    f"Sample submission: {SAMPLE_SUB}",
    sep="\n",
)

# 2. Feature Engineering

## Analysis (Function)

In [None]:
def analyse_data(df_orig):
    """
    Print a compact data-quality report for a DataFrame.

    The report covers missing values, per-column Python type consistency,
    negative numeric values, distinct-value counts and a few sample values of
    each object column. The frame is only read, never modified, so no copy is
    taken (the log frames passed in are large).

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Frame to inspect. May have zero rows.

    Returns
    -------
    None
    """
    df = df_orig
    print("üìä ANALYSING DATAFRAME\n")

    # 1. Missing values summary
    na_counts = df.isna().sum()
    total_missing = na_counts.sum()
    if total_missing > 0:
        print(f"üî∏ Missing values detected in {sum(na_counts > 0)} / {len(df.columns)} columns")
        print(na_counts[na_counts > 0].sort_values(ascending=False))
    else:
        print("‚úÖ No missing values found.")

    # 2. Data type consistency check: compare each cell's Python type with
    #    the most frequent type in its column.
    print("\nüß© Checking for inconsistent data types...")
    inconsistent_cols = []
    for column in df.columns:
        types = df[column].apply(type)
        type_modes = types.mode()
        if type_modes.empty:
            # A zero-row column has no majority type and cannot be inconsistent.
            continue
        majority_type = type_modes.iloc[0]
        anomaly_mask = types != majority_type
        if anomaly_mask.any():
            inconsistent_cols.append(column)
            num_anomalies = anomaly_mask.sum()
            print(f"‚ö†Ô∏è  {column}: {num_anomalies} anomalous entries (expected {majority_type.__name__})")
    if not inconsistent_cols:
        print("‚úÖ All columns have consistent data types.")

    # 3. Negative numeric values
    numeric_cols = df.select_dtypes(include=["number"])
    neg_mask = (numeric_cols < 0).any()
    neg_cols = neg_mask[neg_mask].index.tolist()
    if neg_cols:
        print(f"\n‚ö†Ô∏è Columns with negative values ({len(neg_cols)}): {neg_cols}")
    else:
        print("\n‚úÖ No negative values in numeric columns.")

    # 4. Distinct value counts
    nunique = df.nunique()
    print("\nüì¶ Distinct values summary:")
    print(nunique.describe()[['min', 'max']])
    # Only show the 10 columns with the most distinct values
    top_unique = nunique.sort_values(ascending=False).head(10)
    print("üîπ Top 10 columns by unique count:")
    print(top_unique)

    # 5. Sample string columns (at most 5 distinct values each)
    obj_cols = df.select_dtypes(include=["object"]).columns
    if len(obj_cols) > 0:
        print("\nüìù Sample entries from text columns:")
        for col in obj_cols:
            unique_vals = df[col].dropna().unique()
            sample_count = min(len(unique_vals), 5)
            print(f"‚Ä¢ {col}: {unique_vals[:sample_count]}")
    else:
        print("\n‚úÖ No object/string columns found.")

    # Final summary
    print("\nüìã Summary:")
    print(f"Rows: {df.shape[0]} | Columns: {df.shape[1]}")
    print("Analysis complete.\n")

## Transformation (Function)

In [None]:
def transform_data(df_orig):
    """
    Add normalised versions of the activity, key-event and text_change columns.

    Works on a copy of the input and adds four columns:

    * ``activity_trf``    - "Move From [a, b] To [c, d]" becomes ``move_<n>``,
      where n is the rounded Euclidean distance between the two bracketed
      pairs; "Remove/Cut" becomes "Cut"; everything else is unchanged.
    * ``down_event_trf`` / ``up_event_trf`` - any single alphanumeric
      character is collapsed to "q"; other values are unchanged.
    * ``text_change_trf`` - values containing "q" become ``q_add_<n>``,
      ``q_subtract_<n>`` or ``q_0`` according to the character delta; values
      without "q" are kept with every blank written out as "space".

    One sample row is displayed per stage for verification.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Must contain ``activity``, ``down_event``, ``up_event`` and
        ``text_change``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the four ``*_trf`` columns appended.
    """
    df = df_orig.copy()
    print("üîß Transforming dataset...")

    # ==========================================================
    # 1. Transform 'activity' column
    # ==========================================================
    move_pattern = re.compile(r'Move From \[(-?\d+), (-?\d+)\] To \[(-?\d+), (-?\d+)\]')

    def calculate_move_distance(activity):
        # Missing or non-text activities cannot be a move; pass them through
        # instead of letting the regex raise TypeError.
        if not isinstance(activity, str):
            return activity
        match = move_pattern.match(activity)
        if match is None:
            return activity
        x1, y1, x2, y2 = map(int, match.groups())
        distance = np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
        return f"move_{int(round(distance))}"

    df["activity_trf"] = df["activity"].apply(calculate_move_distance)
    # "Remove/Cut" contains the substring "move"; renaming it keeps later
    # substring-based move detection from matching it.
    df["activity_trf"] = df["activity_trf"].replace({"Remove/Cut": "Cut"})

    print("\n‚úÖ Sample transformed 'activity_trf':")
    display(df[["activity", "activity_trf"]].head(1))

    # ==========================================================
    # 2. Transform 'down_event' and 'up_event' columns
    # ==========================================================
    def transform_event(event):
        event_str = str(event)
        if len(event_str) == 1 and event_str.isalnum():
            return "q"
        return event

    df["down_event_trf"] = df["down_event"].apply(transform_event)
    df["up_event_trf"] = df["up_event"].apply(transform_event)

    print("‚úÖ Sample transformed 'down_event' & 'up_event':")
    display(df[["down_event", "down_event_trf", "up_event", "up_event_trf"]].head(1))

    # Quick distinct summary (short)
    distinct_counts = df[["down_event_trf", "up_event_trf"]].nunique()
    print("Distinct transformed event types:")
    print(distinct_counts.to_dict())

    # ==========================================================
    # 3. Transform 'text_change' column
    # ==========================================================
    def parse_text_change(val):
        raw = str(val)
        if "q" not in raw:
            # Non-letter changes keep their text; blanks are spelled out so a
            # lone " " stays distinguishable after later stripping.
            return raw.replace(" ", "space")
        # Lengths are measured on the raw text. Measuring after the
        # " " -> "space" substitution counted every inner blank as 5 characters.
        if "=>" in raw:
            before, after = raw.split("=>", 1)
            delta = len(after.strip()) - len(before.strip())
            if delta > 0:
                return f"q_add_{delta}"
            if delta < 0:
                return f"q_subtract_{abs(delta)}"
            return "q_0"
        delta = len(raw.strip())
        return f"q_add_{delta}" if delta > 0 else "q_0"

    df["text_change_trf"] = df["text_change"].apply(parse_text_change)

    print("\n‚úÖ Sample transformed 'text_change_trf':")
    display(df[["text_change", "text_change_trf"]].head(1))

    # ==========================================================
    # Final summary
    # ==========================================================
    print("\nüìã Transformation complete!")
    print(f"Rows: {df.shape[0]} | Columns: {df.shape[1]}")
    print(f"New columns added: activity_trf, down_event_trf, up_event_trf, text_change_trf\n")

    return df

## Clean (Function)

In [None]:
def clean_data(df_orig):
    """
    Normalise the object-dtype (text) columns of a DataFrame.

    For every object column the non-missing values are converted to ``str``,
    lower-cased and stripped of leading/trailing whitespace. Missing values are
    left missing; previously ``astype(str)`` turned them into the literal text
    "nan"/"none", indistinguishable from real data. One sample row is displayed
    after cleaning.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Frame to clean; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of the input.
    """
    df = df_orig.copy()
    print("üßπ Cleaning data...")

    obj_cols = df.select_dtypes(include=["object"]).columns.tolist()
    if not obj_cols:
        print("‚úÖ No object-type columns found ‚Äî nothing to clean.")
        return df

    # Apply transformations
    for col in obj_cols:
        original = df[col]
        normalised = original.astype(str).str.lower().str.strip()
        # Keep the normalised text only where a value was actually present.
        df[col] = normalised.where(original.notna(), original)

    # Show one sample row to confirm cleaning
    print(f"‚úÖ Cleaned {len(obj_cols)} text columns.")
    print("üìã Sample after cleaning:")
    display(df[obj_cols].head(1))

    print(f"Rows: {df.shape[0]} | Columns: {df.shape[1]}\n")
    return df

## Aggregation (Function)

In [None]:
def aggregate_data(df_orig):
    """
    Aggregate event-level writing logs into one feature row per essay.

    Expects the columns ``id``, ``event_id``, ``down_time``, ``up_time``,
    ``action_time``, ``activity_trf``, ``text_change_trf``,
    ``cursor_position`` and ``word_count``. The input is not modified. Progress
    is printed and one sample row is displayed after each block.

    Differences from the previous implementation:

    * ``q_activity_ratio`` is now the share of events whose text change
      starts with "q". It used to add the "q" and non-"q" counts, which
      always equals the event count, so the feature was a constant 1.
    * ``words_per_second`` converts the elapsed time to seconds first. The
      down/up timestamps are treated as milliseconds, as elsewhere in this
      notebook where gaps are divided by 1000 to obtain seconds.
    * Per-activity and per-essay counts use vectorised boolean sums instead
      of ``groupby.apply`` with Python lambdas.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Transformed and cleaned event log.

    Returns
    -------
    pandas.DataFrame
        One row per essay with ``id`` as a regular column.
    """
    print("üßÆ Aggregating essay-level behavioral features...")

    # ==========================================================
    # 1. SORT, DERIVE EVENT-LEVEL COLUMNS, GROUP
    # ==========================================================
    # sort_values returns a new frame, so the caller's data stays untouched.
    df_sorted = df_orig.sort_values(by=["id", "event_id"]).reset_index(drop=True)

    print("‚úÖ Data sorted and grouped by 'id'.")
    print(f"Rows: {df_sorted.shape[0]} | Columns: {df_sorted.shape[1]}")

    def parse_q_change(val):
        # Signed character delta encoded in "q_add_<n>" / "q_subtract_<n>".
        if isinstance(val, str):
            if val.startswith("q_add_"):
                return int(val.split("_")[-1])
            elif val.startswith("q_subtract_"):
                return -int(val.split("_")[-1])
        return 0

    move_condition = df_sorted["activity_trf"].str.contains("move", case=False, na=False)
    starts_with_q = df_sorted["text_change_trf"].str.startswith("q", na=False).astype(bool)
    # Derived columns are added before grouping so the groupby object never
    # has to pick up columns created after it.
    df_sorted["move_distance"] = (
        df_sorted["activity_trf"].str.extract(r"move_(\d+)", expand=False).astype(float)
    )
    df_sorted["q_delta"] = df_sorted["text_change_trf"].apply(parse_q_change)

    essay_ids = df_sorted["id"]
    g = df_sorted.groupby("id")

    # ==========================================================
    # 2. BASIC EVENT FEATURES
    # ==========================================================
    out = g["event_id"].count().to_frame("total_events")
    out["writing_start"] = g["down_time"].min()
    out["writing_end"] = g["up_time"].max()
    out["total_time_spent_on_essay"] = out["writing_end"] - out["writing_start"]
    out["mean_action_time"] = g["action_time"].mean()
    out["sum_action_time"] = g["action_time"].sum()

    print("üïí Computed basic timing and event features.")
    display(out.head(1))

    # ==========================================================
    # 3. ACTIVITY COUNTS (MOVE VS NON-MOVE)
    # ==========================================================
    out["non_move_count"] = (~move_condition).groupby(essay_ids).sum()
    out["move_count"] = move_condition.groupby(essay_ids).sum()

    # One count column per non-move activity label present in this frame.
    non_move_activity_counts = df_sorted.loc[~move_condition, "activity_trf"].unique()
    for activity in non_move_activity_counts:
        out[f"count_{activity}"] = (df_sorted["activity_trf"] == activity).groupby(essay_ids).sum()

    print(f"üß≠ Added move/non-move activity stats ({len(non_move_activity_counts)} activity types).")
    display(out.head(1))

    # ==========================================================
    # 4. MOVE DISTANCE STATS
    # ==========================================================
    move_distance_stats = g["move_distance"].agg(
        sum_move_distance="sum",
        mean_move_distance="mean"
    )
    out = out.join(move_distance_stats)
    # Essays without any move have an undefined mean; report 0 instead.
    out.loc[out["move_count"] == 0, "mean_move_distance"] = 0
    out = out.fillna({"sum_move_distance": 0, "mean_move_distance": 0})

    print("üìè Computed move distance statistics.")
    display(out.head(1))

    # ==========================================================
    # 5. TEXT CHANGE DYNAMICS
    # ==========================================================
    out["non_q_tc_count"] = (~starts_with_q).groupby(essay_ids).sum()
    out["q_tc_count"] = starts_with_q.groupby(essay_ids).sum()
    out["q_overall_delta"] = g["q_delta"].sum()

    print("‚úèÔ∏è Extracted text-change and q-delta features.")
    display(out.head(1))

    # ==========================================================
    # 6. CURSOR + WORD COUNT STATS
    # ==========================================================
    out["mean_cursor"] = g["cursor_position"].mean()
    out["std_cursor"] = g["cursor_position"].std()
    out["max_cursor"] = g["cursor_position"].max()

    out["final_word_count"] = g["word_count"].last()
    out["max_word_count"] = g["word_count"].max()
    out["min_word_count"] = g["word_count"].min()
    out["std_word_count"] = g["word_count"].std()

    print("üñ±Ô∏è Added cursor and word count stats.")
    display(out.head(1))

    # ==========================================================
    # 7. DERIVED BEHAVIORAL RATIOS
    # ==========================================================
    # Denominators are clipped to at least 1 to avoid division by zero.
    events = out["total_events"].clip(lower=1)
    final_words = out["final_word_count"].clip(lower=1)
    elapsed = out["total_time_spent_on_essay"].clip(lower=1)

    out["words_per_event"] = out["final_word_count"] / events
    # Elapsed time is in timestamp units (milliseconds); convert to seconds.
    out["words_per_second"] = out["final_word_count"] / (elapsed / 1000)

    # .get(..., default) covers activity types absent from this frame.
    out["edit_intensity"] = (
        out.get("count_cut", 0) + out.get("count_replace", 0) + out.get("count_nonproduction", 0)
    ) / events

    out["revision_ratio"] = (
        out.get("count_cut", 0) + out.get("count_replace", 0)
    ) / (out.get("count_input", 1) + 1)

    out["net_char_change_ratio"] = out["q_overall_delta"] / final_words
    out["q_activity_ratio"] = out["q_tc_count"] / events

    out["cursor_movement_intensity"] = out["sum_move_distance"] / events
    out["avg_move_distance"] = out.get("mean_move_distance", 0)
    out["word_var_ratio"] = out["std_word_count"] / final_words
    out["time_per_word"] = out["total_time_spent_on_essay"] / final_words
    out["time_per_event"] = out["total_time_spent_on_essay"] / events

    out = out.replace([np.inf, -np.inf], 0).fillna(0)

    # Ensure ID is a column, not the index
    if out.index.name == "id":
        out = out.reset_index()

    print("‚öôÔ∏è Derived higher-level behavioral ratios.")
    display(out.head(1))

    # ==========================================================
    # SUMMARY
    # ==========================================================
    print("\n‚úÖ Aggregation complete!")
    print(f"Final shape: {out.shape[0]} rows √ó {out.shape[1]} columns")

    return out

In [None]:
# Each stage function works on its own copy (or a freshly sorted frame) of the
# frame it receives, so no extra .copy() is taken at the call sites; the
# redundant copies doubled peak memory on the large log frame.

# ==========================================================
#  STEP 1 -> ANALYSE RAW DATA
# ==========================================================
print("Step 1: Analysing data...")
df_train_logs_analysis = df_train_logs.copy()
analyse_data(df_train_logs_analysis)
print(f"‚Üí Shape after Step 1: {df_train_logs_analysis.shape}")

# ==========================================================
#  STEP 2 -> TRANSFORM DATA
# ==========================================================
print("\nStep 2: Transforming columns...")
df_train_logs_transformed = transform_data(df_train_logs_analysis)
print(f"‚Üí Shape after Step 2: {df_train_logs_transformed.shape}")

# ==========================================================
#  STEP 3 -> CLEAN DATA
# ==========================================================
print("\nStep 3: Cleaning data...")
df_train_logs_cleaned = clean_data(df_train_logs_transformed)
print(f"‚Üí Shape after Step 3: {df_train_logs_cleaned.shape}")

# ==========================================================
#  STEP 4 -> AGGREGATE EVENT-LEVEL FEATURES (Essay-Level)
# ==========================================================
print("\nStep 4: Aggregating event-level features...")
df_train_agg_logs = aggregate_data(df_train_logs_cleaned)
print(f"‚Üí Shape after Step 4: {df_train_agg_logs.shape}")

# ==========================================================
#  FINAL SUMMARY
# ==========================================================
print("\n‚úÖ Preprocessing pipeline completed successfully!")
print(f"Final dataset shape: {df_train_agg_logs.shape}")
display(df_train_agg_logs.head(3))

In [None]:
# Same four stages as for the training logs, applied to the test logs. Each
# stage function works on its own copy (or a freshly sorted frame) of the frame
# it receives, so no extra .copy() is taken at the call sites.

# ==========================================================
#  STEP 1 -> ANALYSE RAW DATA (Test Set)
# ==========================================================
print("Step 1: Analysing data...")
df_test_logs_analysis = df_test_logs.copy()
analyse_data(df_test_logs_analysis)
print(f"‚Üí Shape after Step 1: {df_test_logs_analysis.shape}")

# ==========================================================
#  STEP 2 -> TRANSFORM DATA (Test Set)
# ==========================================================
print("\nStep 2: Transforming columns...")
df_test_logs_transformed = transform_data(df_test_logs_analysis)
print(f"‚Üí Shape after Step 2: {df_test_logs_transformed.shape}")

# ==========================================================
#  STEP 3 -> CLEAN DATA (Test Set)
# ==========================================================
print("\nStep 3: Cleaning data...")
df_test_logs_cleaned = clean_data(df_test_logs_transformed)
print(f"‚Üí Shape after Step 3: {df_test_logs_cleaned.shape}")

# ==========================================================
#  STEP 4 -> AGGREGATE EVENT-LEVEL FEATURES (Essay-Level) (Test Set)
# ==========================================================
print("\nStep 4: Aggregating event-level features...")
df_test_agg_logs = aggregate_data(df_test_logs_cleaned)
print(f"‚Üí Shape after Step 4: {df_test_agg_logs.shape}")

# ==========================================================
#  FINAL SUMMARY (Test Set)
# ==========================================================
print("\n‚úÖ Preprocessing pipeline completed successfully for the test set!")
print(f"Final dataset shape: {df_test_agg_logs.shape}")
display(df_test_agg_logs.head(3))

## Reconstruction (Function)

In [None]:
import textwrap
from tqdm import tqdm
import pandas as pd

def getEssays(df, show_first=True):
    """
    Reconstruct the essay text of every writer by replaying the event log.

    "Nonproduction" events are ignored. The remaining events are applied to an
    initially empty string in the order they appear in ``df``:

    * Replace      - ``text_change`` is "<before> => <after>"; the ``before``
      span ending at the cursor is swapped for ``after``.
    * Paste / any other activity (e.g. Input) - ``text_change`` is inserted
      so that it ends at ``cursor_position``.
    * Remove/Cut   - ``len(text_change)`` characters after the cursor are
      removed.
    * Move From [a, b] To [c, d] - the slice [a:b] is relocated to start at c.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id``, ``activity``, ``cursor_position`` and
        ``text_change``.
    show_first : bool, default True
        When true, print the first 1000 characters of the first essay.

    Returns
    -------
    pandas.Series
        Essay texts named ``essay_text``, indexed by essay id in order of first
        appearance.
    """
    text_df = df[['id', 'activity', 'cursor_position', 'text_change']].copy()
    text_df = text_df[text_df.activity != 'Nonproduction']
    grouped = text_df.groupby('id', sort=False)

    essays = {}

    print(f"üß† Reconstructing {len(grouped)} essays...")
    for essay_id, group in tqdm(grouped, total=len(grouped), desc="Processing essays"):
        essay_text = ""
        events = group[['activity', 'cursor_position', 'text_change']].values

        for activity, cursor_pos, text_change in events:
            if activity == 'Replace':
                # Split on the first separator only: the replacement text may
                # itself contain " => ", which used to raise ValueError here.
                before, after = text_change.split(' => ', 1)
                start = cursor_pos - len(after)
                essay_text = essay_text[:start] + after + essay_text[start + len(before):]
            elif activity == 'Remove/Cut':
                essay_text = essay_text[:cursor_pos] + essay_text[cursor_pos + len(text_change):]
            elif activity != 'Paste' and "Move" in activity:
                # Drop the leading "Move From " (10 characters) and parse
                # "[a, b] To [c, d]" into four integers.
                cropped = activity[10:]
                start, end = [seg.split(', ') for seg in cropped.split(' To ')]
                src_start, src_end = int(start[0][1:]), int(start[1][:-1])
                dst_start, dst_end = int(end[0][1:]), int(end[1][:-1])
                if src_start < dst_start:
                    essay_text = (essay_text[:src_start] + essay_text[src_end:dst_end]
                                  + essay_text[src_start:src_end] + essay_text[dst_end:])
                elif src_start > dst_start:
                    essay_text = (essay_text[:dst_start] + essay_text[src_start:src_end]
                                  + essay_text[dst_start:src_start] + essay_text[src_end:])
            else:
                # Paste and plain input: insert the text so it ends at the cursor.
                start = cursor_pos - len(text_change)
                essay_text = essay_text[:start] + text_change + essay_text[start:]

        essays[essay_id] = essay_text

    essays_series = pd.Series(essays, name='essay_text')

    # Show only the first essay's text
    if show_first and not essays_series.empty:
        first_id = essays_series.index[0]
        print(f"\nüìù First reconstructed essay (ID: {first_id}):\n")
        print(textwrap.fill(essays_series.iloc[0][:1000], width=100))
        print("\n-----------------------------------------------\n")

    return essays_series

## Silver Bullet

In [None]:
import polars as pl
import pandas as pd
import numpy as np
import re
from lightgbm import LGBMRegressor
from sklearn.model_selection import StratifiedKFold, KFold
from scipy.stats import skew, kurtosis
import warnings
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [None]:
# Numeric log columns that dev_feats summarises per essay (mean, std, median, min, max, quantile).
num_cols = ['down_time', 'up_time', 'action_time', 'cursor_position', 'word_count']
# Value lists passed to count_by_values by dev_feats. The position of a value in its
# list becomes part of the feature name ("<column>_<position>_cnt"), so the order
# must not change between the train and test runs.
activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
text_changes = ['q', ' ', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']


def count_by_values(df, colname, values):
    """Count, per essay id, how often ``colname`` equals each entry of ``values``.

    Parameters
    ----------
    df : polars.DataFrame
        Event log with an ``id`` column and the column ``colname``.
    colname : str
        Column whose values are counted.
    values : sequence
        Values to count. The i-th value produces the column
        ``f"{colname}_{i}_cnt"``.

    Returns
    -------
    polars.DataFrame
        One row per id, in order of first appearance, with one count column per
        value.
    """
    counters = [
        pl.col(colname).is_in([value]).sum().alias(f'{colname}_{i}_cnt')
        for i, value in enumerate(values)
    ]
    if not counters:
        # Nothing to count: return just the ids.
        return df.select(pl.col('id').unique(maintain_order=True))
    # One aggregation pass instead of one group_by plus one join per value.
    return df.group_by('id', maintain_order=True).agg(counters)


def dev_feats(df):
    """Build essay-level features from a polars frame of keystroke-log events.

    Feature groups, each joined onto the per-id frame with a left join:
    value counts for activity / text_change / down_event / up_event, statistics
    of the lengths of typed "q" runs, summaries of the numeric columns,
    distinct-value counts of the categorical columns, idle-time (pause)
    statistics, P-bursts (runs of events with a pause under 2 s) and R-bursts
    (runs of consecutive 'Remove/Cut' events).

    Burst run ids are computed within each essay (``.over('id')``) and run
    lengths are counted per (id, run). Previously both were computed over the
    whole frame, so a run at the end of one essay merged with a run at the
    start of the next one and inflated both essays' burst lengths.

    Parameters
    ----------
    df : polars.DataFrame
        Event log with the columns ``id``, ``activity``, ``down_event``,
        ``up_event``, ``text_change`` and the columns listed in ``num_cols``.
        Rows are assumed to be in event order within each id - TODO confirm
        against the caller.

    Returns
    -------
    polars.DataFrame
        One row per id.
    """
    print("< Count by values features >")

    feats = count_by_values(df, 'activity', activities)
    feats = feats.join(count_by_values(df, 'text_change', text_changes), on='id', how='left')
    feats = feats.join(count_by_values(df, 'down_event', events), on='id', how='left')
    feats = feats.join(count_by_values(df, 'up_event', events), on='id', how='left')

    print("< Input words stats features >")

    # Concatenate every non-replace text change of an essay and extract the
    # runs of "q"; each run is treated as one typed word.
    temp = df.filter((~pl.col('text_change').str.contains('=>')) & (pl.col('text_change') != 'NoChange'))
    temp = temp.group_by('id').agg(pl.col('text_change').str.concat('').str.extract_all(r'q+'))
    temp = temp.with_columns(
        input_word_count = pl.col('text_change').list.len(), # changed from .lengths()
        input_word_length_mean = pl.col('text_change').map_elements(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0), return_dtype=pl.Float64), # changed from .apply()
        input_word_length_max = pl.col('text_change').map_elements(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0), return_dtype=pl.Float64),
        input_word_length_std = pl.col('text_change').map_elements(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0), return_dtype=pl.Float64),
        input_word_length_median = pl.col('text_change').map_elements(lambda x: np.median([len(i) for i in x] if len(x) > 0 else 0), return_dtype=pl.Float64),
        input_word_length_skew = pl.col('text_change').map_elements(lambda x: skew([len(i) for i in x] if len(x) > 0 else 0), return_dtype=pl.Float64)
    )
    temp = temp.drop('text_change')
    feats = feats.join(temp, on='id', how='left')

    print("< Numerical columns features >")

    temp = df.group_by("id").agg(
        pl.sum('action_time').alias('action_time_sum'), # alias() instead of suffix()

        *[pl.mean(c).alias(f'{c}_mean') for c in num_cols],
        *[pl.std(c).alias(f'{c}_std') for c in num_cols],
        *[pl.median(c).alias(f'{c}_median') for c in num_cols],
        *[pl.min(c).alias(f'{c}_min') for c in num_cols],
        *[pl.max(c).alias(f'{c}_max') for c in num_cols],
        *[pl.quantile(c, 0.5).alias(f'{c}_quantile') for c in num_cols],
    )
    feats = feats.join(temp, on='id', how='left')

    print("< Categorical columns features >")

    temp  = df.group_by("id").agg(pl.n_unique(['activity', 'down_event', 'up_event', 'text_change']))
    feats = feats.join(temp, on='id', how='left')

    print("< Idle time features >")

    # Gap between the previous event's up_time and this event's down_time
    # within the same essay, divided by 1000 (presumably ms -> s; the bucket
    # names below are in seconds). The first event of an essay has no
    # predecessor and gets a gap of 0. Only Input / Remove/Cut events are kept.
    # This frame is shared by the idle-time and P-burst features.
    pauses = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
    pauses = pauses.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
    pauses = pauses.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))

    # NOTE: the bucket bounds are strict on both sides, so a gap of exactly
    # 1, 1.5, 2 or 3 falls into no bucket.
    temp = pauses.group_by("id").agg(inter_key_largest_lantency = pl.max('time_diff'),
                                     inter_key_median_lantency = pl.median('time_diff'),
                                     mean_pause_time = pl.mean('time_diff'),
                                     std_pause_time = pl.std('time_diff'),
                                     total_pause_time = pl.sum('time_diff'),
                                     pauses_half_sec = pl.col('time_diff').filter((pl.col('time_diff') > 0.5) & (pl.col('time_diff') < 1)).count(),
                                     pauses_1_sec = pl.col('time_diff').filter((pl.col('time_diff') > 1) & (pl.col('time_diff') < 1.5)).count(),
                                     pauses_1_half_sec = pl.col('time_diff').filter((pl.col('time_diff') > 1.5) & (pl.col('time_diff') < 2)).count(),
                                     pauses_2_sec = pl.col('time_diff').filter((pl.col('time_diff') > 2) & (pl.col('time_diff') < 3)).count(),
                                     pauses_3_sec = pl.col('time_diff').filter(pl.col('time_diff') > 3).count(),)
    feats = feats.join(temp, on='id', how='left')

    print("< P-bursts features >")

    temp = pauses.with_columns((pl.col("time_diff") < 2).alias("is_burst"))
    # Run ids restart for every essay so runs never span two ids.
    temp = temp.with_columns(pl.col("is_burst").rle_id().over("id").alias("burst_id"))
    temp = temp.with_columns(pl.count().over(["id", "burst_id"]).alias("P-bursts"))
    temp = temp.filter(pl.col("is_burst") == True)
    # Drops rows with a null in any column, which includes each essay's first
    # event (its up_time_lagged is null).
    temp = temp.drop_nulls()
    temp = temp.group_by("id").agg(
        pl.mean("P-bursts").alias("P-bursts_mean"),
        pl.std("P-bursts").alias("P-bursts_std"),
        pl.count("P-bursts").alias("P-bursts_count"),
        pl.median("P-bursts").alias("P-bursts_median"),
        pl.max("P-bursts").alias("P-bursts_max"),
        pl.first("P-bursts").alias("P-bursts_first"),
        pl.last("P-bursts").alias("P-bursts_last"),
    )
    feats = feats.join(temp, on='id', how='left')

    print("< R-bursts features >")

    temp = df.filter(pl.col("activity").is_in(['Input', 'Remove/Cut']))
    temp = temp.with_columns((pl.col("activity") == 'Remove/Cut').alias("is_remove"))
    # Same per-essay run numbering as for the P-bursts.
    temp = temp.with_columns(pl.col("is_remove").rle_id().over("id").alias("remove_id"))
    temp = temp.with_columns(pl.count().over(["id", "remove_id"]).alias("R-bursts"))
    temp = temp.filter(pl.col("is_remove"))
    temp = temp.group_by("id").agg(
        pl.mean("R-bursts").alias("R-bursts_mean"),
        pl.std("R-bursts").alias("R-bursts_std"),
        pl.median("R-bursts").alias("R-bursts_median"),
        pl.max("R-bursts").alias("R-bursts_max"),
        pl.first("R-bursts").alias("R-bursts_first"),
        pl.last("R-bursts").alias("R-bursts_last")
    )
    feats = feats.join(temp, on='id', how='left')

    return feats


In [None]:
# Quantile helpers for pandas .agg(). They are named functions rather than
# lambdas because the function name becomes part of the output column name.
def q1(x):
    """Return the 10th percentile of ``x``."""
    return x.quantile(q=0.1)


def q2(x):
    """Return the 25th percentile of ``x``."""
    return x.quantile(q=0.25)


def q7(x):
    """Return the 75th percentile of ``x``."""
    return x.quantile(q=0.75)


def q9(x):
    """Return the 90th percentile of ``x``."""
    return x.quantile(q=0.90)


# Aggregations applied to the word and sentence length columns.
AGGREGATIONS = [
    'count', 'mean', 'min', 'max', 'first', 'last',
    q2, 'median', q7,
    'sum',
]

def reconstruct_essay(currTextInput):
    """Replay one essay's events and return the resulting text.

    ``currTextInput`` is a DataFrame whose first three columns are, in order,
    the activity, the cursor position after the event and the text change.
    Events are applied to an initially empty string in row order.
    """
    text = ""
    for row in currTextInput.values:
        activity, cursor, change = row[0], row[1], row[2]

        if activity == 'Replace':
            # "<old> => <new>": swap the old span, which ends at the cursor, for the new one.
            pieces = change.split(' => ')
            old, new = pieces[0], pieces[1]
            begin = cursor - len(new)
            text = text[:begin] + new + text[begin + len(old):]
        elif activity == 'Remove/Cut':
            text = text[:cursor] + text[cursor + len(change):]
        elif activity != 'Paste' and "M" in activity:
            # "Move From [a, b] To [c, d]": strip the 10-character prefix and
            # read the four integers.
            source, target = (part.split(', ') for part in activity[10:].split(' To '))
            a, b = int(source[0][1:]), int(source[1][:-1])
            c, d = int(target[0][1:]), int(target[1][:-1])
            if a < c:
                text = text[:a] + text[b:d] + text[a:b] + text[d:]
            elif a > c:
                text = text[:c] + text[a:b] + text[c:a] + text[b:]
        else:
            # Paste and ordinary input insert the text so that it ends at the cursor.
            begin = cursor - len(change)
            text = text[:begin] + change + text[begin:]
    return text


def get_essay_df(df):
    """Return a frame with one reconstructed essay per ``id``.

    Rows whose ``activity`` equals 'Nonproduction' are discarded; the remaining
    rows of every ``id`` are passed (columns ``activity``, ``cursor_position``,
    ``text_change``) to ``reconstruct_essay``. The result has the columns ``id``
    and ``essay``, built by merging the essays onto the list of unique ids of
    the kept rows.
    """
    replay_columns = ['activity', 'cursor_position', 'text_change']
    production = df[df['activity'] != 'Nonproduction']

    def _rebuild(events):
        return reconstruct_essay(events[replay_columns])

    essays = production.groupby('id').apply(_rebuild).rename('essay')
    id_frame = pd.DataFrame({'id': production['id'].unique().tolist()})
    return id_frame.merge(essays, on='id')


def word_feats(df):
    """Aggregate word-length statistics per essay.

    Each essay is split on spaces, newlines, '.', '?' and '!'; empty pieces are
    discarded and the lengths of the remaining pieces are aggregated per ``id``
    with every entry of ``AGGREGATIONS``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id`` and ``essay``. It is not modified: the word
        lists are attached to a new frame instead of being written back into
        the caller's table.

    Returns
    -------
    pandas.DataFrame
        One row per id with columns named ``word_len_<aggregation>`` followed
        by ``id``.
    """
    # assign() returns a new frame, so the caller's essay table keeps its columns.
    words = df.assign(
        word=df['essay'].apply(lambda x: re.split(' |\\n|\\.|\\?|\\!', x))
    ).explode('word')
    words['word_len'] = words['word'].apply(len)
    words = words[words['word_len'] != 0]

    word_agg_df = words[['id', 'word_len']].groupby(['id']).agg(AGGREGATIONS)
    # Flatten the (column, aggregation) MultiIndex into single labels.
    word_agg_df.columns = ['_'.join(x) for x in word_agg_df.columns]
    word_agg_df['id'] = word_agg_df.index
    word_agg_df = word_agg_df.reset_index(drop=True)
    return word_agg_df


def sent_feats(df):
    """Aggregate sentence-length statistics per essay.

    Essays are split on '.', '?' and '!'. Newlines are removed from each piece
    and it is stripped; pieces that end up empty are discarded. Character
    length and space-separated piece count are aggregated per ``id`` with every
    entry of ``AGGREGATIONS``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id`` and ``essay``. It is not modified: the
        sentence lists are attached to a new frame instead of being written
        back into the caller's table.

    Returns
    -------
    pandas.DataFrame
        One row per id. ``sent_len_count`` is exposed as ``sent_count`` and the
        redundant ``sent_word_count_count`` is removed; ``id`` is a column.
    """
    # assign() returns a new frame, so the caller's essay table keeps its columns.
    sents = df.assign(
        sent=df['essay'].apply(lambda x: re.split('\\.|\\?|\\!', x))
    ).explode('sent')
    sents['sent'] = sents['sent'].apply(lambda x: x.replace('\n', '').strip())
    # Number of characters in sentences
    sents['sent_len'] = sents['sent'].apply(len)
    # Number of words in sentences
    sents['sent_word_count'] = sents['sent'].apply(lambda x: len(x.split(' ')))
    sents = sents[sents.sent_len != 0].reset_index(drop=True)

    sent_agg_df = pd.concat([sents[['id', 'sent_len']].groupby(['id']).agg(AGGREGATIONS),
                             sents[['id', 'sent_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1)
    # Flatten the (column, aggregation) MultiIndex into single labels.
    sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    # Both count columns are computed over the same rows, so only one is kept.
    sent_agg_df = sent_agg_df.drop(columns=["sent_word_count_count"])
    sent_agg_df = sent_agg_df.rename(columns={"sent_len_count": "sent_count"})
    return sent_agg_df


def parag_feats(df):
    """Aggregate paragraph-length statistics per essay.

    Essays are split on newline characters; empty paragraphs are discarded.
    Character length and space-separated piece count are aggregated per ``id``
    with every entry of ``AGGREGATIONS``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id`` and ``essay``. It is not modified: the
        paragraph lists are attached to a new frame instead of being written
        back into the caller's table.

    Returns
    -------
    pandas.DataFrame
        One row per id. ``paragraph_len_count`` is exposed as
        ``paragraph_count`` and the redundant ``paragraph_word_count_count`` is
        removed; ``id`` is a column.
    """
    # assign() returns a new frame, so the caller's essay table keeps its columns.
    parags = df.assign(
        paragraph=df['essay'].apply(lambda x: x.split('\n'))
    ).explode('paragraph')
    # Number of characters in paragraphs
    parags['paragraph_len'] = parags['paragraph'].apply(len)
    # Number of words in paragraphs
    parags['paragraph_word_count'] = parags['paragraph'].apply(lambda x: len(x.split(' ')))
    parags = parags[parags.paragraph_len != 0].reset_index(drop=True)

    paragraph_agg_df = pd.concat([parags[['id', 'paragraph_len']].groupby(['id']).agg(AGGREGATIONS),
                                  parags[['id', 'paragraph_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1)
    # Flatten the (column, aggregation) MultiIndex into single labels.
    paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    # Both count columns are computed over the same rows, so only one is kept.
    paragraph_agg_df = paragraph_agg_df.drop(columns=["paragraph_word_count_count"])
    paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count": "paragraph_count"})
    return paragraph_agg_df

def product_to_keys(logs, essays):
    """Ratio of final essay length to the number of producing keystrokes.

    Parameters
    ----------
    logs : pandas.DataFrame
        Needs the columns ``id`` and ``activity``. Rows whose activity is
        'Input' or 'Remove/Cut' are counted per id as ``keys_pressed``.
    essays : pandas.DataFrame
        Needs the columns ``id`` and ``essay``. It is not modified: the helper
        column ``product_len`` is added to a new frame rather than to the
        caller's table.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``product_to_keys`` (``len(essay) / keys_pressed``).
        Ids without any counted keystroke get NaN because of the left merge.
    """
    # assign() returns a new frame, so the caller's essay table keeps its columns.
    essays = essays.assign(product_len=essays.essay.str.len())
    keys_pressed = (
        logs[logs.activity.isin(['Input', 'Remove/Cut'])]
        .groupby(['id'])
        .agg({'activity': 'count'})
        .reset_index()
        .rename(columns={'activity': 'keys_pressed'})
    )
    essays = essays.merge(keys_pressed, on='id', how='left')
    essays['product_to_keys'] = essays['product_len'] / essays['keys_pressed']
    return essays[['id', 'product_to_keys']]

def get_keys_pressed_per_second(logs):
    """Keystrokes per second for every id.

    The numerator counts ``event_id`` over rows whose ``activity`` is 'Input'
    or 'Remove/Cut'. The denominator is the span between the smallest
    ``down_time`` and the largest ``up_time`` over *all* rows of the id,
    divided by 1000.

    Returns a frame with the columns ``id`` and ``keys_per_second``.
    """
    is_keystroke = logs['activity'].isin(['Input', 'Remove/Cut'])
    key_counts = (
        logs[is_keystroke]
        .groupby(['id'])
        .agg(keys_pressed=('event_id', 'count'))
        .reset_index()
    )
    time_bounds = (
        logs.groupby(['id'])
        .agg(min_down_time=('down_time', 'min'), max_up_time=('up_time', 'max'))
        .reset_index()
    )
    rates = key_counts.merge(time_bounds, on='id', how='left')
    elapsed_seconds = (rates['max_up_time'] - rates['min_down_time']) / 1000
    rates['keys_per_second'] = rates['keys_pressed'] / elapsed_seconds
    return rates[['id', 'keys_per_second']]


def target_encoding(train_df, scores, feature):
    """Mean score per value of ``feature``, ignoring rare values.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Needs the columns ``id`` and ``feature``. It is not modified; in
        particular an existing ``target`` column is left untouched (the score
        lookup lives in a separate Series instead of a temporary column that is
        added and dropped in place).
    scores : pandas.DataFrame
        ``dict(scores.values)`` must yield an id -> score mapping, i.e. each
        row is read as an (id, score) pair.
    feature : str
        Column of ``train_df`` whose values are encoded.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``f'{feature}_mean_target'``. Values occurring
        three times or fewer get NaN.
    """
    # Step 1: look up each row's score by id.
    target = train_df['id'].map(dict(scores.values))

    value_counts = train_df[feature].value_counts()
    rare_values = value_counts[value_counts <= 3].index
    # Rows carrying a rare value contribute NaN, so the mean of such a value is NaN.
    target.loc[train_df[feature].isin(rare_values)] = np.nan

    # Step 2: mean score for each value of the feature.
    mean_target_by_value = (
        train_df[[feature]]
        .assign(target=target)
        .groupby(feature)['target']
        .mean()
        .reset_index(name=f'{feature}_mean_target')
    )
    return mean_target_by_value

In [None]:
%%time

# Load the competition logs, build the base feature tables with dev_feats and
# extend them with essay-level word / sentence / paragraph statistics plus the
# keystroke-rate and product-to-keys ratios. Results: df_train_SB, df_test_SB.
print('< Read Data >')
data_path = '/kaggle/input/linking-writing-processes-to-writing-quality/'

# Train: the logs are scanned with pl.scan_csv, passed through dev_feats, and
# both the features and the logs are then collected and converted to pandas.
train_logs = pl.scan_csv(data_path + 'train_logs.csv')
train_feats = dev_feats(train_logs)
train_feats = train_feats.collect().to_pandas()
train_scores = pd.read_csv(data_path + 'train_scores.csv')
# NOTE(review): the row with index label 850340 is dropped from the training
# logs only; the reason is not recorded here -- confirm it is still required.
train_logs = train_logs.collect().to_pandas().drop([850340],axis=0)

# Test: same steps as for train, without scores and without the row drop.
test_logs = pl.scan_csv(data_path + 'test_logs.csv')
test_feats = dev_feats(test_logs)
test_feats = test_feats.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()

# Each helper returns one row per id; left merges keep every id of the base table.
print('< Train Features >')
train_essays = get_essay_df(train_logs)
train_feats = train_feats.merge(word_feats(train_essays), on='id', how='left')
train_feats = train_feats.merge(sent_feats(train_essays), on='id', how='left')
train_feats = train_feats.merge(parag_feats(train_essays), on='id', how='left')
train_feats = train_feats.merge(get_keys_pressed_per_second(train_logs), on='id', how='left')
df_train_SB = train_feats.merge(product_to_keys(train_logs, train_essays), on='id', how='left')

print('< Test Features >')
test_essays = get_essay_df(test_logs)
test_feats = test_feats.merge(word_feats(test_essays), on='id', how='left')
test_feats = test_feats.merge(sent_feats(test_essays), on='id', how='left')
test_feats = test_feats.merge(parag_feats(test_essays), on='id', how='left')
test_feats = test_feats.merge(get_keys_pressed_per_second(test_logs), on='id', how='left')
df_test_SB = test_feats.merge(product_to_keys(test_logs, test_essays), on='id', how='left')

print("SB Process Complete")

# Show the full feature tables (rich display of both frames).
display(df_train_SB)
display(df_test_SB)

In [None]:
import re, math, numpy as np, pandas as pd

def enrich_full_text_features_parallel(df, show_preview=True):
    """
    Extract linguistic, structural and punctuation features for every essay.

    Each value of ``df["essay_text"]`` is converted with ``str`` and passed
    through a single per-essay extractor. The extracted columns are appended to
    a copy of ``df`` whose index has been reset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``essay_text``. The input frame is not modified.
    show_preview : bool, default True
        When true, print a completion message and ``display`` four of the new
        columns for the first two rows.

    Returns
    -------
    pandas.DataFrame
        The reset-index copy of ``df`` followed by 80 feature columns. The label
        ``num_paragraphs`` occurs twice (79 distinct labels); it is kept that
        way so the returned shape is unchanged, and callers are expected to
        de-duplicate the columns.

    Notes
    -----
    Requires: swifter (optional, auto-fallback to ``Series.apply`` if it cannot
    be imported).

    Quantities that are undefined for an essay (no sentences, no colon
    sentences, no heavily punctuated sentences, ...) are reported as 0.
    """

    # ---------- Safe import of swifter ----------
    try:
        import swifter
        use_swifter = True
        print("‚ö° Using swifter for parallel processing")
    except ImportError:
        use_swifter = False
        print("‚ÑπÔ∏è swifter not installed ‚Äî using normal .apply() (slower)")

    df = df.copy()

    # Typographic punctuation is spelled with \u escapes so the literals cannot
    # be corrupted by a mis-decoded source file: the previous literals were
    # multi-character mojibake sequences that never occur in an essay, so curly
    # quotes and en/em dashes were silently never counted.
    SINGLE_QUOTES = ["'", "\u2018", "\u2019", "\u201a", "\u201b"]
    DOUBLE_QUOTES = ['"', "\u201c", "\u201d", "\u201e", "\u201f"]
    DASHES = ["-", "\u2013", "\u2014"]

    # ---------- Inner per-essay feature extractor ----------
    def _extract_features(text: str):
        """Return the feature values of one essay as a positional pandas Series."""
        text = str(text)
        sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
        n_sent = len(sentences)
        # Paragraphs are separated by at least one blank line.
        paragraphs = [p.strip() for p in re.split(r"(?:\r?\n\s*\r?\n)+", text.strip()) if p.strip()]
        n_par = len(paragraphs)
        words = re.findall(r"\b[a-zA-Z]+\b", text)
        n_words = len(words)

        per100_tokens = lambda n: (n / n_words * 100.0) if n_words > 0 else 0.0
        per100_sents  = lambda n: (n / n_sent * 100.0) if n_sent > 0 else 0.0
        tok_count = lambda s: len(re.findall(r"\b[a-zA-Z]+\b", s))
        internal_punct_count = lambda s: s.count(",") + s.count(";") + s.count(":")

        # ---------- Basic counts ----------
        num_words, num_sentences, num_paragraphs = n_words, n_sent, n_par

        if n_sent:
            lengths = [tok_count(s) for s in sentences]
            mean_sentence_len = np.mean(lengths)
            std_sentence_len  = np.std(lengths)
            cv_sentence_len   = std_sentence_len / mean_sentence_len if mean_sentence_len > 0 else 0
            short_sent_share  = np.mean(np.array(lengths) <= 5)
            long_sent_share   = np.mean(np.array(lengths) >= 20)
        else:
            mean_sentence_len = std_sentence_len = cv_sentence_len = short_sent_share = long_sent_share = 0.0

        # ---------- Paragraph structure ----------
        if n_par:
            sent_per_para = [len([s for s in re.split(r"[.!?]+", p) if s.strip()]) for p in paragraphs]
            word_counts = [tok_count(p) for p in paragraphs]
            avg_sent_per_para = np.mean(sent_per_para)
            var_sent_per_para = np.var(sent_per_para)
            intro_para_len = word_counts[0]
            body_para_mean_len = np.mean(word_counts[1:-1]) if n_par > 2 else 0
            conclusion_para_len = word_counts[-1] if n_par > 1 else 0
        else:
            avg_sent_per_para = var_sent_per_para = intro_para_len = body_para_mean_len = conclusion_para_len = 0.0

        # ---------- Comma density ----------
        num_commas = text.count(",")
        commas_per_sentence = num_commas / n_sent if n_sent else 0
        commas_per_100_words = per100_tokens(num_commas)
        multi_clause_sent_share = np.mean([s.count(",") >= 2 for s in sentences]) if n_sent else 0

        # ---------- Semicolon / colon ----------
        num_semis, num_colons = text.count(";"), text.count(":")
        semicolons_per_100_tokens = per100_tokens(num_semis)
        colons_per_100_tokens = per100_tokens(num_colons)
        share_sents_with_semicolon = (sum(";" in s for s in sentences) / n_sent) if n_sent else 0
        share_sents_with_colon = (sum(":" in s for s in sentences) / n_sent) if n_sent else 0

        # ---------- Parentheses / quotes / dashes ----------
        left_paren, right_paren = text.count("("), text.count(")")
        parentheses = left_paren + right_paren
        single_q = sum(text.count(ch) for ch in SINGLE_QUOTES)
        double_q = sum(text.count(ch) for ch in DOUBLE_QUOTES)
        dashes = sum(text.count(ch) for ch in DASHES)
        counts = [parentheses, single_q, double_q, dashes]
        total = sum(counts)
        if total:
            # Shannon entropy (base 2) over the four punctuation families,
            # normalised by the maximum log2(4).
            p = [c / total for c in counts if c > 0]
            H = -sum(pi * math.log(pi, 2) for pi in p)
            H_norm = H / math.log(4, 2)
        else:
            H = H_norm = 0.0

        # ---------- Mechanics consistency ----------
        unmatched_parens_open = max(0, left_paren - right_paren)
        unmatched_parens_close = max(0, right_paren - left_paren)
        mismatched_parens_total = unmatched_parens_open + unmatched_parens_close

        # Apostrophes between two word characters are not quotation marks;
        # remove them before checking quote pairing.
        text_no_apos = re.sub("(?<=\\w)['\u2019](?=\\w)", "", text)
        straight_single = text_no_apos.count("'")
        straight_double = text_no_apos.count('"')
        unmatched_straight_single = straight_single % 2
        unmatched_straight_double = straight_double % 2
        left_single = text_no_apos.count("\u2018")
        right_single = text_no_apos.count("\u2019")
        left_double = text_no_apos.count("\u201c")
        right_double = text_no_apos.count("\u201d")
        mismatched_curly_single = abs(left_single - right_single)
        mismatched_curly_double = abs(left_double - right_double)
        mismatched_quotes_total = unmatched_straight_single + unmatched_straight_double + mismatched_curly_single + mismatched_curly_double

        def count_repeats(ch):
            # The token is escaped here, so callers pass the raw character.
            # Grouping makes the quantifier apply to the whole token.
            return len(re.findall("(?:" + re.escape(ch) + "){2,}", text))
        repeated_commas = count_repeats(",")
        repeated_periods = len(re.findall(r"\.{2,}", text))
        repeated_semis = count_repeats(";")
        repeated_colons = count_repeats(":")
        # Pass a bare "?": a pre-escaped r"\?" would be escaped again and only
        # match a literal backslash followed by question marks.
        repeated_qmarks = count_repeats("?")
        repeated_exclaims = count_repeats("!")
        repeated_dashes = sum(count_repeats(ch) for ch in DASHES)
        repeated_punct_sequences_total = (
            repeated_commas + repeated_periods + repeated_semis + repeated_colons +
            repeated_qmarks + repeated_exclaims + repeated_dashes
        )
        repeated_punct_sequences_per_100_tokens = per100_tokens(repeated_punct_sequences_total)
        spaces_before_comma = len(re.findall(r"\s+,", text))
        spaces_before_punct_total = len(re.findall(r"\s+[,\.;:\?\!)]", text))
        spaces_before_punct_per_100_tokens = per100_tokens(spaces_before_punct_total)
        double_spaces_after_eos = len(re.findall(r"[.!?]\s{2,}", text))
        double_spaces_after_eos_per_100_sentences = per100_sents(double_spaces_after_eos)

        # ---------- Multi-clause proxy ----------
        if n_sent:
            counts_int = [internal_punct_count(s) for s in sentences]
            multi_clause_proxy_share = np.mean(np.array(counts_int) >= 2)
            any_internal_punct_share = np.mean(np.array(counts_int) >= 1)
            avg_internal_punct_per_sentence = np.mean(counts_int)
        else:
            multi_clause_proxy_share = any_internal_punct_share = avg_internal_punct_per_sentence = 0.0

        # ---------- Rhythm variety ----------
        if n_sent:
            sent_lengths = np.array([tok_count(s) for s in sentences], dtype=float)
            mean_len = sent_lengths.mean()
            std_len  = sent_lengths.std(ddof=0)
            cv_global = std_len / mean_len if mean_len > 0 else 0
            # Coefficient of variation over a sliding window of WINDOW sentences;
            # essays shorter than the window fall back to the global value.
            WINDOW = 5
            if n_sent < WINDOW:
                cvs = [cv_global]
            else:
                cvs = [(sent_lengths[i:i+WINDOW].std(ddof=0) /
                        sent_lengths[i:i+WINDOW].mean()) if sent_lengths[i:i+WINDOW].mean() > 0 else 0
                        for i in range(n_sent - WINDOW + 1)]
            cvs = np.array(cvs)
            cv_mw_mean   = cvs.mean()   if cvs.size else 0
            cv_mw_median = np.median(cvs) if cvs.size else 0
            cv_mw_max    = cvs.max()    if cvs.size else 0
            cv_mw_std    = cvs.std(ddof=0) if cvs.size else 0
        else:
            mean_len = std_len = cv_global = cv_mw_mean = cv_mw_median = cv_mw_max = cv_mw_std = 0.0

        # ---------- Local continuity / segmentation ----------
        para_sents = [[s.strip() for s in re.split(r"[.!?]+", p) if s.strip()] for p in paragraphs]
        if n_par == 0:
            single_sentence_paragraph_ratio = bridge_sentence_share = bridge_sentences_per_100_sentences = \
            heavy_internal_punct_sentence_share = heavy_at_paragraph_edges_share = heavy_sentence_mean_normalized_position = \
            semicolon_sentence_share = semicolon_at_paragraph_edges_share = colon_sentence_share = colon_at_paragraph_edges_share = 0.0
        else:
            single_sentence_paragraph_ratio = sum(len(ps) == 1 for ps in para_sents) / n_par
            all_sents = [s for ps in para_sents for s in ps]
            n_sent_total = len(all_sents)
            sent_lengths_all = [tok_count(s) for s in all_sents]
            bridge_flags = np.array(sent_lengths_all) <= 5
            bridge_sentence_share = bridge_flags.mean() if bridge_flags.size else 0
            bridge_sentences_per_100_sentences = bridge_sentence_share * 100
            heavy_flags = [internal_punct_count(s) >= 2 or ";" in s or ":" in s for s in all_sents]
            heavy_internal_punct_sentence_share = np.mean(heavy_flags) if n_sent_total else 0
            # (paragraph index, position inside the paragraph, paragraph size) per sentence.
            sent_meta = [(p_idx, i, len(ps)) for p_idx, ps in enumerate(para_sents) for i, _ in enumerate(ps)]
            heavy_idx = [i for i, h in enumerate(heavy_flags) if h]
            heavy_edges = sum(1 for gi in heavy_idx if sent_meta[gi][1] in (0, sent_meta[gi][2]-1))
            heavy_at_paragraph_edges_share = heavy_edges / len(heavy_idx) if heavy_idx else 0
            # Averaging an empty list would give NaN (plus a RuntimeWarning), so
            # essays without any heavy sentence report 0 like the other shares.
            heavy_positions = [i/(n_sent_total-1) for i in heavy_idx] if n_sent_total > 1 else []
            heavy_sentence_mean_normalized_position = np.mean(heavy_positions) if heavy_positions else 0
            semi_flags = [";" in s for s in all_sents]
            colon_flags = [":" in s for s in all_sents]
            def edge_share(mask):
                # Share of flagged sentences that are first or last in their paragraph.
                idxs = [i for i,f in enumerate(mask) if f]
                return sum(1 for gi in idxs if sent_meta[gi][1] in (0, sent_meta[gi][2]-1)) / len(idxs) if idxs else 0
            semicolon_sentence_share = np.mean(semi_flags) if n_sent_total else 0
            semicolon_at_paragraph_edges_share = edge_share(semi_flags)
            colon_sentence_share = np.mean(colon_flags) if n_sent_total else 0
            colon_at_paragraph_edges_share = edge_share(colon_flags)

        # ---------- List / explanation patterns ----------
        colon_sents = [s for s in sentences if ":" in s]
        n_colon = len(colon_sents)
        if n_sent == 0:
            colon_sentence_share2 = list_like_all = list_like_among = semi_tail_share = avg_trailing = \
            items_mean = items_median = items_max = items_ge3 = 0.0
        else:
            colon_sentence_share2 = n_colon / n_sent
            list_like_flags, semi_tail_flags, trailing_counts, items_counts = [], [], [], []
            for s in colon_sents:
                # Only the part after the first colon is inspected.
                _, tail = s.split(":", 1)
                commas, semis = tail.count(","), tail.count(";")
                total_internal = commas + semis
                trailing_counts.append(total_internal)
                semi_tail_flags.append(semis > 0)
                list_like_flags.append(total_internal >= 2)
                segments = [seg.strip() for seg in re.split(r"[;,]", tail)]
                items = [seg for seg in segments if re.search(r"\b[a-zA-Z]+\b", seg)]
                items_counts.append(len(items))
            # Share among ALL sentences: the flags only cover colon sentences, so
            # divide their count by n_sent (a plain mean would duplicate
            # list_like_among and be NaN when there is no colon sentence).
            list_like_all   = sum(list_like_flags) / n_sent
            list_like_among = np.mean(list_like_flags) if n_colon else 0
            semi_tail_share = np.mean(semi_tail_flags) if n_colon else 0
            avg_trailing = np.mean(trailing_counts) if trailing_counts else 0
            items_mean   = np.mean(items_counts) if items_counts else 0
            items_median = np.median(items_counts) if items_counts else 0
            items_max    = np.max(items_counts) if items_counts else 0
            items_ge3    = np.mean(np.array(items_counts) >= 3) if items_counts else 0

        # Order must match `cols` below exactly.
        return pd.Series([
            num_words,num_sentences,num_paragraphs,
            mean_sentence_len,std_sentence_len,cv_sentence_len,short_sent_share,long_sent_share,
            avg_sent_per_para,var_sent_per_para,intro_para_len,body_para_mean_len,conclusion_para_len,
            commas_per_sentence,commas_per_100_words,multi_clause_sent_share,
            semicolons_per_100_tokens,colons_per_100_tokens,share_sents_with_semicolon,share_sents_with_colon,
            parentheses,left_paren,right_paren,single_q,double_q,dashes,H,H_norm,
            unmatched_parens_open,unmatched_parens_close,mismatched_parens_total,
            unmatched_straight_single,unmatched_straight_double,mismatched_curly_single,mismatched_curly_double,mismatched_quotes_total,
            repeated_commas,repeated_periods,repeated_semis,repeated_colons,repeated_qmarks,repeated_exclaims,repeated_dashes,
            repeated_punct_sequences_total,repeated_punct_sequences_per_100_tokens,
            spaces_before_comma,spaces_before_punct_total,spaces_before_punct_per_100_tokens,
            double_spaces_after_eos,double_spaces_after_eos_per_100_sentences,
            multi_clause_proxy_share,any_internal_punct_share,avg_internal_punct_per_sentence,
            mean_len,std_len,cv_global,cv_mw_mean,cv_mw_median,cv_mw_max,cv_mw_std,
            num_paragraphs,single_sentence_paragraph_ratio,bridge_sentence_share,bridge_sentences_per_100_sentences,
            heavy_internal_punct_sentence_share,heavy_at_paragraph_edges_share,heavy_sentence_mean_normalized_position,
            semicolon_sentence_share,semicolon_at_paragraph_edges_share,colon_sentence_share,colon_at_paragraph_edges_share,
            colon_sentence_share2,list_like_all,list_like_among,semi_tail_share,avg_trailing,
            items_mean,items_median,items_max,items_ge3
        ])

    # ---------- Column names ----------
    # NOTE: 'num_paragraphs' is listed twice (same value both times); kept so the
    # returned shape stays as callers expect.
    cols = [
        'num_words','num_sentences','num_paragraphs',
        'mean_sentence_len','std_sentence_len','cv_sentence_len','short_sent_share','long_sent_share',
        'avg_sent_per_para','var_sent_per_para','intro_para_len','body_para_mean_len','conclusion_para_len',
        'commas_per_sentence','commas_per_100_words','multi_clause_sent_share',
        'semicolons_per_100_tokens','colons_per_100_tokens','share_sents_with_semicolon','share_sents_with_colon',
        'parentheses_count','left_parentheses_count','right_parentheses_count','single_quotes_count','double_quotes_count','dashes_count','punct_diversity_shannon','punct_diversity_shannon_norm',
        'unmatched_parens_open','unmatched_parens_close','mismatched_parens_total',
        'unmatched_quotes_straight_single','unmatched_quotes_straight_double','mismatched_quotes_curly_single','mismatched_quotes_curly_double','mismatched_quotes_total',
        'repeated_commas_seq','repeated_periods_seq','repeated_semicolons_seq','repeated_colons_seq','repeated_qmarks_seq','repeated_exclaims_seq','repeated_dashes_seq',
        'repeated_punct_sequences_total','repeated_punct_sequences_per_100_tokens',
        'spaces_before_comma','spaces_before_punct_total','spaces_before_punct_per_100_tokens',
        'double_spaces_after_eos','double_spaces_after_eos_per_100_sentences',
        'multi_clause_proxy_share','any_internal_punct_share','avg_internal_punct_per_sentence',
        'sent_len_tokens_mean','sent_len_tokens_std','sent_len_tokens_cv_global',
        'sent_len_tokens_cv_mw_mean','sent_len_tokens_cv_mw_median','sent_len_tokens_cv_mw_max','sent_len_tokens_cv_mw_std',
        'num_paragraphs','single_sentence_paragraph_ratio','bridge_sentence_share','bridge_sentences_per_100_sentences',
        'heavy_internal_punct_sentence_share','heavy_at_paragraph_edges_share','heavy_sentence_mean_normalized_position',
        'semicolon_sentence_share','semicolon_at_paragraph_edges_share','colon_sentence_share','colon_at_paragraph_edges_share',
        'colon_sentence_share_2','list_like_colon_sentence_share_all','list_like_colon_sentence_share_among_colon',
        'semicolon_in_tail_share_among_colon','avg_trailing_commas_semis_per_colon_sent',
        'items_after_colon_mean','items_after_colon_median','items_after_colon_max','items_ge3_share_among_colon'
    ]

    # ---------- Parallel apply ----------
    if use_swifter:
        feature_df = df["essay_text"].swifter.progress_bar(True).apply(_extract_features)
    else:
        feature_df = df["essay_text"].apply(_extract_features)

    feature_df.columns = cols
    # Reset the index so the positional feature rows line up with the essays.
    df_out = pd.concat([df.reset_index(drop=True), feature_df], axis=1)

    if show_preview:
        print(f"‚úÖ Feature enrichment complete: {len(cols)} new columns added for {len(df_out)} essays.")
        display(df_out.head(2)[['num_words','mean_sentence_len','multi_clause_proxy_share','items_after_colon_mean']])

    return df_out

## Derivation of Features From Research (Function)

In [None]:
# ==========================================================
# FULL ESSAY RECONSTRUCTION + FEATURE ENRICHMENT PIPELINE (Train)
# ==========================================================
# Produces df_train_recon_logs: one row per id with the reconstructed essay
# text and the features from enrich_full_text_features_parallel.

# Step 0: Start from original logs
df_train_logs_copy = df_train_logs.copy()
print("üìò Step 0: Original df_train_logs shape:", df_train_logs_copy.shape)

# Step 1: Essay reconstruction
df_train_recon_logs_raw = getEssays(df_train_logs_copy.copy()).to_frame(name='essay_text')
df_train_recon_logs_raw.index.name = 'id'
df_train_recon_logs_raw.reset_index(inplace=True)  # ensure 'id' is a proper column
print("‚úÖ Step 1: Essays reconstructed ‚Äî shape:", df_train_recon_logs_raw.shape)

# ==========================================================
# Step 2: Unified feature enrichment (parallelized)
# ==========================================================
print("‚öôÔ∏è Step 2: Extracting full linguistic + structural + mechanics features (parallelized)...")
df_train_recon_logs = enrich_full_text_features_parallel(df_train_recon_logs_raw.copy(), show_preview=False)
print("‚úÖ Step 2: Feature enrichment complete ‚Äî shape:", df_train_recon_logs.shape)

# ==========================================================
# Step 3: Check for duplicate columns
# ==========================================================
# duplicated() flags every occurrence of a label except the first one.
dupes = df_train_recon_logs.columns[df_train_recon_logs.columns.duplicated()]

if len(dupes) > 0:
    from collections import Counter
    dupe_counts = Counter(dupes)
    print(f"\n‚ö†Ô∏è Found {len(dupe_counts)} duplicate column names:")
    for name, count in list(dupe_counts.items())[:15]:
        # `count` only covers the extra occurrences, so the label appears
        # count + 1 times in total.
        print(f"   üß© {name} ‚Üí appears {count + 1} times")
    if len(dupe_counts) > 15:
        print("   ... (truncated)")

    # Drop duplicates (keep first occurrence)
    before = df_train_recon_logs.shape[1]
    df_train_recon_logs = df_train_recon_logs.loc[:, ~df_train_recon_logs.columns.duplicated()]
    after = df_train_recon_logs.shape[1]
    print(f"üßπ Removed {before - after} duplicate columns. Final shape: {df_train_recon_logs.shape}")
else:
    print("\n‚úÖ No duplicate columns detected in df_train_recon_logs.")

# ==========================================================
# SUMMARY
# ==========================================================
print("\nüéØ Pipeline complete! Final dataset ‚Üí df_train_recon_logs")
print(f"üß© Step 0: df_train_logs_copy shape: {df_train_logs_copy.shape}")
print(f"üß© Step 1: df_train_recon_logs_raw shape: {df_train_recon_logs_raw.shape}")
print(f"üß© Step 2: df_train_recon_logs (final) shape: {df_train_recon_logs.shape}")

# Sanity check
print("üß† Total essays:", df_train_recon_logs.shape[0])
print("üß© Total new features:", df_train_recon_logs.shape[1] - 2)  # exclude id + essay_text

# Optional preview: id, essay text and the first ten feature columns
display(df_train_recon_logs.head(2)[['id', 'essay_text'] + df_train_recon_logs.columns[2:12].tolist()])

# (Optional) Save for reuse
# df_train_recon_logs.to_csv("/kaggle/working/df_train_recon_logs.csv", index=False)
# print("üíæ Saved df_train_recon_logs.csv")

In [None]:
# ==========================================================
# FULL ESSAY RECONSTRUCTION + FEATURE ENRICHMENT PIPELINE (Test)
# ==========================================================
# Produces df_test_recon_logs: one row per id with the reconstructed essay
# text and the features from enrich_full_text_features_parallel.

# Step 0: Start from original logs
df_test_logs_copy = df_test_logs.copy()
print("üìò Step 0: Original df_test_logs shape:", df_test_logs_copy.shape)

# Step 1: Essay reconstruction
df_test_recon_logs_raw = getEssays(df_test_logs_copy.copy()).to_frame(name='essay_text')
df_test_recon_logs_raw.index.name = 'id'
df_test_recon_logs_raw.reset_index(inplace=True)  # ensure 'id' is a proper column
print("‚úÖ Step 1: Essays reconstructed ‚Äî shape:", df_test_recon_logs_raw.shape)

# ==========================================================
# Step 2: Unified feature enrichment (parallelized)
# ==========================================================
print("‚öôÔ∏è Step 2: Extracting full linguistic + structural + mechanics features (parallelized)...")
df_test_recon_logs = enrich_full_text_features_parallel(df_test_recon_logs_raw.copy(), show_preview=False)
print("‚úÖ Step 2: Feature enrichment complete ‚Äî shape:", df_test_recon_logs.shape)

# ==========================================================
# Step 3: Check for duplicate columns
# ==========================================================
# duplicated() flags every occurrence of a label except the first one.
dupes = df_test_recon_logs.columns[df_test_recon_logs.columns.duplicated()]

if len(dupes) > 0:
    from collections import Counter
    dupe_counts = Counter(dupes)
    print(f"\n‚ö†Ô∏è Found {len(dupe_counts)} duplicate column names:")
    for name, count in list(dupe_counts.items())[:15]:
        # `count` only covers the extra occurrences, so the label appears
        # count + 1 times in total.
        print(f"   üß© {name} ‚Üí appears {count + 1} times")
    if len(dupe_counts) > 15:
        print("   ... (truncated)")

    # Drop duplicates (keep first occurrence)
    before = df_test_recon_logs.shape[1]
    df_test_recon_logs = df_test_recon_logs.loc[:, ~df_test_recon_logs.columns.duplicated()]
    after = df_test_recon_logs.shape[1]
    print(f"üßπ Removed {before - after} duplicate columns. Final shape: {df_test_recon_logs.shape}")
else:
    print("\n‚úÖ No duplicate columns detected in df_test_recon_logs.")

# ==========================================================
# SUMMARY
# ==========================================================
print("\nüéØ Pipeline complete! Final dataset ‚Üí df_test_recon_logs")
print(f"üß© Step 0: df_test_logs_copy shape: {df_test_logs_copy.shape}")
print(f"üß© Step 1: df_test_recon_logs_raw shape: {df_test_recon_logs_raw.shape}")
print(f"üß© Step 2: df_test_recon_logs (final) shape: {df_test_recon_logs.shape}")

# Sanity check
print("üß† Total essays:", df_test_recon_logs.shape[0])
print("üß© Total new features:", df_test_recon_logs.shape[1] - 2)  # exclude id + essay_text

# Optional preview: id, essay text and the first ten feature columns
display(df_test_recon_logs.head(2)[['id', 'essay_text'] + df_test_recon_logs.columns[2:12].tolist()])

# (Optional) Save for reuse
# df_test_recon_logs.to_csv("/kaggle/working/df_test_recon_logs.csv", index=False)
# print("üíæ Saved df_test_recon_logs.csv")

## Apply Raw Tokenization

In [None]:
def reconstruct_essay(currTextInput):
    """Rebuild a text by replaying logged edit events.

    ``currTextInput.values`` is iterated row by row; every row is read by
    position as (activity, cursor position, text change) and applied to a
    string that starts out empty. The final string is returned.

    NOTE: a function with this name is already defined in an earlier cell of
    this notebook; this definition replaces it once the cell has run.
    """
    def _insert_ending_at(text, end, piece):
        # Place `piece` so that its last character ends at offset `end`.
        begin = end - len(piece)
        return text[:begin] + piece + text[begin:]

    essayText = ""
    for Input in currTextInput.values:
        kind = Input[0]

        if kind == 'Replace':
            # "<old> => <new>": <new> takes the place of <old>, cursor ends after <new>.
            old_and_new = Input[2].split(' => ')
            new_txt = old_and_new[1]
            old_len = len(old_and_new[0])
            begin = Input[1] - len(new_txt)
            essayText = essayText[:begin] + new_txt + essayText[begin + old_len:]
            continue

        if kind == 'Paste':
            essayText = _insert_ending_at(essayText, Input[1], Input[2])
            continue

        if kind == 'Remove/Cut':
            keep_until = Input[1]
            essayText = essayText[:keep_until] + essayText[keep_until + len(Input[2]):]
            continue

        if "M" in kind:
            # The label reads "[a, b] To [c, d]" once its first 10 characters are cut off.
            source_part, target_part = kind[10:].split(' To ')[:2]
            source_fields = source_part.split(', ')
            target_fields = target_part.split(', ')
            a, b = int(source_fields[0][1:]), int(source_fields[1][:-1])
            c, d = int(target_fields[0][1:]), int(target_fields[1][:-1])
            if a < c:
                essayText = essayText[:a] + essayText[b:d] + essayText[a:b] + essayText[d:]
            elif a > c:
                essayText = essayText[:c] + essayText[a:b] + essayText[c:a] + essayText[b:]
            continue

        # Every other activity inserts the text change ending at the cursor.
        essayText = _insert_ending_at(essayText, Input[1], Input[2])
    return essayText

def get_essay_df(df):
    """Return a frame with the columns ``id`` and ``essay``.

    Rows whose ``activity`` equals 'Nonproduction' are ignored. The remaining
    rows are grouped by ``id`` and each group's ``activity``,
    ``cursor_position`` and ``text_change`` columns are handed to
    ``reconstruct_essay``; the grouped result is then turned into a frame with
    ``reset_index(name='essay')``.
    """
    is_production = df['activity'] != 'Nonproduction'
    per_id = df[is_production].groupby('id')
    essays = per_id.apply(
        lambda events: reconstruct_essay(events[['activity', 'cursor_position', 'text_change']])
    )
    return essays.reset_index(name='essay')

## NOTE: Simplified logic for get_essay_df

In [None]:
%%time

# Reconstruct one essay per id from the train and test logs
# (df_train_logs / df_test_logs are defined in earlier cells).
df_train_essays = get_essay_df(df_train_logs)
df_test_essays = get_essay_df(df_test_logs)

display(df_train_essays)
display(df_test_essays)

# Resulting columns: id and essay

In [None]:
%%time

# Character n-gram counts of the reconstructed essays.
# NOTE(review): CountVectorizer must already be imported by an earlier cell; in
# this part of the notebook its import only appears in a later cell -- confirm.
df_train_to_tokenise = df_train_essays[['id', 'essay']].copy()
df_test_to_tokenise = df_test_essays[['id', 'essay']].copy()

# Step 1: Tokenize the text using CountVectorizer
# (character n-grams of length 1 to 4, analyzer 'char_wb')
count_vectorizer = CountVectorizer(ngram_range=(1, 4), analyzer='char_wb')

# Transform essays into frequency vectors: the vocabulary is fitted on the
# train essays only and reused for test; both results are densified.
X_tokenizer_train = count_vectorizer.fit_transform(df_train_to_tokenise['essay']).todense()
X_tokenizer_test = count_vectorizer.transform(df_test_to_tokenise['essay']).todense()

In [None]:
# Step 1: Wrap the (already densified) count matrices in DataFrames, labelled
# with the vectorizer's feature names
df_train_tokenised = pd.DataFrame(X_tokenizer_train, columns=count_vectorizer.get_feature_names_out())
df_test_tokenised = pd.DataFrame(X_tokenizer_test, columns=count_vectorizer.get_feature_names_out())

# Step 2: Rename columns to "feature 0", "feature 1", "feature 2", etc.
# (this replaces the n-gram labels assigned in step 1)
df_train_tokenised.columns = [f"feature {i}" for i in range(df_train_tokenised.shape[1])]
df_test_tokenised.columns = [f"feature {i}" for i in range(df_test_tokenised.shape[1])]

# Step 3: Add the 'id' column to the training and test DataFrames at the start
# (the ids are taken from the essay frames the count matrices were built from)
df_train_tokenised.insert(0, 'id', df_train_essays['id'])
df_test_tokenised.insert(0, 'id', df_test_essays['id'])

# Display the results
display(df_train_tokenised)
display(df_test_tokenised)

## Apply Latent Dirichlet Allocation (LDA)

In [None]:
# Start the topic-feature tables with just the ids; the following cells add the
# LatentDirichletAllocation topic columns to them.
df_train_LDA = df_train_essays[['id']].copy()
df_test_LDA = df_test_essays[['id']].copy()

In [None]:
# Show the id-only tables before any topic columns are attached.
display(df_train_LDA)
display(df_test_LDA)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Word-level bag of words with English stop words removed. The vocabulary is
# learned from the train and test essays together.
df = pd.concat([df_train_essays, df_test_essays])
CouVec = CountVectorizer(stop_words='english').fit(df['essay'])
train_words = pd.DataFrame(CouVec.transform(df_train_essays['essay']).toarray())
test_words = pd.DataFrame(CouVec.transform(df_test_essays['essay']).toarray())

# Fit a 6-topic model on the stacked train+test counts, then store every
# essay's topic distribution in the columns Topic_0 ... Topic_5.
n_clusters = 6
Topics = [f'Topic_{x}' for x in range(n_clusters)]
LDA = LatentDirichletAllocation(
    n_components=n_clusters,
    max_iter=10,
    random_state=42,
    verbose=True,
).fit(pd.concat([train_words, test_words]))
df_train_LDA[Topics] = LDA.transform(train_words)
df_test_LDA[Topics] = LDA.transform(test_words)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Character-level counts taken inside word boundaries (analyzer='char_wb',
# vectorizer's default n-gram range). Vocabulary comes from train + test essays.
df = pd.concat([df_train_essays, df_test_essays])
CouVec = CountVectorizer(analyzer='char_wb').fit(df['essay'])
train_words = pd.DataFrame(CouVec.transform(df_train_essays['essay']).toarray())
test_words = pd.DataFrame(CouVec.transform(df_test_essays['essay']).toarray())

# Second 6-topic model; its outputs land in _Topic_0 ... _Topic_5 (single
# leading underscore keeps them apart from the word-level topic columns).
n_clusters = 6
Topics = [f'_Topic_{x}' for x in range(n_clusters)]
LDA = LatentDirichletAllocation(
    n_components=n_clusters,
    max_iter=10,
    random_state=42,
    verbose=True,
).fit(pd.concat([train_words, test_words]))
df_train_LDA[Topics] = LDA.transform(train_words)
df_test_LDA[Topics] = LDA.transform(test_words)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Character 5- and 6-grams taken inside word boundaries. The vocabulary is
# learned from the train and test essays together.
df = pd.concat([df_train_essays, df_test_essays])
CouVec = CountVectorizer(analyzer='char_wb', ngram_range=(5, 6)).fit(df['essay'])
train_words = pd.DataFrame(CouVec.transform(df_train_essays['essay']).toarray())
test_words = pd.DataFrame(CouVec.transform(df_test_essays['essay']).toarray())

# Third 6-topic model; its outputs land in __Topic_0 ... __Topic_5 (double
# leading underscore distinguishes them from the two earlier topic sets).
n_clusters = 6
Topics = [f'__Topic_{x}' for x in range(n_clusters)]
LDA = LatentDirichletAllocation(
    n_components=n_clusters,
    max_iter=10,
    random_state=42,
    verbose=True,
).fit(pd.concat([train_words, test_words]))
df_train_LDA[Topics] = LDA.transform(train_words)
df_test_LDA[Topics] = LDA.transform(test_words)

## Obtain DeBERTa Embeddings (Function)

In [None]:
import os, torch, numpy as np, pandas as pd
from transformers import AutoTokenizer, AutoModel
from math import ceil
from pathlib import Path

def add_deberta_embeddings(
    df,
    model_dir="/kaggle/input/deberta-v3-fast-tokenizer-copy/deb-v3",
    text_col="essay_text",
    id_col="id",
    max_len=256,
    batch_size=8,
    show_preview=True
):
    """
    Return a copy of ``df`` with mean-pooled transformer embeddings appended as
    columns ``deb_emb_0 ... deb_emb_{H-1}`` (H = hidden size of the model).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``id_col`` and ``text_col``.
    model_dir : str
        Local directory holding the tokenizer and model files (loaded with
        ``local_files_only=True``).
    text_col, id_col : str
        Names of the text column to embed and of the id column.
    max_len : int
        Token limit per essay; longer essays are truncated by the tokenizer.
    batch_size : int
        Number of essays encoded per forward pass.
    show_preview : bool
        If True, display the first 2 rows with the first 5 embedding columns.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with a fresh 0..n-1 index, original columns first,
        followed by the float32 embedding columns. Row order is preserved.

    Internal behavior:
      - Replaces all standalone 'q' tokens with 'i' *only for embedding computation*
      - Original text in df is NOT modified or returned altered
      - Embeddings are attached by row position, so repeated ids cannot
        multiply rows.
    """
    import re  # local import keeps the function self-contained

    assert {id_col, text_col}.issubset(df.columns), f"Missing {id_col} or {text_col}"
    df = df.copy()  # prevent in-place mutation

    # ---------------------
    # Setup: offline loading of tokenizer + model
    # ---------------------
    os.environ["HF_HUB_OFFLINE"] = "1"
    os.environ["TRANSFORMERS_OFFLINE"] = "1"

    print(f"\nüîß Loading DeBERTa model from: {model_dir}")
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    print("üíª Device:", DEVICE)

    tok = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
    model = AutoModel.from_pretrained(model_dir, local_files_only=True).to(DEVICE).eval()

    # ---------------------
    # Internal helpers
    # ---------------------
    standalone_q = re.compile(r'\bq\b')

    def preprocess_texts(texts):
        """
        Replace lowercase standalone 'q' with 'i' before embedding.
        This change is temporary and not persisted to the DataFrame.
        """
        return [standalone_q.sub('i', str(t)).strip() for t in texts]

    def masked_mean_pool(last_hidden_state, mask):
        """Average token vectors over the positions where ``mask`` is 1."""
        # Cast the integer attention mask to the activation dtype first: the
        # clamp below uses a fractional floor, which only makes sense (and is
        # only portable across torch versions) on a floating tensor.
        mask = mask.unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(1)
        count = mask.sum(1).clamp(min=1e-9)  # guard against division by zero
        return summed / count

    @torch.inference_mode()
    def embed_texts(texts):
        """Encode ``texts`` in batches and return an (n, H) float32 array."""
        if not texts:
            # np.vstack([]) would raise; hand back an empty matrix instead.
            return np.empty((0, model.config.hidden_size), dtype="float32")
        all_embs = []
        n = len(texts)
        for b in range(ceil(n / batch_size)):
            batch = texts[b * batch_size:(b + 1) * batch_size]
            enc = tok(batch, padding=True, truncation=True, max_length=max_len, return_tensors="pt").to(DEVICE)
            out = model(**enc)
            pooled = masked_mean_pool(out.last_hidden_state, enc["attention_mask"])
            all_embs.append(pooled.cpu().numpy())
        return np.vstack(all_embs).astype("float32")

    # ---------------------
    # Embed essays (using the temporarily cleaned text)
    # ---------------------
    texts_original = df[text_col].astype(str).tolist()
    texts_cleaned  = preprocess_texts(texts_original)  # temporary replacement

    print(f"\nüìù Embedding {len(texts_cleaned)} essays | max_len={max_len}, batch_size={batch_size}")

    X_emb = embed_texts(texts_cleaned)
    emb_df = pd.DataFrame(X_emb, columns=[f"deb_emb_{i}" for i in range(X_emb.shape[1])])

    # Row i of emb_df belongs to row i of df, so attach positionally. A merge
    # on id_col gives the same result for unique ids but duplicates rows when
    # an id occurs more than once.
    df_out = pd.concat([df.reset_index(drop=True), emb_df], axis=1)

    # ---------------------
    # Verification output
    # ---------------------
    print(f"‚úÖ Done! Added {X_emb.shape[1]} embedding columns.")
    print(f"üìä Output shape: {df_out.shape}")
    if show_preview:
        print("\nüîç Preview of first 2 rows and first 5 embedding dims:")
        display(df_out[[id_col, text_col] + [f"deb_emb_{i}" for i in range(5)]].head(2))

    return df_out

In [None]:
# ==========================================================
#  STEP: Add DeBERTa Embeddings to Reconstructed Essays
# ==========================================================

print("‚öôÔ∏è  Applying DeBERTa embeddings to training essays...")

# Embed a copy of the reconstructed-essay table (df_train_recon_logs).
df_train_recon_D_logs = add_deberta_embeddings(df_train_recon_logs.copy())

# Report the resulting shape and the first ten embedding column names.
print("\n‚úÖ Embedding process complete!")
print(f"üìä Final DataFrame shape: {df_train_recon_D_logs.shape}")
print("üß© Sample of new columns added:")
print(list(filter(lambda col: col.startswith("deb_emb_"), df_train_recon_D_logs.columns))[:10])

# Sanity check: the id column must be unchanged by the embedding step.
id_check = df_train_recon_D_logs["id"].equals(df_train_recon_logs["id"])
print(f"üîç ID alignment check passed? {id_check}")

# Preview the first two rows.
display(df_train_recon_D_logs.iloc[:2])

In [None]:
# ==========================================================
#  STEP: Add DeBERTa Embeddings to Reconstructed Essays (Test)
# ==========================================================

print("‚öôÔ∏è  Applying DeBERTa embeddings to test essays...")

# Embed a copy of the reconstructed-essay table (df_test_recon_logs).
df_test_recon_D_logs = add_deberta_embeddings(df_test_recon_logs.copy())

# Report the resulting shape and the first ten embedding column names.
print("\n‚úÖ Embedding process complete!")
print(f"üìä Final DataFrame shape: {df_test_recon_D_logs.shape}")
print("üß© Sample of new columns added:")
print(list(filter(lambda col: col.startswith("deb_emb_"), df_test_recon_D_logs.columns))[:10])

# Sanity check: the id column must be unchanged by the embedding step.
id_check = df_test_recon_D_logs["id"].equals(df_test_recon_logs["id"])
print(f"üîç ID alignment check passed? {id_check}")

# Preview the first two rows.
display(df_test_recon_D_logs.iloc[:2])

In [None]:
def merge_agg_and_deberta(df_agg, df_recon_D):
    """
    General-purpose merge of aggregated event-level logs and reconstructed essay
    datasets (with DeBERTa embeddings).

    The merge is a left join from ``df_agg`` validated as one-to-one. When both
    inputs carry an 'essay_text' column and the texts agree for every id, the
    join uses ['id', 'essay_text'] (so only one text column survives);
    otherwise it joins on 'id' alone.

    Parameters
    ----------
    df_agg : pd.DataFrame
        Aggregated logs DataFrame (e.g., df_train_agg_logs or df_test_agg_logs)
    df_recon_D : pd.DataFrame
        Reconstructed essay DataFrame with linguistic + DeBERTa embeddings
        (e.g., df_train_recon_D_logs or df_test_recon_D_logs)

    Returns
    -------
    pd.DataFrame
        Merged dataset (e.g., df_train_full or df_test_full)

    Raises
    ------
    KeyError
        If either input lacks an 'id' column.
    pandas.errors.MergeError
        If the merge keys are not unique on both sides (validate="1:1").
    """

    print("\n==========================================================")
    print("üöÄ FINAL MERGE: Aggregated Logs + Reconstructed DeBERTa Essays")
    print("==========================================================\n")

    # --- Make copies to avoid in-place modification ---
    df1 = df_agg.copy()
    df2 = df_recon_D.copy()

    # --- Step 0: Sanity check for 'id' column existence ---
    for name, frame in zip(["Aggregated logs", "Reconstructed + DeBERTa"], [df1, df2]):
        if "id" not in frame.columns:
            raise KeyError(f"‚ùå '{name}' missing 'id' column!")
        print(f"‚úÖ {name} shape: {frame.shape}")
    print()

    # ==========================================================
    # STEP 1: Check essay_text alignment (optional, if exists)
    # ==========================================================
    if "essay_text" in df1.columns and "essay_text" in df2.columns:
        # Pair the two text columns by id before comparing. Comparing two
        # id-indexed Series directly raises ValueError as soon as the frames
        # are ordered differently or do not contain exactly the same ids.
        paired = df1[["id", "essay_text"]].merge(
            df2[["id", "essay_text"]],
            on="id",
            how="left",
            suffixes=("_agg", "_recon"),
        )
        # ids missing from df2 yield NaN on the right and count as mismatches.
        mismatch_count = int((paired["essay_text_agg"] != paired["essay_text_recon"]).sum())
        if mismatch_count == 0:
            print("‚úÖ Essay text perfectly aligned ‚Äî using ['id', 'essay_text'] as merge keys.")
            join_cols = ["id", "essay_text"]
        else:
            print(f"‚ö†Ô∏è Essay text mismatch in {mismatch_count} rows ‚Äî using 'id' only.")
            join_cols = ["id"]
    else:
        print("‚öôÔ∏è Using 'id' as merge key (no essay_text overlap).")
        join_cols = ["id"]

    # ==========================================================
    # STEP 2: Perform merge
    # ==========================================================
    try:
        df_full = pd.merge(df1, df2, on=join_cols, how="left", validate="1:1")
        print(f"üìé Merge successful on {join_cols}. Shape: {df_full.shape}")
    except Exception as e:
        print(f"‚ùå Merge on {join_cols} failed: {e}")
        if join_cols == ["id"]:
            # Retrying the identical merge cannot succeed; surface the error.
            raise
        print("üîÅ Retrying merge on 'id' only...")
        df_full = pd.merge(df1, df2, on="id", how="left", validate="1:1")
        join_cols = ["id"]  # keep the final summary truthful
        print(f"‚úÖ Fallback merge succeeded. Shape: {df_full.shape}")

    # ==========================================================
    # STEP 3: Drop duplicate columns automatically
    # ==========================================================
    dupes = df_full.columns[df_full.columns.duplicated()]
    if len(dupes) > 0:
        print(f"\n‚ö†Ô∏è Found {len(dupes)} duplicate column names:")
        print("   üß©", list(dupes[:10]), "..." if len(dupes) > 10 else "")
        df_full = df_full.loc[:, ~df_full.columns.duplicated()]
        print(f"üßπ Duplicates removed. Final shape: {df_full.shape}")
    else:
        print("\n‚úÖ No duplicate columns detected in merged dataset.")

    # ==========================================================
    # STEP 4: Alignment check
    # ==========================================================
    # Compare id values by position; Series.equals also compares index labels,
    # which would report a false mismatch for inputs with a non-default index.
    same_ids = df_full["id"].reset_index(drop=True).equals(df1["id"].reset_index(drop=True))
    missing_from_merge = df1[~df1["id"].isin(df_full["id"])]

    print("\nüîç Alignment verification:")
    print(f" - ID alignment maintained? {same_ids}")
    print(f" - Missing IDs after merge: {len(missing_from_merge)}")

    # ==========================================================
    # Final summary
    # ==========================================================
    print("\nüéØ Merge completed successfully!")
    print(f"üìä Final merged dataset shape: {df_full.shape}")
    print(f"üîó Merge keys used: {join_cols}")

    # --- Optional preview ---
    display(df_full.head(2))

    # --- Optional essay text check ---
    essay_cols = [col for col in df_full.columns if "essay_text" in col]
    print(f"\nüß© Essay text-related columns: {essay_cols}")

    return df_full

In [None]:
import numpy as np
import pandas as pd

def prepare_dataframe(df: pd.DataFrame, target_col: str = None):
    """
    General-purpose DataFrame preparation function.
    Performs:
      - Ensures 'id' is a column (not index)
      - Checks duplicate columns and duplicate IDs (first occurrence is kept)
      - Optionally coerces target to numeric (if provided)
      - Detects which columns have NaNs or Infs
      - Replaces inf / -inf with NaN and fills NaN with 0 (features only)
      - Leaves 'id', 'essay_text' and the target column untouched

    Inf detection only inspects numeric feature columns, so frames that also
    hold boolean, text or nullable columns no longer fail inside ``np.isinf``.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame (train or test). It is copied, never modified.
    target_col : str or None
        Target column name (e.g. 'score' for train).
        If None, skips target-related checks.

    Returns
    -------
    pd.DataFrame
        Cleaned and ready DataFrame.

    Raises
    ------
    KeyError
        If 'id' is neither a column nor the index name, or if ``target_col``
        is given but absent.
    """
    print("üßπ Stage 0: Data Preparation & Sanity Checks")

    df = df.copy()

    # ------------------------------------------------------
    # Ensure 'id' column exists
    # ------------------------------------------------------
    if df.index.name == "id":
        # If 'id' is already a column as well, just discard the index copy;
        # a plain reset_index() would fail with "cannot insert id".
        df = df.reset_index(drop="id" in df.columns)
        print("‚Ü™Ô∏è  Reset index: moved 'id' from index to column.")
    if "id" not in df.columns:
        raise KeyError("‚ùå Missing required column: 'id'")

    # ------------------------------------------------------
    # Handle duplicates
    # ------------------------------------------------------
    dup_cols = df.columns[df.columns.duplicated()].tolist()
    if dup_cols:
        print(f"‚ö†Ô∏è Found duplicate columns (kept first occurrence): {dup_cols}")
        df = df.loc[:, ~df.columns.duplicated()]

    dup_ids = df["id"][df["id"].duplicated()].unique()
    if len(dup_ids) > 0:
        print(f"‚ö†Ô∏è Found {len(dup_ids)} duplicated IDs. Keeping first occurrence.")
        df = df.drop_duplicates(subset=["id"], keep="first")

    # ------------------------------------------------------
    # Target column (optional)
    # ------------------------------------------------------
    if target_col:
        if target_col not in df.columns:
            raise KeyError(f"‚ùå Missing target column: '{target_col}'")

        before_non_numeric = df[target_col].dtype
        # Unparseable target values become NaN (errors="coerce").
        df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
        if str(before_non_numeric) != str(df[target_col].dtype):
            print(f"‚ÑπÔ∏è  Coerced '{target_col}' from {before_non_numeric} ‚Üí {df[target_col].dtype}")

    # ------------------------------------------------------
    # Feature subset (exclude protected columns)
    # ------------------------------------------------------
    protect_cols = {"id", "essay_text"}
    if target_col:
        protect_cols.add(target_col)
    feature_cols = [c for c in df.columns if c not in protect_cols]

    # ------------------------------------------------------
    # Detect NaNs and Infs before cleaning
    # ------------------------------------------------------
    # Only numeric columns can contain inf. Restricting the check keeps
    # np.isinf away from the object array that mixed dtypes would produce.
    numeric_cols = df[feature_cols].select_dtypes(include="number").columns.tolist()
    if numeric_cols:
        inf_mask = np.isinf(df[numeric_cols].to_numpy(dtype="float64", na_value=np.nan))
    else:
        inf_mask = np.zeros((len(df), 0), dtype=bool)
    inf_cols = [c for c, has_inf in zip(numeric_cols, inf_mask.any(axis=0)) if has_inf]
    inf_count = int(inf_mask.sum())

    nan_counts_per_col = df[feature_cols].isna().sum()
    nan_cols = nan_counts_per_col[nan_counts_per_col > 0].index.tolist()
    nan_count = nan_counts_per_col.sum()

    if inf_count or nan_count:
        print(f"‚ö†Ô∏è Detected issues in feature columns:")
        if inf_count:
            print(f"   ‚àû Infs: {inf_count} total, in {len(inf_cols)} columns.")
            print(f"      ‚Ü≥ Columns with inf values: {inf_cols[:10]}{' ...' if len(inf_cols) > 10 else ''}")
        if nan_count:
            print(f"   üï≥Ô∏è NaNs: {nan_count} total, in {len(nan_cols)} columns.")
            nan_counts_top = nan_counts_per_col[nan_counts_per_col > 0].sort_values(ascending=False).head(10)
            print("      ‚Ü≥ Top NaN columns (count):")
            for col, cnt in nan_counts_top.items():
                print(f"         - {col}: {cnt}")
        print("‚Ü™Ô∏è  Cleaning features: replacing inf ‚Üí NaN ‚Üí 0")
    else:
        print("‚úÖ No NaN or inf values detected in feature columns.")

    # ------------------------------------------------------
    # Replace inf and NaN
    # ------------------------------------------------------
    if feature_cols:
        df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

    # ------------------------------------------------------
    # Summary
    # ------------------------------------------------------
    print(f"\n‚úÖ Data ready. Shape: {df.shape}")
    print(f"üî¢ Features (excl. protected cols): {len(feature_cols)}")
    print(f"üÜî Unique IDs: {df['id'].nunique()}  |  Rows: {len(df)}")

    if target_col:
        print(f"üéØ Target '{target_col}' ‚Äî min: {df[target_col].min():.4f}, max: {df[target_col].max():.4f}")

    return df

In [None]:
# Train: start from the token-count features, left-join the SB and LDA
# feature tables on 'id', then attach the rows of df_train_scores.
df_train_combined = (
    df_train_tokenised
    .merge(df_train_SB, on='id', how='left')
    .merge(df_train_LDA, on='id', how='left')
)
df_train_combined_with_scores = pd.merge(
    df_train_combined, df_train_scores, on='id', how='left'
)
display(df_train_combined_with_scores)

# Test: same feature joins; there is no score table to attach.
df_test_combined = (
    df_test_tokenised
    .merge(df_test_SB, on='id', how='left')
    .merge(df_test_LDA, on='id', how='left')
)
display(df_test_combined)

In [None]:
# ---- Train ----
# Left-join the aggregated log features with the essay + embedding table.
df_train_full_unchecked = df_train_agg_logs.merge(df_train_recon_D_logs, on="id", how="left")

# Clean the merged features (no target yet), then attach df_train_scores.
df_train_combined_new = prepare_dataframe(df_train_full_unchecked.copy(), target_col=None)
df_train_combined_with_scores_new = pd.merge(
    df_train_combined_new, df_train_scores, on='id', how='left'
)

display(df_train_combined_with_scores_new)

# ---- Test ----
df_test_full_unchecked = df_test_agg_logs.merge(df_test_recon_D_logs, on="id", how="left")

# Count columns that may be absent from the test frame; any that are missing
# are created and filled with 0.
new_columns = ['count_cut', 'count_nonproduction', 'count_paste', 'count_replace']

for col in new_columns:
    if col in df_test_full_unchecked.columns:
        continue
    df_test_full_unchecked[col] = 0

# Put the test columns in the same order as the train columns (minus 'score').
df_test_full_unchecked = df_test_full_unchecked.loc[
    :, df_train_combined_with_scores_new.columns.drop('score')
]

# Same cleaning as for train.
df_test_combined_new = prepare_dataframe(df_test_full_unchecked.copy(), target_col=None)

# Final check and summary
display(df_test_combined_new)

# 3. Training

## Conduct Leave-One-Feature-Out (LOFO)

In [None]:
import torch

# Report whether CUDA is visible to PyTorch (and which device, if so).
print(
    f"GPU is available! Running on: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU not available, running on CPU."
)

In [None]:
'''
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm # Use the notebook version for progress bar
import numpy as np
import copy # Import copy to ensure model is fresh for each training run

# Assuming df_train_combined_with_scores_new is defined and loaded
# Assuming tqdm.notebook is correctly imported

# --- 1. Data Preparation ---
# X: Features (drop 'id' and 'score' columns from df_train_combined_with_scores_new)
X = df_train_combined_with_scores_new.drop(columns=['id', 'score']).copy()
y = df_train_combined_with_scores_new['score']

# üö® FIX: Drop non-numerical/text columns (e.g., 'essay_text').
# XGBoost can only handle numerical or explicit categorical dtypes.
text_cols_to_drop = X.select_dtypes(include=['object']).columns

if len(text_cols_to_drop) > 0:
    print(f"‚ö†Ô∏è Dropping non-numerical columns before training: {list(text_cols_to_drop)}")
    X = X.drop(columns=text_cols_to_drop)
else:
    print("‚úÖ No object-type columns found to drop.")


# --- 2. Create Train and Validation Splits for LOFO ---
# 80% for training the model, 20% for validating the feature importance.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# --- 3. Model Setup ---
# Set up XGBoost with GPU support

def get_new_xgb_model():
    """Returns a new, un-fitted XGBoost model instance."""
    return xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_estimators=100,
        learning_rate=0.05,
        # üí° FIX: Replace tree_method='gpu_hist' and gpu_id=0 with device='cuda:0'
        # tree_method='gpu_hist', # Redundant/deprecated when using device
        # gpu_id=0,               # Deprecated, causes XGBoostError
        device='cuda:0',          # Use the first GPU
        n_jobs=-1
    )

model_baseline = get_new_xgb_model()

# --- 4. Baseline Calculation ---
print("\nTraining Baseline Model...")

# Fit the model with ALL features on the training split
model_baseline.fit(X_train, y_train)

# Predict and calculate Baseline MSE on the UNSEEN validation split
preds_baseline = model_baseline.predict(X_val)
baseline_rmse = np.sqrt(mean_squared_error(y_val, preds_baseline))
print(f"Baseline RMSE with all features (on validation set): {baseline_rmse:.6f}")

# --- 5. LOFO Feature Importance Calculation ---
print("\nStarting Leave-One-Feature-Out (LOFO) analysis...")
feature_importance = []

# Loop through each feature, dropping it one at a time
for feature in tqdm(X_train.columns, desc="LOFO Progress"):
    # 5.1 Prepare data with one feature removed
    X_train_temp = X_train.drop(columns=[feature])
    X_val_temp = X_val.drop(columns=[feature])

    # 5.2 Re-train a FRESH model without the feature
    model_lofo = get_new_xgb_model()
    model_lofo.fit(X_train_temp, y_train)

    # 5.3 Predict and Calculate RMSE on the UNSEEN validation set
    preds_temp = model_lofo.predict(X_val_temp)
    rmse_temp = np.sqrt(mean_squared_error(y_val, preds_temp))  # Calculate RMSE instead of MSE

    # 5.4 Calculate the difference (Positive difference means the feature was important)
    rmse_diff = rmse_temp - baseline_rmse  # Calculate RMSE difference instead of MSE difference

    feature_importance.append({
        'feature': feature,
        'rmse_when_dropped': rmse_temp,  # Store RMSE when dropped
        'rmse_difference': rmse_diff  # Store RMSE difference
    })

# --- 6. Results Display ---
df_lofo = pd.DataFrame(feature_importance)
df_lofo = df_lofo.sort_values(by='rmse_difference', ascending=False).reset_index(drop=True)

# Temporarily set options to ensure all 200 rows are displayed if they exist
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)

print("\n--- LOFO Results ---")
print("Features are ranked by 'mse_difference': A POSITIVE value means removing the feature WORSENED the score (it was important).")
display(df_lofo.head(200))

# Also print the list of the top 200 feature names without truncation
feats = df_lofo['feature'].head(200).tolist()
print("\n--- Top 200 Feature Names (List) ---")
# Use pprint for clean list display
import pprint
pprint.pprint(feats)

# Save the full LOFO results for later use
df_lofo.to_csv('lofo_feature_importance.csv', index=False)
print("\nFull LOFO results saved to lofo_feature_importance.csv")

'''

## Multi-Seed CAT (Function)

In [None]:
'''
def run_catboost_multi_seed(
    df,
    seeds=range(5),
    n_splits=5,
    n_top=25,
    verbose=True
):
    """
    üêà CatBoost Multi-Seed CV Trainer (GPU-adaptive, Stratified, OOF-enabled)
    ------------------------------------------------------------------------
    - StratifiedKFold on discrete essay score bins (0.5‚Äì6.0)
    - Out-Of-Fold predictions for stacking
    - Auto GPU/CPU detection
    - Aggregates feature importances across seeds
    """
    from catboost import CatBoostRegressor, Pool
    import numpy as np, pandas as pd, gc, torch, time, warnings
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import mean_squared_error
    from IPython.display import display

    warnings.filterwarnings("ignore")
    start_time = time.time()

    # =========================================================
    # ‚öôÔ∏è Detect device
    # =========================================================
    device_type = "GPU" if torch.cuda.is_available() else "CPU"
    print(f"üíª Using {device_type}")

    # =========================================================
    # üßπ Data prep
    # =========================================================
    df = df.copy()
    y = df["score"].astype(float).values
    X = (
        df.drop(columns=["id", "score", "essay_text"], errors="ignore")
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )
    features = X.columns.tolist()
    n_samples, n_features = X.shape
    print(f"üìä Loaded: {n_samples:,} samples √ó {n_features:,} features\n")

    if n_features == 0:
        raise ValueError("‚ùå No valid features found for CatBoost training.")

    # Bin labels for stratified folds
    y_bins = (y * 2).astype(int)

    # =========================================================
    # üöÄ Multi-seed Stratified CV
    # =========================================================
    all_rmse, all_models, all_importances = [], [], []
    oof_preds = np.zeros(len(X))

    print(f"üöÄ Starting CatBoost CV: {len(seeds)} seeds √ó {n_splits}-folds (stratified)\n")

    for s_i, seed in enumerate(seeds, 1):
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        fold_rmse, fold_models = [], []
        print(f"üå± Seed {seed} ({s_i}/{len(seeds)})")

        for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y_bins), 1):
            fold_start = time.time()

            train_pool = Pool(X.iloc[tr_idx], label=y[tr_idx])
            val_pool   = Pool(X.iloc[va_idx], label=y[va_idx])

            # ‚úÖ Fixed: removed rsm & subsample (not GPU-supported for RMSE)
            model = CatBoostRegressor(
                task_type=device_type,
                loss_function="RMSE",
                learning_rate=0.01,
                depth=6,
                iterations=5000,
                l2_leaf_reg=3,
                random_seed=seed,
                early_stopping_rounds=100,
                verbose=0
            )

            model.fit(train_pool, eval_set=val_pool, verbose=200 if verbose else False)

            preds = model.predict(val_pool)
            oof_preds[va_idx] += preds / len(seeds)

            rmse = mean_squared_error(y[va_idx], preds, squared=False)
            fold_rmse.append(rmse)
            fold_models.append(model)

            print(f"   ‚úÖ Fold {fold}/{n_splits}: RMSE={rmse:.4f} | BestIter={model.get_best_iteration()} | ‚è± {(time.time()-fold_start):.1f}s")

        mean_rmse, std_rmse = np.mean(fold_rmse), np.std(fold_rmse)
        all_rmse.append(mean_rmse)
        all_models.extend(fold_models)

        imp_df = pd.DataFrame({
            "feature": features,
            "importance": model.get_feature_importance(),
            "seed": seed
        })
        all_importances.append(imp_df)
        gc.collect()

        print(f"üåæ Seed {seed} done ‚Üí RMSE={mean_rmse:.4f} ¬± {std_rmse:.4f}\n")

    # =========================================================
    # üßÆ Aggregate importances
    # =========================================================
    avg_imp = (
        pd.concat(all_importances)
        .groupby("feature", as_index=False)["importance"]
        .mean()
        .sort_values("importance", ascending=False)
        .reset_index(drop=True)
    )

    total_min = (time.time() - start_time) / 60
    mean_rmse, std_rmse = np.mean(all_rmse), np.std(all_rmse)
    print(f"üèÅ Completed {len(seeds)} seeds in {total_min:.2f} min")
    print(f"üìâ Overall CV RMSE: {mean_rmse:.4f} ¬± {std_rmse:.4f}\n")

    if verbose:
        print(f"üèÖ Top {n_top} Averaged Features:")
        display(avg_imp.head(n_top))

    return {
        "all_models": all_models,
        "oof_preds": oof_preds,
        "all_rmse": all_rmse,
        "mean_rmse": mean_rmse,
        "std_rmse": std_rmse,
        "feature_importance_avg": avg_imp,
        "features": features,
        "runtime_min": total_min,
        "device_used": device_type,
    }
'''

## Multi-Seed Training (Function)

In [None]:
'''
import time

def run_multiple_models(df_train, models="all", seeds=range(10), folds=5, top_n=25):
    """
    Run multiple machine learning models based on the given argument and display the results.

    Parameters:
    - df_train: The training dataframe.
    - models: The models to run. Options: "all", "lgb", "xgb", "cat".
    - seeds: The list of random seeds to use for model training.
    - folds: The number of folds for cross-validation.
    - top_n: The number of top features to consider.
    """
    
    models_dict = {"lgb": run_lightgbm_multi_seed,
                   "xgb": run_xgboost_multi_seed,
                   "cat": run_catboost_multi_seed}

    print("‚ö° Starting multi-model training...\n")
    start_all = time.time()
    
    # lgb Model
    if models == "all" or "lgb" in models:
        print("Training lgb...")
        res_lgb = run_lightgbm_multi_seed(df_train, seeds=seeds, n_splits=folds, n_top=top_n)
        print(f"üìâ lgb RMSE: {res_lgb['mean_rmse']:.4f}")
        
        # Feature importance for lgb
        imp_lgb = res_lgb['feature_importance_avg']
        top25_features_lgb = imp_lgb.head(top_n)['feature'].tolist()
        print(f"lgb Top {top_n} features:", top25_features_lgb)

    # XGB Model
    if models == "all" or "xgb" in models:
        print("Training XGB...")
        res_xgb = run_xgboost_multi_seed(df_train, seeds=seeds, n_splits=folds, n_top=top_n)
        print(f"üìâ XGB RMSE: {res_xgb['mean_rmse']:.4f}")
        
        # Feature importance for XGB
        imp_xgb = res_xgb['feature_importance_avg']
        top25_features_xgb  = imp_xgb.head(top_n)['feature'].tolist()
        print(f"XGB Top {top_n} features:", top25_features_xgb)

    # CAT Model
    if models == "all" or "cat" in models:
        print("Training CAT...")
        res_cat = run_catboost_multi_seed(df_train, seeds=seeds, n_splits=folds, n_top=top_n)
        print(f"üìâ CAT RMSE: {res_cat['mean_rmse']:.4f}")
        
        # Feature importance for CAT
        imp_cat = res_cat['feature_importance_avg']
        top25_features_cat   = imp_cat.head(top_n)['feature'].tolist()
        print(f"CAT Top {top_n} features:", top25_features_cat)

    print("\nüèÅ Training complete!")
    print(f"‚è± Total time: {(time.time() - start_all)/60:.2f} min")
'''

In [None]:
'''
df_train = df_train_full.copy()
run_multiple_models(df_train, models="cat", seeds=range(1), folds=3, top_n=25)
'''

## Extract Features

In [None]:
feats = ['paragraph_count',
 'inter_key_median_lantency',
 'feature 1048',
 'feature 1049',
 'feature 1120',
 'feature 448',
 'feature 449',
 'feature 846',
 'feature 847',
 'feature 65',
 'feature 138',
 'feature 141',
 'text_change_7_cnt',
 'sent_word_count_median',
 'feature 1023',
 'feature 1031',
 'feature 335',
 'feature 364',
 'feature 365',
 'feature 789',
 'feature 808',
 'feature 809',
 'R-bursts_median',
 'Topic_4',
 'cursor_position_max',
 'feature 0',
 'feature 1108',
 'feature 1109',
 'feature 1130',
 'feature 126',
 'feature 168',
 'feature 185',
 'feature 719',
 'feature 720',
 'feature 992',
 'feature 993',
 'paragraph_len_sum',
 'sent_len_sum',
 'sent_word_count_sum',
 'word_count_max',
 'word_count_std',
 'word_len_count',
 'word_len_sum',
 'action_time_max',
 'down_event_3_cnt',
 'up_event_3_cnt',
 'pauses_1_half_sec',
 'paragraph_word_count_q2',
 'product_to_keys',
 'paragraph_len_max',
 'text_change_6_cnt',
 'feature 1124',
 'feature 580',
 'feature 200',
 'feature 721',
 'feature 994',
 'feature 180',
 'feature 556',
 'mean_pause_time',
 'cursor_position_std',
 'word_len_q2',
 'sent_word_count_q2',
 'text_change_8_cnt',
 'feature 905',
 'feature 1115',
 'feature 913',
 'feature 288',
 'sent_word_count_q7',
 'feature 179',
 'feature 47',
 'feature 55',
 'feature 908',
 'text_change',
 'paragraph_word_count_q7',
 'down_event_12_cnt',
 'up_event_12_cnt',
 'feature 376',
 'feature 377',
 'feature 814',
 'feature 76',
 'feature 1075',
 'feature 1079',
 'feature 605',
 'feature 614',
 'feature 622',
 'feature 923',
 'feature 929',
 'feature 934',
 'action_time_std',
 'feature 480',
 'sent_word_count_first',
 'sent_len_q2',
 'feature 319',
 'text_change_12_cnt',
 'activity_1_cnt',
 'down_event_2_cnt',
 'up_event_2_cnt',
 'feature 38',
 'feature 43',
 'feature 46',
 'feature 9',
 'word_len_q7',
 'feature 336',
 'paragraph_len_first',
 'activity_3_cnt',
 'feature 1063',
 'feature 521',
 'feature 522',
 'feature 882',
 'feature 883',
 'feature 594',
 'feature 1035',
 'feature 1117',
 'feature 823',
 'feature 587',
 'sent_len_mean',
 'feature 893',
 'feature 572',
 'feature 1003',
 'feature 741',
 'text_change_11_cnt',
 'feature 1080',
 'feature 1081',
 'feature 629',
 'feature 630',
 'feature 935',
 'feature 936',
 'keys_per_second',
 'activity_0_cnt',
 'down_event_0_cnt',
 'text_change_0_cnt',
 'up_event_0_cnt',
 'down_event_10_cnt',
 'up_event_10_cnt',
 'feature 162',
 'feature 163',
 'feature 557',
 'feature 92',
 'feature 95',
 'feature 14',
 'feature 19',
 'feature 235',
 'feature 267',
 'feature 276',
 'feature 8',
 'feature 1002',
 'feature 1111',
 'feature 740',
 'word_len_median',
 'sent_len_median',
 'feature 252',
 'feature 253',
 'feature 593',
 'feature 722',
 'feature 995',
 'feature 544',
 'text_change_10_cnt',
 'feature 508',
 'feature 778',
 'paragraph_word_count_min',
 'feature 72',
 'feature 75',
 'R-bursts_std',
 'feature 384',
 'feature 817',
 'feature 388',
 'feature 897',
 'feature 539',
 'feature 1055',
 'feature 863',
 'feature 1066',
 'std_pause_time',
 'input_word_length_mean',
 'paragraph_len_min',
 'feature 389',
 'feature 400',
 'feature 418',
 'feature 894',
 'feature 79',
 'feature 1123',
 'feature 170',
 'feature 78',
 'Topic_2',
 'total_pause_time',
 'down_event_13_cnt',
 'text_change_5_cnt',
 'up_event_13_cnt',
 'pauses_3_sec',
 'word_len_max',
 'P-bursts_count',
 'feature 481',
 'sent_word_count_mean',
 'Topic_3',
 'feature 1020',
 'feature 1021',
 'feature 419',
 'feature 152',
 'feature 153',
 'feature 566',
 'feature 84',
 'feature 284',
 'input_word_count',
 'feature 410',
 'feature 60',
 'sent_len_first',
 'feature 181',
 'feature 417',
 'word_count_mean',
 'word_count_median',
 'word_count_quantile',
 'feature 383',
 'feature 918',
 'word_len_mean',
 'down_event_9_cnt',
 'up_event_9_cnt',
 'feature 173',
 'feature 1121',
 'P-bursts_last',
 'word_len_min',
 'feature 1061',
 'feature 1122',
 'feature 519',
 'feature 880',
 'feature 85',
 'feature 499',
 'feature 507',
 'feature 515',
 'paragraph_word_count_mean',
 'feature 898',
 'word_len_first',
 'input_word_length_max',
 'feature 178',
 'feature 1059',
 'feature 873',
 'feature 877',
 'paragraph_word_count_max',
 'feature 184',
 'text_change_9_cnt',
 'feature 174',
 'paragraph_len_median',
 'down_event_1_cnt',
 'text_change_1_cnt',
 'up_event_1_cnt',
 'feature 1036',
 'action_time_sum',
 'down_event_14_cnt',
 'up_event_14_cnt',
 'feature 1091',
 'feature 1092',
 'feature 1127',
 'feature 665',
 'feature 666',
 'feature 954',
 'feature 955',
 'input_word_length_std',
 'sent_len_q7',
 'down_event_8_cnt',
 'text_change_3_cnt',
 'up_event_8_cnt',
 'down_time_min',
 'up_time_min',
 'feature 598',
 'feature 503',
 'feature 504',
 'feature 876',
 'R-bursts_mean',
 'feature 1072',
 'paragraph_word_count_last',
 'activity_2_cnt',
 'down_time_std',
 'up_time_std',
 'feature 1024',
 'feature 790',
 'Topic_5',
 'paragraph_word_count_sum',
 'paragraph_word_count_first',
 'pauses_1_sec',
 'R-bursts_max',
 'inter_key_largest_lantency',
 'paragraph_len_q7',
 'pauses_2_sec',
 'paragraph_len_mean',
 'cursor_position_mean',
 'cursor_position_median',
 'cursor_position_quantile',
 'feature 1062',
 'feature 520',
 'feature 881',
 'down_event',
 'up_event',
 'down_event_11_cnt',
 'up_event_11_cnt',
 'paragraph_len_q2',
 'paragraph_word_count_median',
 'feature 127',
 'text_change_4_cnt',
 'paragraph_len_last',
 'down_event_4_cnt',
 'up_event_4_cnt',
 'feature 169',
 'Topic_0'] + ['_Topic_0','_Topic_1','_Topic_2','_Topic_3','_Topic_4','_Topic_5'] + [f'__Topic_{x}' for x in range(6)] 

In [None]:
# Saved list of 25 feature (column) names for the CatBoost model.
# NOTE: the K-Fold training cell further down defines this same list again and uses it
# to select the CatBoost input columns; keep the two copies in sync if either is edited.
top25_features_cat_saved = [
    'max_cursor',
    'final_word_count',
    'max_word_count',
    'num_words',
    'commas_per_sentence',
    'q_tc_count',
    'num_paragraphs',
    'std_cursor',
    'count_input',
    'spaces_before_punct_per_100_tokens',
    'commas_per_100_words',
    'any_internal_punct_share',
    'mean_cursor',
    'body_para_mean_len',
    'avg_internal_punct_per_sentence',
    'words_per_second',
    'dashes_count',
    'q_overall_delta',
    'time_per_word',
    'std_word_count',
    'multi_clause_sent_share',
    'spaces_before_comma',
    'double_spaces_after_eos',
    'mean_sentence_len',
    'deb_emb_424',
]

# Sanity check: report how many features are in the saved list.
print(f"   CAT  ‚Üí {len(top25_features_cat_saved)} features")

In [None]:
print('< Mapping >')

# Training features and target.
# 'id' and 'score' are dropped before selecting `feats`, so a target column listed in
# `feats` by mistake raises a KeyError instead of leaking into the feature matrix.
# .copy() gives `x` its own data: later cells add columns to it (tsne_*, ...), and
# assigning into a column selection of another frame can trigger SettingWithCopyWarning.
x = df_train_combined_with_scores.drop(['id', 'score'], axis=1)[feats].copy()
y = df_train_combined_with_scores['score'].values  # Target

print(f'Number of features: {len(x.columns)}')

## Test
# Test ids and test features both come from df_test_combined, so they share row order.
test_ids = df_test_combined['id'].values
testin_x = df_test_combined.drop(['id'], axis=1)[feats].copy()

## t-distributed Stochastic Neighbour Embedding (t-SNE)

In [None]:
from sklearn.manifold import TSNE
#For Features Adding
# Embed train and test rows together, using only the base `feats` columns so that
# re-running this cell (after tsne_* columns were added) feeds t-SNE the same input.
df = pd.concat([x[feats],testin_x[feats]])

t_sne = TSNE(n_components=2, random_state=42, perplexity=20, n_jobs=-1, verbose=True)
df_tsne = t_sne.fit_transform(df.fillna(0))

# Rows [0, n_train) of the embedding are training rows, the rest are test rows
# (same order as the concat above).
n_train = x.shape[0]

# `cmap` only takes effect when per-point values are passed via `c`; without them
# matplotlib ignores the colormap and the colorbar is meaningless. Training points are
# coloured by their score; test points (no score) are drawn first, in grey.
plt.figure(figsize=(13,10))
plt.scatter(df_tsne[n_train:, 0], df_tsne[n_train:, 1], c="lightgray", label="test")
plt.scatter(df_tsne[:n_train, 0], df_tsne[:n_train, 1], c=y, cmap="jet", label="train")
plt.colorbar(label="score")  # uses the last scatter (training points) as mappable
plt.title("t-SNE (perplexity=20)")
plt.legend()
plt.show()

x['tsne_0'] = df_tsne[:n_train,0]
x['tsne_1'] = df_tsne[:n_train,1]

testin_x['tsne_0'] = df_tsne[n_train:,0]
testin_x['tsne_1'] = df_tsne[n_train:,1]

In [None]:
from sklearn.manifold import TSNE
#For Features Adding
# Embed train and test rows together, using only the base `feats` columns so that
# re-running this cell (after tsne_* columns were added) feeds t-SNE the same input.
df = pd.concat([x[feats],testin_x[feats]])

t_sne = TSNE(n_components=2, random_state=42, perplexity=50, n_jobs=-1, verbose=True)
df_tsne = t_sne.fit_transform(df.fillna(0))

# Rows [0, n_train) of the embedding are training rows, the rest are test rows
# (same order as the concat above).
n_train = x.shape[0]

# `cmap` only takes effect when per-point values are passed via `c`; without them
# matplotlib ignores the colormap and the colorbar is meaningless. Training points are
# coloured by their score; test points (no score) are drawn first, in grey.
plt.figure(figsize=(13,10))
plt.scatter(df_tsne[n_train:, 0], df_tsne[n_train:, 1], c="lightgray", label="test")
plt.scatter(df_tsne[:n_train, 0], df_tsne[:n_train, 1], c=y, cmap="jet", label="train")
plt.colorbar(label="score")  # uses the last scatter (training points) as mappable
plt.title("t-SNE (perplexity=50)")
plt.legend()
plt.show()

x['_tsne_0'] = df_tsne[:n_train,0]
x['_tsne_1'] = df_tsne[:n_train,1]

testin_x['_tsne_0'] = df_tsne[n_train:,0]
testin_x['_tsne_1'] = df_tsne[n_train:,1]

In [None]:
from sklearn.manifold import TSNE
# For Features Adding
# Embed train and test rows together, using only the base `feats` columns so that
# re-running this cell (after tsne_* columns were added) feeds t-SNE the same input.
df = pd.concat([x[feats],testin_x[feats]])

t_sne = TSNE(n_components=2, random_state=42, perplexity=80, n_jobs=-1, verbose=True)
df_tsne = t_sne.fit_transform(df.fillna(0))

# Rows [0, n_train) of the embedding are training rows, the rest are test rows
# (same order as the concat above).
n_train = x.shape[0]

# `cmap` only takes effect when per-point values are passed via `c`; without them
# matplotlib ignores the colormap and the colorbar is meaningless. Training points are
# coloured by their score; test points (no score) are drawn first, in grey.
plt.figure(figsize=(13,10))
plt.scatter(df_tsne[n_train:, 0], df_tsne[n_train:, 1], c="lightgray", label="test")
plt.scatter(df_tsne[:n_train, 0], df_tsne[:n_train, 1], c=y, cmap="jet", label="train")
plt.colorbar(label="score")  # uses the last scatter (training points) as mappable
plt.title("t-SNE (perplexity=80)")
plt.legend()
plt.show()

x['__tsne_0'] = df_tsne[:n_train,0]
x['__tsne_1'] = df_tsne[:n_train,1]

testin_x['__tsne_0'] = df_tsne[n_train:,0]
testin_x['__tsne_1'] = df_tsne[n_train:,1]

## MultinomialNB

In [None]:
def train_valid_split(data_x, data_y, train_idx, valid_idx):
    """Split features and targets into train / validation parts by row position.

    Parameters
    ----------
    data_x, data_y : numpy array or pandas DataFrame/Series
        Features and targets with the same number of rows.
    train_idx, valid_idx : array-like of int
        Positional row indices (as produced by a scikit-learn splitter).

    Returns
    -------
    (x_train, y_train, x_valid, y_valid), each of the same container type as its input.
    """
    def _take(data, idx):
        # Plain [] on a pandas Series is a *label* lookup and on a DataFrame a *column*
        # lookup; .iloc is always positional, which is what splitter indices mean.
        return data.iloc[idx] if hasattr(data, "iloc") else data[idx]

    return (
        _take(data_x, train_idx),
        _take(data_y, train_idx),
        _take(data_x, valid_idx),
        _take(data_y, valid_idx),
    )


# Modify your evaluate function
def evaluate(data_x, data_y, model, n_splits=5, n_bags=1, test_x=None):
    """Out-of-fold class probabilities from stratified K-fold cross-validation.

    A fresh, unfitted clone of `model` is trained on every fold, so the estimator
    passed in is the one actually evaluated and is itself left unfitted.

    Parameters
    ----------
    data_x : array-like, one row per sample
    data_y : numpy array or pandas Series of class labels (must support `.astype`)
    model : scikit-learn classifier exposing `fit`, `predict_proba` and `classes_`
    n_splits : number of folds per bag
    n_bags : number of CV repetitions; a single bag uses seed 42, otherwise the bag
        index is the seed
    test_x : optional array-like to predict with every fold model

    Returns
    -------
    (cv_results, test_results)
        cv_results   : (len(data_x), n_classes) OOF probabilities, averaged over bags
        test_results : (len(test_x), n_classes) probabilities averaged over all folds
                       and bags, or None when `test_x` is None
        Column j corresponds to the j-th smallest distinct label in `data_y`.
    """
    from sklearn.base import clone  # local import keeps this cell self-contained

    classes = np.sort(pd.Series(data_y).dropna().unique())
    n_classes = len(classes)
    cv_results = np.zeros((len(data_x), n_classes))
    test_results = np.zeros((len(test_x), n_classes)) if test_x is not None else None

    for bag in range(n_bags):
        seed = 42 if n_bags == 1 else bag
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        for i, (train_index, valid_index) in tqdm(enumerate(skf.split(data_x, data_y.astype(str)))):
            train_x, train_y, valid_x, valid_y = train_valid_split(data_x, data_y, train_index, valid_index)

            fold_model = clone(model)
            fold_model.fit(train_x, train_y)

            # predict_proba has one column per class seen in this fold's training data;
            # map those onto the global class columns so a class missing from a fold
            # cannot shift or break the output layout.
            cols = np.searchsorted(classes, fold_model.classes_)

            # Accumulate (instead of overwrite) so several bags are averaged.
            cv_results[np.ix_(valid_index, cols)] += fold_model.predict_proba(valid_x) / n_bags

            if test_x is not None:
                test_results[:, cols] += fold_model.predict_proba(test_x) / (n_splits * n_bags)

    return cv_results, test_results

In [None]:
from xgboost import XGBRegressor, XGBClassifier
## from tabpfn import TabPFNClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier   
import scipy.stats as stats

# Score -> class id used as the classification target.
# 0.5 and 1.0 share class 0; 5.5 and 6.0 share class 1.
vals = {
    0.5: 0, 1.0: 0,
    5.5: 1, 6.0: 1,
    1.5: 2,
    2.0: 3,
    5.0: 4,
    2.5: 5,
    3.0: 6,
    4.5: 7,
    3.5: 8,
    4.0: 9,
}
_y = pd.Series(y).map(vals)

# Estimator handed to evaluate() as its `model` argument.
solution = MultinomialNB(alpha=1.0)

# PCA is fitted on train + test rows together, with NaNs replaced by 0.
p_comp = 100
df = pd.concat([x, testin_x])
pca = PCA(n_components=p_comp, random_state=42)
pca.fit(df.fillna(0))

# Components are squared, so every input value is non-negative
# (MultinomialNB rejects negative feature values).
_x = pca.transform(x.fillna(0)) ** 2
_testin_x = pca.transform(testin_x.fillna(0)) ** 2

oof_prob_2, oof_prob_test_2 = evaluate(
    _x.copy(),
    _y.copy(),
    solution,
    n_splits=5,
    n_bags=1,
    test_x=_testin_x.copy(),
)
oof_prob_2

## MLP Classifier

In [None]:
def train_valid_split(data_x, data_y, train_idx, valid_idx):
    """Split features and targets into train / validation parts by row position.

    Parameters
    ----------
    data_x, data_y : numpy array or pandas DataFrame/Series
        Features and targets with the same number of rows.
    train_idx, valid_idx : array-like of int
        Positional row indices (as produced by a scikit-learn splitter).

    Returns
    -------
    (x_train, y_train, x_valid, y_valid), each of the same container type as its input.
    """
    def _take(data, idx):
        # Plain [] on a pandas Series is a *label* lookup and on a DataFrame a *column*
        # lookup; .iloc is always positional, which is what splitter indices mean.
        return data.iloc[idx] if hasattr(data, "iloc") else data[idx]

    return (
        _take(data_x, train_idx),
        _take(data_y, train_idx),
        _take(data_x, valid_idx),
        _take(data_y, valid_idx),
    )


# Modify your evaluate function
def evaluate(data_x, data_y, model, n_splits=5, n_bags=1, test_x=None):
    """Out-of-fold class probabilities from stratified K-fold cross-validation.

    A fresh, unfitted clone of `model` is trained on every fold, so the estimator
    passed in is the one actually evaluated and is itself left unfitted.

    Parameters
    ----------
    data_x : array-like, one row per sample
    data_y : numpy array or pandas Series of class labels (must support `.astype`)
    model : scikit-learn classifier exposing `fit`, `predict_proba` and `classes_`
    n_splits : number of folds per bag
    n_bags : number of CV repetitions; a single bag uses seed 42, otherwise the bag
        index is the seed
    test_x : optional array-like to predict with every fold model

    Returns
    -------
    (cv_results, test_results)
        cv_results   : (len(data_x), n_classes) OOF probabilities, averaged over bags
        test_results : (len(test_x), n_classes) probabilities averaged over all folds
                       and bags, or None when `test_x` is None
        Column j corresponds to the j-th smallest distinct label in `data_y`.
    """
    from sklearn.base import clone  # local import keeps this cell self-contained

    classes = np.sort(pd.Series(data_y).dropna().unique())
    n_classes = len(classes)
    cv_results = np.zeros((len(data_x), n_classes))
    test_results = np.zeros((len(test_x), n_classes)) if test_x is not None else None

    for bag in range(n_bags):
        seed = 42 if n_bags == 1 else bag
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        for i, (train_index, valid_index) in tqdm(enumerate(skf.split(data_x, data_y.astype(str)))):
            train_x, train_y, valid_x, valid_y = train_valid_split(data_x, data_y, train_index, valid_index)

            fold_model = clone(model)
            fold_model.fit(train_x, train_y)

            # predict_proba has one column per class seen in this fold's training data;
            # map those onto the global class columns so a class missing from a fold
            # cannot shift or break the output layout.
            cols = np.searchsorted(classes, fold_model.classes_)

            # Accumulate (instead of overwrite) so several bags are averaged.
            cv_results[np.ix_(valid_index, cols)] += fold_model.predict_proba(valid_x) / n_bags

            if test_x is not None:
                test_results[:, cols] += fold_model.predict_proba(test_x) / (n_splits * n_bags)

    return cv_results, test_results

In [None]:
from xgboost import XGBRegressor, XGBClassifier
## from tabpfn import TabPFNClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier   
import scipy.stats as stats

# Score -> class id used as the classification target.
# 0.5 and 1.0 share class 0; 5.5 and 6.0 share class 1.
vals = {
    0.5: 0, 1.0: 0,
    5.5: 1, 6.0: 1,
    1.5: 2,
    2.0: 3,
    5.0: 4,
    2.5: 5,
    3.0: 6,
    4.5: 7,
    3.5: 8,
    4.0: 9,
}
_y = pd.Series(y).map(vals)

# Estimator handed to evaluate() as its `model` argument.
solution = MLPClassifier(random_state=42)

# PCA is fitted on train + test rows together, with NaNs replaced by 0.
p_comp = 100
df = pd.concat([x, testin_x])
pca = PCA(n_components=p_comp, random_state=42)
pca.fit(df.fillna(0))

# Same squared-component inputs as used for the MultinomialNB model.
_x = pca.transform(x.fillna(0)) ** 2
_testin_x = pca.transform(testin_x.fillna(0)) ** 2

oof_prob_3, oof_prob_test_3 = evaluate(
    _x.copy(),
    _y.copy(),
    solution,
    n_splits=5,
    n_bags=1,
    test_x=_testin_x.copy(),
)
oof_prob_3

## Weighted Sum

In [None]:
# Start from fresh copies of the feature frames; this replaces the PCA arrays that the
# classifier cells stored under the same names.
_x = x.copy()
_testin_x = testin_x.copy()

# One column per class (10 classes) for each stacked classifier.
nb_cols = [f'probs_nb_{i}' for i in range(10)]
nn_cols = [f'probs_nn_{i}' for i in range(10)]

## _x[[f'probs_tab_{i}' for i in range(10)]] = oof_prob_1.copy()
## _testin_x[[f'probs_tab_{i}' for i in range(10)]] = oof_prob_test_1.copy()

# Train rows receive out-of-fold probabilities, test rows the fold-averaged ones.
_x[nb_cols] = oof_prob_2.copy()
_testin_x[nb_cols] = oof_prob_test_2.copy()
_x[nn_cols] = oof_prob_3.copy()
_testin_x[nn_cols] = oof_prob_test_3.copy()

In [None]:
# Score -> class id (0.5/1.0 share class 0, 5.5/6.0 share class 1).
vals = {
    0.5: 0, 1.0: 0,
    5.5: 1, 6.0: 1,
    1.5: 2, 2.0: 3, 5.0: 4, 2.5: 5, 3.0: 6, 4.5: 7, 3.5: 8, 4.0: 9,
}

# Class id -> score used when turning class probabilities back into a score.
# The two merged classes map to a single score each: class 0 -> 1, class 1 -> 6.
Inversemapper = {
    0: 1,
    1: 6,
    2: 1.5,
    3: 2,
    4: 5,
    5: 2.5,
    6: 3,
    7: 4.5,
    8: 3.5,
    9: 4,
}

## new feat
def f(x):
    """Return the probability-weighted score: sum over i of x[i] * Inversemapper[i].

    `x` is an iterable of per-class values whose position i is the class id.
    An empty iterable yields 0.
    """
    return sum(prob * Inversemapper[cls] for cls, prob in enumerate(x))
# For every stacked classifier, add one '<prefix>w_sum' column holding the
# probability-weighted score of each row (train frame first, then test frame).
for str_model in ['probs_nb_','probs_nn_']:  # Remove 'probs_tab_' if you no longer use it
    prob_cols = [f'{str_model}{i}' for i in range(10)]
    w_sum_col = f'{str_model}w_sum'
    for frame in (_x, _testin_x):
        frame[w_sum_col] = frame[prob_cols].apply(f, axis=1)

## Run XGBoost

In [None]:

def train_valid_split(data_x, data_y, train_idx, valid_idx):
    """Split features and targets into train / validation parts by row position.

    Parameters
    ----------
    data_x, data_y : numpy array or pandas DataFrame/Series
        Features and targets with the same number of rows.
    train_idx, valid_idx : array-like of int
        Positional row indices (as produced by a scikit-learn splitter).

    Returns
    -------
    (x_train, y_train, x_valid, y_valid), each of the same container type as its input.
    """
    def _take(data, idx):
        # .iloc is always positional; plain [] on a pandas Series is a *label* lookup,
        # which silently picks the wrong rows when the index is not 0..n-1.
        return data.iloc[idx] if hasattr(data, "iloc") else data[idx]

    return (
        _take(data_x, train_idx),
        _take(data_y, train_idx),
        _take(data_x, valid_idx),
        _take(data_y, valid_idx),
    )


# Modify your evaluate function
def evaluate(data_x, data_y, model, n_splits=5, n_bags=1, test_x=None):
    """Cross-validated predictions of a regressor, averaged over bags.

    Each bag is one stratified K-fold pass (folds stratified on `data_y` cast to str).
    `model` itself is re-fitted on every fold. A single bag uses seed 42; with several
    bags the bag index is the seed.

    Returns
    -------
    (oof, test)
        oof  : per-row out-of-fold prediction, mean over bags
        test : per-row test prediction, mean over folds and bags, or None when
               `test_x` is None
    """
    has_test = test_x is not None

    # One column per bag.
    cv_results = np.zeros((len(data_x), n_bags))
    test_results = np.zeros((len(test_x), n_bags)) if has_test else None

    for bag in range(n_bags):
        seed = 42 if n_bags == 1 else bag
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        folds = skf.split(data_x, data_y.astype(str))

        for i, (train_index, valid_index) in tqdm(enumerate(folds)):
            train_x, train_y, valid_x, valid_y = train_valid_split(
                data_x, data_y, train_index, valid_index
            )

            model.fit(train_x, train_y)

            # Out-of-fold predictions for this bag.
            cv_results[valid_index, bag] = model.predict(valid_x)

            # Each fold contributes 1/n_splits of the bag's test prediction.
            if has_test:
                test_results[:, bag] += model.predict(test_x) / n_splits

    oof_mean = np.mean(cv_results, axis=1)
    if not has_test:
        return oof_mean, None
    return oof_mean, np.mean(test_results, axis=1)

from xgboost import XGBRegressor
from lightgbm import LGBMClassifier
from catboost import CatBoostRegressor
from sklearn.svm import SVR
import scipy.stats as stats

# XGBoost regressor settings.
param = dict(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=4,
    subsample=0.3,
    objective='reg:squarederror',
    random_state=42,
)
solution = XGBRegressor(**param)

# 5-fold OOF predictions on the stacked training frame plus fold-averaged test predictions.
oof_3, test_results_xg_2 = evaluate(
    _x.copy(),
    y.copy(),
    solution,
    n_splits=5,
    n_bags=1,
    test_x=_testin_x.copy(),
)

xg_cv_rmse = np.sqrt(mean_squared_error(y, oof_3))
print('XG CV RMSE: ', xg_cv_rmse)

# Rank agreement between true scores and OOF predictions.
spearman_correlation, p_value = stats.spearmanr(y, oof_3)
print('Spearman Correlation Coefficient: ', spearman_correlation)

# XG CV RMSE:  0.5876518182062751
# Spearman Correlation Coefficient:  0.822345159257767


## Output Best CatBoost Parameters (Run Once)

In [None]:
## !pip install optuna

In [None]:
'''
import joblib, gc
import os
import numpy as np, pandas as pd
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
import torch
import warnings

# ============================================================
# üö´ SUPPRESS ALL WARNINGS (global and library-specific)
# ============================================================
warnings.filterwarnings("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"
os.environ["XGBOOST_VERBOSITY"] = "0"
os.environ["CATBOOST_LOGGING_LEVEL"] = "Silent"

print("üöÄ Retraining final (top-N) models on full training data (all warnings suppressed)...")

df_train = df_train_full.copy()

# --- Helper to restrict columns safely
def select_features(df_train, feats):
    return df_train.loc[:, [f for f in feats if f in df_train.columns]]

# --- Common clean-up
X_full = df_train.drop(columns=['id', 'score', 'essay_text'], errors='ignore')
X_full = X_full.replace([np.inf, -np.inf], np.nan).fillna(0)
y_full = df_train['score'].values

# ============================================================
# Objective Function for CatBoost
# ============================================================
def objective_cb(trial):
    print(f"\nüöÄ Running trial {trial.number} for CatBoost...")

    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 0.1) if trial.number > 0 else 0.01
    depth = trial.suggest_int('depth', 4, 12) if trial.number > 0 else 6
    iterations = trial.suggest_int('iterations', 1000, 5000) if trial.number > 0 else 3000
    l2_leaf_reg = trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10.0) if trial.number > 0 else 0.1
    bagging_temperature = trial.suggest_uniform('bagging_temperature', 0, 1.0) if trial.number > 0 else 0.2

    X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

    model = CatBoostRegressor(
        iterations=iterations,
        learning_rate=learning_rate,
        depth=depth,
        l2_leaf_reg=l2_leaf_reg,
        bagging_temperature=bagging_temperature,
        loss_function='RMSE',
        random_seed=42,
        verbose=False,
        task_type='GPU' if torch.cuda.is_available() else 'CPU'
    )

    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = mean_squared_error(y_val, preds, squared=False)
    print(f"Trial {trial.number}: RMSE = {rmse:.4f}")
    return rmse

# ============================================================
# Perform Optuna Hyperparameter Optimization for All Models
# ============================================================

study_cb = optuna.create_study(direction='minimize')
study_cb.optimize(objective_cb, n_trials=10)
print(f"Best CatBoost Params: {study_cb.best_params}")
print(f"Best RMSE: {study_cb.best_value}")
'''

## Save Model Parameters

In [None]:
# ============================================================
# Save best hyperparameters for each model
# ============================================================

# ============================================================
# CatBoost Best Parameters
# ============================================================
# Hard-coded CatBoost hyperparameters. The keys are the five parameters the disabled
# Optuna cell above tunes (learning_rate, depth, iterations, l2_leaf_reg,
# bagging_temperature).
# NOTE(review): the K-Fold training cell below repeats these values in its own
# `best_params_cat` dict rather than reading this one; keep both in sync.
best_params_cat_saved = {
    'learning_rate': 0.0049453015631225694,
    'depth': 5,
    'iterations': 1227,
    'l2_leaf_reg': 3.9931762727996736e-05,
    'bagging_temperature': 0.123929117769213
}

# Print out the saved parameters to check
print(f"CatBoost Best Params: {best_params_cat_saved}")

## Kfold Train

In [None]:

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# ---------------------------------------------------------
# Step 1: Load the full training dataset and prepare data
# ---------------------------------------------------------
df_train = df_train_combined_with_scores_new.copy()

# Prepare features and labels
X_full = df_train.drop(columns=['id', 'score', 'essay_text'], errors='ignore')  # Use all features (excluding id, score, essay_text)
X_full = X_full.replace([np.inf, -np.inf], np.nan).fillna(0)  # inf -> NaN -> 0
y_full = df_train['score'].values  # Target variable

# ---------------------------------------------------------
# Step 2: Use the top 25 features for CatBoost
# ---------------------------------------------------------
top25_features_cat_saved = [
    'max_cursor',
    'final_word_count',
    'max_word_count',
    'num_words',
    'commas_per_sentence',
    'q_tc_count',
    'num_paragraphs',
    'std_cursor',
    'count_input',
    'spaces_before_punct_per_100_tokens',
    'commas_per_100_words',
    'any_internal_punct_share',
    'mean_cursor',
    'body_para_mean_len',
    'avg_internal_punct_per_sentence',
    'words_per_second',
    'dashes_count',
    'q_overall_delta',
    'time_per_word',
    'std_word_count',
    'multi_clause_sent_share',
    'spaces_before_comma',
    'double_spaces_after_eos',
    'mean_sentence_len',
    'deb_emb_424',
]

# Use only the top 25 features for training
X_full_top25 = X_full[top25_features_cat_saved]

# ---------------------------------------------------------
# Step 3: Test data preparation
# (done before the CV loop so every fold model can predict on the test set)
# ---------------------------------------------------------
df_test = df_test_combined_new.copy()

X_test = df_test.drop(columns=['id', 'essay_text'], errors='ignore')
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)  # same cleaning as the training features
X_test_top25 = X_test[top25_features_cat_saved]

# ---------------------------------------------------------
# Step 4: Define the best parameters for CatBoost (already saved)
# ---------------------------------------------------------
from catboost.utils import get_gpu_device_count

best_params_cat = {
    'iterations': 1227,
    'depth': 5,
    'learning_rate': 0.0049453015631225694,
    'l2_leaf_reg': 3.9931762727996736e-05,
    'bagging_temperature': 0.123929117769213,
    'loss_function': 'RMSE',
    'random_seed': 42,
    # Use the GPU only when one is actually visible; a hard-coded 'GPU' makes
    # training fail on CPU-only sessions.
    'task_type': 'GPU' if get_gpu_device_count() > 0 else 'CPU',
    'verbose': 100,
    'early_stopping_rounds': 100
}

# ---------------------------------------------------------
# Step 5: KFold cross-validation -> OOF predictions + fold-averaged test predictions
# ---------------------------------------------------------
n_folds = 5
oof_preds_cat = np.zeros(len(X_full_top25))    # one OOF prediction per training row
cat_test_preds = np.zeros(len(X_test_top25))   # mean test prediction over the fold models
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

for train_idx, val_idx in kf.split(X_full_top25):
    X_train, X_val = X_full_top25.iloc[train_idx], X_full_top25.iloc[val_idx]
    y_train, y_val = y_full[train_idx], y_full[val_idx]

    cat_model = CatBoostRegressor(**best_params_cat)

    # NOTE(review): the validation fold is also the early-stopping eval_set, so the OOF
    # score below is probably slightly optimistic.
    cat_model.fit(X_train, y_train, eval_set=(X_val, y_val))

    oof_preds_cat[val_idx] = cat_model.predict(X_val)

    # Average all fold models on the test set. Predicting with only the model left over
    # from the last fold would not match the OOF predictions that the ensemble weights
    # are tuned on.
    cat_test_preds += cat_model.predict(X_test_top25) / n_folds

# ---------------------------------------------------------
# Step 6: Evaluate the CatBoost model with OOF predictions
# ---------------------------------------------------------
cat_rmse = np.sqrt(mean_squared_error(y_full, oof_preds_cat))  # RMSE for the model using OOF predictions
print(f"CatBoost RMSE (OOF): {cat_rmse:.4f}")

'''
# ---------------------------------------------------------
# Step 7: Prepare the Submission
# ---------------------------------------------------------
submission = pd.DataFrame({
    'id': df_test['id'],  # Use the 'id' from the test data
    'score': np.clip(cat_test_preds, 0.5, 6.0)  # Ensure the scores are clipped to a valid range [0.5, 6.0]
})

# Save the submission to a CSV file
submission.to_csv('/kaggle/working/submission.csv', index=False)
print("Submission file generated.")
'''

## Ensemble

In [None]:
from catboost import CatBoostRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, Ridge, PassiveAggressiveRegressor, HuberRegressor, PoissonRegressor, BayesianRidge
from sklearn.preprocessing import MinMaxScaler

from scipy.optimize import minimize
from sklearn.metrics import mean_squared_error

# =======================================================
# ‚úîÔ∏è 1) Two-model OOF matrices: XGB + CatBoost
# =======================================================

# OOF matrix: one row per training sample, one column per model.
subs = np.column_stack([
    oof_3,          # XGBoost OOF predictions
    oof_preds_cat,  # CatBoost OOF predictions
])  # shape: (N, 2)

print("Shape of subs:", subs.shape)

# The XGBoost and CatBoost pipelines read different frames. Blending their predictions
# row by row is only meaningful when both frames list the same rows in the same order,
# so fail loudly instead of silently producing a misaligned blend.
if not np.array_equal(np.asarray(y), np.asarray(y_full)):
    raise ValueError(
        "Training targets of the XGBoost and CatBoost pipelines differ: "
        "their OOF predictions are not row-aligned and cannot be blended."
    )
if 'id' in df_test.columns and not np.array_equal(np.asarray(test_ids), df_test['id'].values):
    raise ValueError(
        "Test ids of the XGBoost and CatBoost pipelines differ: "
        "their test predictions are not row-aligned and cannot be blended."
    )

# =======================================================
# 2) Optimize the 2 blend weights
# =======================================================

def weighted_rmse(weights, subs, y_true):
    """RMSE of the weighted blend `subs @ weights` against `y_true`."""
    ensemble_pred = np.dot(subs, weights)
    return np.sqrt(mean_squared_error(y_true, ensemble_pred))

initial_weights = [0.5, 0.5]

# Weights are each in [0, 1] and must sum to 1.
constraints = {'type': 'eq', 'fun': lambda w: sum(w) - 1}
bounds = [(0, 1), (0, 1)]

opt_result = minimize(
    weighted_rmse,
    initial_weights,
    args=(subs, y),
    method='SLSQP',
    bounds=bounds,
    constraints=constraints
)

if not opt_result.success:
    # Printed rather than warned: warnings are globally silenced in this notebook.
    print("WARNING: weight optimisation did not converge:", opt_result.message)

optimal_weights = opt_result.x

print("Optimal weights:", optimal_weights)
print("Optimized RMSE:", weighted_rmse(optimal_weights, subs, y))


# =======================================================
# 3) Apply ensemble to TEST predictions
# =======================================================

# XGBoost test predictions
test_xgb = test_results_xg_2

# CatBoost test predictions
test_cat = cat_test_preds

# Scores lie in [0.5, 6.0]; clipping keeps every prediction inside the valid range.
ensemble_test = np.clip(
    test_xgb * optimal_weights[0] +
    test_cat * optimal_weights[1],
    0.5,
    6.0,
)

# =======================================================
# 4) Prepare submission
# =======================================================

ensemble_sub = pd.DataFrame({
    'id': test_ids,
    'score': ensemble_test
})

ensemble_sub.to_csv('/kaggle/working/submission.csv', index=False)

# Last expression of the cell, so the submission frame is displayed.
ensemble_sub