In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/deberta-v3-fast-tokenizer/deb-v3/spm.model
/kaggle/input/deberta-v3-fast-tokenizer/deb-v3/config.json
/kaggle/input/deberta-v3-fast-tokenizer/deb-v3/tokenizer.json
/kaggle/input/deberta-v3-fast-tokenizer/deb-v3/tokenizer_config.json
/kaggle/input/deberta-v3-fast-tokenizer/deb-v3/pytorch_model.bin
/kaggle/input/deberta-v3-fast-tokenizer/deb-v3/special_tokens_map.json
/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv
/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv


# 1 Importing Data from Kaggle

In [2]:
# Define paths to CSV files
TRAIN_LOGS   = "/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv"
TRAIN_SCORES = "/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv"
TEST_LOGS    = "/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv"
SAMPLE_SUB   = "/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv"

df_train_logs = pd.read_csv(TRAIN_LOGS)
df_train_scores = pd.read_csv(TRAIN_SCORES)
df_test_logs = pd.read_csv(TEST_LOGS)
df_sample_submission = pd.read_csv(SAMPLE_SUB)

print("Train logs:", TRAIN_LOGS)
print("Train scores:", TRAIN_SCORES)
print("Test logs:", TEST_LOGS)
print("Sample submission:", SAMPLE_SUB)

Train logs: /kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv
Train scores: /kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv
Test logs: /kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv
Sample submission: /kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv


# 2 Data Analysis and Manipulation

## 2.1 Overview

In [3]:
# Create dataframes of the given data
df_train_logs = pd.read_csv(TRAIN_LOGS)
df_train_scores = pd.read_csv(TRAIN_SCORES)
df_test_logs = pd.read_csv(TEST_LOGS)
df_sample_submission = pd.read_csv(SAMPLE_SUB)

Key Observations:
1. Activity column: Two categories of outputs, one is a movement, the other is a particular acttion
2. Text_Change column: Values of interest include ' ' (space) as well as entries containing 'q's

## 2.2 Analysis (Function)

In [4]:
def analyse_data(df_orig):
    """
    Analyzes a dataframe for data quality and structure.
    Prints compact, useful diagnostics ‚Äî avoids unnecessary verbosity.
    """
    df = df_orig.copy()
    print("üìä ANALYSING DATAFRAME\n")

    # 1Ô∏è‚É£ Missing values summary
    na_counts = df.isna().sum()
    total_missing = na_counts.sum()
    if total_missing > 0:
        print(f"üî∏ Missing values detected in {sum(na_counts > 0)} / {len(df.columns)} columns")
        print(na_counts[na_counts > 0].sort_values(ascending=False))
    else:
        print("‚úÖ No missing values found.")
    
    # 2Ô∏è‚É£ Data type consistency check
    print("\nüß© Checking for inconsistent data types...")
    inconsistent_cols = []
    for column in df.columns:
        types = df[column].apply(type)
        majority_type = types.mode()[0]
        anomaly_mask = types != majority_type
        if anomaly_mask.any():
            inconsistent_cols.append(column)
            num_anomalies = anomaly_mask.sum()
            print(f"‚ö†Ô∏è  {column}: {num_anomalies} anomalous entries (expected {majority_type.__name__})")
    if not inconsistent_cols:
        print("‚úÖ All columns have consistent data types.")

    # 3Ô∏è‚É£ Negative numeric values
    numeric_cols = df.select_dtypes(include=["number"])
    neg_mask = (numeric_cols < 0).any()
    neg_cols = neg_mask[neg_mask].index.tolist()
    if neg_cols:
        print(f"\n‚ö†Ô∏è Columns with negative values ({len(neg_cols)}): {neg_cols}")
    else:
        print("\n‚úÖ No negative values in numeric columns.")

    # 4Ô∏è‚É£ Distinct value counts
    nunique = df.nunique()
    print("\nüì¶ Distinct values summary:")
    print(nunique.describe()[['min', 'max']])
    # Only show top 10 most unique columns
    top_unique = nunique.sort_values(ascending=False).head(10)
    print("üîπ Top 10 columns by unique count:")
    print(top_unique)

    # 5Ô∏è‚É£ Sample string columns (only small samples)
    obj_cols = df.select_dtypes(include=["object"]).columns
    if len(obj_cols) > 0:
        print("\nüìù Sample entries from text columns:")
        for col in obj_cols:
            unique_vals = df[col].dropna().unique()
            sample_count = min(len(unique_vals), 5)
            print(f"‚Ä¢ {col}: {unique_vals[:sample_count]}")
    else:
        print("\n‚úÖ No object/string columns found.")

    # ‚úÖ Final summary
    print("\nüìã Summary:")
    print(f"Rows: {df.shape[0]} | Columns: {df.shape[1]}")
    print("Analysis complete.\n")

## 2.3 Transformation (Function)

In [5]:
def transform_data(df_orig):
    """
    Applies transformation steps to activity, event, and text_change columns.
    Prints only one example entry per stage for verification.
    """
    import re
    import numpy as np
    import pandas as pd

    df = df_orig.copy()
    print("üîß Transforming dataset...")

    # ==========================================================
    # 1Ô∏è‚É£ Transform 'activity' column
    # ==========================================================
    def calculate_move_distance(activity):
        move_pattern = r'Move From \[(-?\d+), (-?\d+)\] To \[(-?\d+), (-?\d+)\]'
        match = re.match(move_pattern, activity)
        if match:
            x1, y1, x2, y2 = map(int, match.groups())
            distance = np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
            return f"move_{int(round(distance))}"
        else:
            return activity

    df["activity_trf"] = df["activity"].apply(calculate_move_distance)
    df["activity_trf"] = df["activity_trf"].replace({"Remove/Cut": "Cut"})

    print("\n‚úÖ Sample transformed 'activity_trf':")
    display(df[["activity", "activity_trf"]].head(1))

    # ==========================================================
    # 2Ô∏è‚É£ Transform 'down_event' and 'up_event' columns
    # ==========================================================
    def transform_event(event):
        event_str = str(event)
        if len(event_str) == 1 and event_str.isalnum():
            return "q"
        return event

    df["down_event_trf"] = df["down_event"].apply(transform_event)
    df["up_event_trf"] = df["up_event"].apply(transform_event)

    print("‚úÖ Sample transformed 'down_event' & 'up_event':")
    display(df[["down_event", "down_event_trf", "up_event", "up_event_trf"]].head(1))

    # Quick distinct summary (short)
    distinct_counts = df[["down_event_trf", "up_event_trf"]].nunique()
    print("Distinct transformed event types:")
    print(distinct_counts.to_dict())

    # ==========================================================
    # 3Ô∏è‚É£ Transform 'text_change' column
    # ==========================================================
    def parse_text_change(val):
        val = str(val).replace(" ", "space")
        if "q" not in val:
            return val
        elif "=>" in val:
            before, after = val.split("=>", 1)
            delta = len(after.strip()) - len(before.strip())
            if delta > 0:
                return f"q_add_{delta}"
            elif delta < 0:
                return f"q_subtract_{abs(delta)}"
            else:
                return "q_0"
        else:
            delta = len(val.strip())
            return f"q_add_{delta}" if delta > 0 else "q_0"

    df["text_change_trf"] = df["text_change"].apply(parse_text_change)

    print("\n‚úÖ Sample transformed 'text_change_trf':")
    display(df[["text_change", "text_change_trf"]].head(1))

    # ==========================================================
    # ‚úÖ Final Summary
    # ==========================================================
    print("\nüìã Transformation complete!")
    print(f"Rows: {df.shape[0]} | Columns: {df.shape[1]}")
    print(f"New columns added: activity_trf, down_event_trf, up_event_trf, text_change_trf\n")

    return df

## 2.4 Clean (Function)

In [6]:
def clean_data(df_orig):
    """
    Cleans string-type columns in a DataFrame:
      - Converts text to lowercase
      - Strips leading/trailing spaces
    Prints one example row for verification after cleaning.
    """
    import pandas as pd

    df = df_orig.copy()
    print("üßπ Cleaning data...")

    obj_cols = df.select_dtypes(include=["object"]).columns.tolist()
    if not obj_cols:
        print("‚úÖ No object-type columns found ‚Äî nothing to clean.")
        return df

    # Apply transformations
    for col in obj_cols:
        df[col] = df[col].astype(str).str.lower().str.strip()

    # Show one sample row to confirm cleaning
    print(f"‚úÖ Cleaned {len(obj_cols)} text columns.")
    print("üìã Sample after cleaning:")
    display(df[obj_cols].head(1))

    print(f"Rows: {df.shape[0]} | Columns: {df.shape[1]}\n")
    return df

## 2.5 Aggregation (Function)

In [7]:
def aggregate_data(df_orig):
    """
    Aggregate raw writing process logs into essay-level behavioral features.
    Prints concise progress info and shows one sample entry after each major block.
    """
    import numpy as np
    import pandas as pd

    print("üßÆ Aggregating essay-level behavioral features...")

    # ==========================================================
    # 1Ô∏è‚É£ SORT & GROUP
    # ==========================================================
    df = df_orig.copy()
    df_sorted = df.sort_values(by=["id", "event_id"]).reset_index(drop=True)
    g = df_sorted.groupby("id")
    out = pd.DataFrame()

    print("‚úÖ Data sorted and grouped by 'id'.")
    print(f"Rows: {df_sorted.shape[0]} | Columns: {df_sorted.shape[1]}")

    # ==========================================================
    # 2Ô∏è‚É£ BASIC EVENT FEATURES
    # ==========================================================
    out["total_events"] = g["event_id"].count()
    out["writing_start"] = g["down_time"].min()
    out["writing_end"] = g["up_time"].max()
    out["total_time_spent_on_essay"] = out["writing_end"] - out["writing_start"]
    out["mean_action_time"] = g["action_time"].mean()
    out["sum_action_time"] = g["action_time"].sum()

    print("üïí Computed basic timing and event features.")
    display(out.head(1))

    # ==========================================================
    # 3Ô∏è‚É£ ACTIVITY TRANSFORM (MOVE VS NON-MOVE)
    # ==========================================================
    move_condition = df_sorted['activity_trf'].str.contains("move", case=False, na=False)
    out["non_move_count"] = (~move_condition).groupby(df_sorted['id']).sum()
    out["move_count"] = move_condition.groupby(df_sorted['id']).sum()

    non_move_activity_counts = df_sorted.loc[~move_condition, 'activity_trf'].unique()
    for activity in non_move_activity_counts:
        out[f"count_{activity}"] = g["activity_trf"].apply(lambda x: (x == activity).sum())

    print(f"üß≠ Added move/non-move activity stats ({len(non_move_activity_counts)} activity types).")
    display(out.head(1))

    # ==========================================================
    # 4Ô∏è‚É£ MOVE DISTANCE STATS
    # ==========================================================
    df_sorted["move_distance"] = (
        df_sorted["activity_trf"].str.extract(r"move_(\d+)").astype(float)
    )
    move_distance_stats = g["move_distance"].agg(
        sum_move_distance="sum",
        mean_move_distance="mean"
    )
    out = out.merge(move_distance_stats, on="id", how="left")
    out.loc[out["move_count"] == 0, "mean_move_distance"] = 0
    out.fillna({"sum_move_distance": 0, "mean_move_distance": 0}, inplace=True)

    print("üìè Computed move distance statistics.")
    display(out.head(1))

    # ==========================================================
    # 5Ô∏è‚É£ TEXT CHANGE DYNAMICS
    # ==========================================================
    out["non_q_tc_count"] = g["text_change_trf"].apply(lambda s: (~s.str.startswith("q")).sum())
    out["q_tc_count"] = g["text_change_trf"].apply(lambda s: s.str.startswith("q").sum())

    def parse_q_change(val):
        if isinstance(val, str):
            if val.startswith("q_add_"):
                return int(val.split("_")[-1])
            elif val.startswith("q_subtract_"):
                return -int(val.split("_")[-1])
        return 0

    df_sorted["q_delta"] = df_sorted["text_change_trf"].apply(parse_q_change)
    out["q_overall_delta"] = g["q_delta"].sum()

    print("‚úèÔ∏è Extracted text-change and q-delta features.")
    display(out.head(1))

    # ==========================================================
    # 6Ô∏è‚É£ CURSOR + WORD COUNT STATS
    # ==========================================================
    out["mean_cursor"] = g["cursor_position"].mean()
    out["std_cursor"] = g["cursor_position"].std()
    out["max_cursor"] = g["cursor_position"].max()

    wc_first = g["word_count"].first()
    wc_last = g["word_count"].last()
    out["final_word_count"] = wc_last
    out["max_word_count"] = g["word_count"].max()
    out["min_word_count"] = g["word_count"].min()
    out["std_word_count"] = g["word_count"].std()

    print("üñ±Ô∏è Added cursor and word count stats.")
    display(out.head(1))

    # ==========================================================
    # 7Ô∏è‚É£ DERIVED BEHAVIORAL RATIOS
    # ==========================================================
    out["words_per_event"] = out["final_word_count"] / out["total_events"].clip(lower=1)
    out["words_per_second"] = out["final_word_count"] / out["total_time_spent_on_essay"].clip(lower=1)

    out["edit_intensity"] = (
        out.get("count_cut", 0) + out.get("count_replace", 0) + out.get("count_nonproduction", 0)
    ) / out["total_events"].clip(lower=1)

    out["revision_ratio"] = (
        out.get("count_cut", 0) + out.get("count_replace", 0)
    ) / (out.get("count_input", 1) + 1)

    out["net_char_change_ratio"] = out["q_overall_delta"] / out["final_word_count"].clip(lower=1)
    out["q_activity_ratio"] = (
        (out.get("q_tc_count", 0) + out.get("non_q_tc_count", 0)) / out["total_events"].clip(lower=1)
    )

    out["cursor_movement_intensity"] = out["sum_move_distance"] / out["total_events"].clip(lower=1)
    out["avg_move_distance"] = out.get("mean_move_distance", 0)
    out["word_var_ratio"] = out["std_word_count"] / out["final_word_count"].clip(lower=1)
    out["time_per_word"] = out["total_time_spent_on_essay"] / out["final_word_count"].clip(lower=1)
    out["time_per_event"] = out["total_time_spent_on_essay"] / out["total_events"].clip(lower=1)

    out.replace([np.inf, -np.inf], 0, inplace=True)
    out.fillna(0, inplace=True)

    # Ensure ID is a column, not index
    if out.index.name == "id":
        out.reset_index(inplace=True)

    print("‚öôÔ∏è Derived higher-level behavioral ratios.")
    display(out.head(1))

    # ==========================================================
    # ‚úÖ SUMMARY
    # ==========================================================
    print("\n‚úÖ Aggregation complete!")
    print(f"Final shape: {out.shape[0]} rows √ó {out.shape[1]} columns")

    return out

## 2.6 Merge (Function)

In [8]:
# Overview of train_scores
display(df_train_scores.head())
display(df_train_scores.shape)
display(df_train_scores.dtypes)
# Temporarily set the float format for this block of code
with pd.option_context('display.float_format', lambda x: '%.3f' % x):
    display(df_train_scores.describe())  # This will print with the modified format only for this cell

Unnamed: 0,id,score
0,001519c8,3.5
1,0022f953,3.5
2,0042269b,6.0
3,0059420b,2.0
4,0075873a,4.0


(2471, 2)

id        object
score    float64
dtype: object

Unnamed: 0,score
count,2471.0
mean,3.711
std,1.025
min,0.5
25%,3.0
50%,4.0
75%,4.5
max,6.0


In [9]:
def merge_data(df_train_logs, df_train_scores):
    """
    Merge aggregated essay-level logs with their corresponding essay scores.

    Parameters:
    - df_train_logs: DataFrame containing aggregated essay-level features
    - df_train_scores: DataFrame containing essay IDs and their scores

    Returns:
    - df_train_logs_merged: Merged DataFrame with all features + score column
    """
    import pandas as pd

    if "id" not in df_train_logs.columns:
        df_train_logs = df_train_logs.reset_index()
    
    print("üîó Merging aggregated logs with essay scores...")

    # Defensive copies to avoid accidental mutation
    logs = df_train_logs.copy()
    scores = df_train_scores.copy()

    # Ensure both have the 'id' column
    if "id" not in logs.columns or "id" not in scores.columns:
        raise KeyError("‚ùå Both DataFrames must contain an 'id' column for merging.")

    # Perform merge
    df_train_logs_merged = pd.merge(
        logs,
        scores,
        on="id",
        how="left"
    )

    # Compact verification
    print(f"‚úÖ Merge complete! Final shape: {df_train_logs_merged.shape}")
    print("üìã Sample merged row:")
    display(df_train_logs_merged.head(1))

    # Optional warning for missing scores
    missing_scores = df_train_logs_merged["score"].isna().sum() if "score" in df_train_logs_merged.columns else 0
    if missing_scores > 0:
        print(f"‚ö†Ô∏è Warning: {missing_scores} essays have missing scores after merge.")

    return df_train_logs_merged

## 2.7 Output df_train_agg_logs

In [10]:
# ==========================================================
#  STEP 1 ‚Üí ANALYSE RAW DATA
# ==========================================================
print("Step 1: Analysing data...")
df_train_logs_analysis = df_train_logs.copy()
analyse_data(df_train_logs_analysis)
print(f"‚Üí Shape after Step 1: {df_train_logs_analysis.shape}")

# ==========================================================
#  STEP 2 ‚Üí TRANSFORM DATA
# ==========================================================
print("\nStep 2: Transforming columns...")
df_train_logs_transformed = transform_data(df_train_logs_analysis.copy())
print(f"‚Üí Shape after Step 2: {df_train_logs_transformed.shape}")

# ==========================================================
#  STEP 3 ‚Üí CLEAN DATA
# ==========================================================
print("\nStep 3: Cleaning data...")
df_train_logs_cleaned = clean_data(df_train_logs_transformed.copy())
print(f"‚Üí Shape after Step 3: {df_train_logs_cleaned.shape}")

# ==========================================================
#  STEP 4 ‚Üí AGGREGATE EVENT-LEVEL FEATURES (Essay-Level)
# ==========================================================
print("\nStep 4: Aggregating event-level features...")
df_train_logs_essaylevel = aggregate_data(df_train_logs_cleaned.copy())
print(f"‚Üí Shape after Step 4: {df_train_logs_essaylevel.shape}")

# ==========================================================
#  STEP 5 ‚Üí MERGE WITH SCORES (Final Aggregated Dataset)
# ==========================================================
print("\nStep 5: Merging with scores...")
df_train_agg_logs = merge_data(df_train_logs_essaylevel.copy(), df_train_scores.copy())
print(f"‚Üí Shape after Step 5: {df_train_agg_logs.shape}")

# ==========================================================
#  FINAL SUMMARY
# ==========================================================
print("\n‚úÖ Preprocessing pipeline completed successfully!")
print(f"Final dataset shape: {df_train_agg_logs.shape}")
display(df_train_agg_logs.head(3))

Step 1: Analysing data...
üìä ANALYSING DATAFRAME

‚úÖ No missing values found.

üß© Checking for inconsistent data types...
‚úÖ All columns have consistent data types.

‚úÖ No negative values in numeric columns.

üì¶ Distinct values summary:
min         50.0
max    1836078.0
dtype: float64
üîπ Top 10 columns by unique count:
down_time          1836078
up_time            1835993
event_id             12876
cursor_position       7803
text_change           4111
action_time           3509
id                    2471
word_count            1327
down_event             131
up_event               130
dtype: int64

üìù Sample entries from text columns:
‚Ä¢ id: ['001519c8' '0022f953' '0042269b' '0059420b' '0075873a']
‚Ä¢ activity: ['Nonproduction' 'Input' 'Remove/Cut' 'Replace'
 'Move From [284, 292] To [282, 290]']
‚Ä¢ down_event: ['Leftclick' 'Shift' 'q' 'Space' 'Backspace']
‚Ä¢ up_event: ['Leftclick' 'Shift' 'q' 'Space' 'Backspace']
‚Ä¢ text_change: ['NoChange' 'q' ' ' '.' ',']

üìã Summa

Unnamed: 0,activity,activity_trf
0,Nonproduction,Nonproduction


‚úÖ Sample transformed 'down_event' & 'up_event':


Unnamed: 0,down_event,down_event_trf,up_event,up_event_trf
0,Leftclick,Leftclick,Leftclick,Leftclick


Distinct transformed event types:
{'down_event_trf': 94, 'up_event_trf': 94}

‚úÖ Sample transformed 'text_change_trf':


Unnamed: 0,text_change,text_change_trf
0,NoChange,NoChange



üìã Transformation complete!
Rows: 8405898 | Columns: 15
New columns added: activity_trf, down_event_trf, up_event_trf, text_change_trf

‚Üí Shape after Step 2: (8405898, 15)

Step 3: Cleaning data...
üßπ Cleaning data...
‚úÖ Cleaned 9 text columns.
üìã Sample after cleaning:


Unnamed: 0,id,activity,down_event,up_event,text_change,activity_trf,down_event_trf,up_event_trf,text_change_trf
0,001519c8,nonproduction,leftclick,leftclick,nochange,nonproduction,leftclick,leftclick,nochange


Rows: 8405898 | Columns: 15

‚Üí Shape after Step 3: (8405898, 15)

Step 4: Aggregating event-level features...
üßÆ Aggregating essay-level behavioral features...
‚úÖ Data sorted and grouped by 'id'.
Rows: 8405898 | Columns: 15
üïí Computed basic timing and event features.


Unnamed: 0_level_0,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
001519c8,2557,4526,1801969,1797443,116.246774,297243


üß≠ Added move/non-move activity stats (5 activity types).


Unnamed: 0_level_0,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_nonproduction,count_input,count_cut,count_replace,count_paste
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
001519c8,2557,4526,1801969,1797443,116.246774,297243,2554,3,120,2010,417,7,0


üìè Computed move distance statistics.


Unnamed: 0_level_0,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_nonproduction,count_input,count_cut,count_replace,count_paste,sum_move_distance,mean_move_distance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
001519c8,2557,4526,1801969,1797443,116.246774,297243,2554,3,120,2010,417,7,0,13.0,4.333333


‚úèÔ∏è Extracted text-change and q-delta features.


Unnamed: 0_level_0,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_nonproduction,count_input,count_cut,count_replace,count_paste,sum_move_distance,mean_move_distance,non_q_tc_count,q_tc_count,q_overall_delta
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
001519c8,2557,4526,1801969,1797443,116.246774,297243,2554,3,120,2010,417,7,0,13.0,4.333333,608,1949,1849


üñ±Ô∏è Added cursor and word count stats.


Unnamed: 0_level_0,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_nonproduction,count_input,...,non_q_tc_count,q_tc_count,q_overall_delta,mean_cursor,std_cursor,max_cursor,final_word_count,max_word_count,min_word_count,std_word_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001519c8,2557,4526,1801969,1797443,116.246774,297243,2554,3,120,2010,...,608,1949,1849,711.163473,439.359619,1539,255,256,0,76.498372


‚öôÔ∏è Derived higher-level behavioral ratios.


Unnamed: 0,id,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_nonproduction,...,words_per_second,edit_intensity,revision_ratio,net_char_change_ratio,q_activity_ratio,cursor_movement_intensity,avg_move_distance,word_var_ratio,time_per_word,time_per_event
0,001519c8,2557,4526,1801969,1797443,116.246774,297243,2554,3,120,...,0.000142,0.212749,0.21084,7.25098,1.0,0.005084,4.333333,0.299994,7048.796078,702.949941



‚úÖ Aggregation complete!
Final shape: 2471 rows √ó 37 columns
‚Üí Shape after Step 4: (2471, 37)

Step 5: Merging with scores...
üîó Merging aggregated logs with essay scores...
‚úÖ Merge complete! Final shape: (2471, 38)
üìã Sample merged row:


Unnamed: 0,id,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_nonproduction,...,edit_intensity,revision_ratio,net_char_change_ratio,q_activity_ratio,cursor_movement_intensity,avg_move_distance,word_var_ratio,time_per_word,time_per_event,score
0,001519c8,2557,4526,1801969,1797443,116.246774,297243,2554,3,120,...,0.212749,0.21084,7.25098,1.0,0.005084,4.333333,0.299994,7048.796078,702.949941,3.5


‚Üí Shape after Step 5: (2471, 38)

‚úÖ Preprocessing pipeline completed successfully!
Final dataset shape: (2471, 38)


Unnamed: 0,id,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_nonproduction,...,edit_intensity,revision_ratio,net_char_change_ratio,q_activity_ratio,cursor_movement_intensity,avg_move_distance,word_var_ratio,time_per_word,time_per_event,score
0,001519c8,2557,4526,1801969,1797443,116.246774,297243,2554,3,120,...,0.212749,0.21084,7.25098,1.0,0.005084,4.333333,0.299994,7048.796078,702.949941,3.5
1,0022f953,2454,30623,1788969,1758346,112.221271,275391,2454,0,254,...,0.209861,0.134605,5.290625,1.0,0.0,0.0,0.30551,5494.83125,716.522412,3.5
2,0042269b,4136,4441,1771669,1767228,101.837766,421201,4136,0,175,...,0.150145,0.126849,7.683168,1.0,0.0,0.0,0.269641,4374.326733,427.279497,6.0


# 3 Essay Reconstruction

## 3.1 Reconstruction (Function)

In [11]:
import textwrap
from tqdm import tqdm
import pandas as pd

def getEssays(df, show_first=True):
    """
    Reconstructs full essay texts from event-level logs.
    Returns a Series indexed by essay IDs.
    """
    text_df = df[['id', 'activity', 'cursor_position', 'text_change']].copy()
    text_df = text_df[text_df.activity != 'Nonproduction']
    grouped = text_df.groupby('id', sort=False)

    essays = {}

    print(f"üß† Reconstructing {len(grouped)} essays...")
    for essay_id, group in tqdm(grouped, total=len(grouped), desc="Processing essays"):
        essay_text = ""
        group = group[['activity', 'cursor_position', 'text_change']].values

        for activity, cursor_pos, text_change in group:
            if activity == 'Replace':
                before, after = text_change.split(' => ')
                essay_text = essay_text[:cursor_pos - len(after)] + after + essay_text[cursor_pos - len(after) + len(before):]
                continue
            if activity == 'Paste':
                essay_text = essay_text[:cursor_pos - len(text_change)] + text_change + essay_text[cursor_pos - len(text_change):]
                continue
            if activity == 'Remove/Cut':
                essay_text = essay_text[:cursor_pos] + essay_text[cursor_pos + len(text_change):]
                continue
            if "Move" in activity:
                cropped = activity[10:]
                start, end = [seg.split(', ') for seg in cropped.split(' To ')]
                move_data = (int(start[0][1:]), int(start[1][:-1]),
                             int(end[0][1:]), int(end[1][:-1]))
                if move_data[0] != move_data[2]:
                    if move_data[0] < move_data[2]:
                        essay_text = essay_text[:move_data[0]] + essay_text[move_data[1]:move_data[3]] + essay_text[move_data[0]:move_data[1]] + essay_text[move_data[3]:]
                    else:
                        essay_text = essay_text[:move_data[2]] + essay_text[move_data[0]:move_data[1]] + essay_text[move_data[2]:move_data[0]] + essay_text[move_data[1]:]
                continue
            essay_text = essay_text[:cursor_pos - len(text_change)] + text_change + essay_text[cursor_pos - len(text_change):]

        essays[essay_id] = essay_text

    essays_series = pd.Series(essays, name='essay_text')

    # ‚úÖ Show only the first essay's text
    if show_first and not essays_series.empty:
        first_id = essays_series.index[0]
        print(f"\nüìù First reconstructed essay (ID: {first_id}):\n")
        print(textwrap.fill(essays_series.iloc[0][:1000], width=100))
        print("\n-----------------------------------------------\n")

    return essays_series

## 3.2 Derivation (Function)

In [12]:
import re, math, numpy as np, pandas as pd

def enrich_full_text_features_parallel(df, show_preview=True):
    """
    Parallelized full essay feature extractor (~84 features total).
    Combines every linguistic, structural, and punctuation-based feature
    from your raw pipeline into a single efficient parallelized pass.

    Requires: swifter (optional, auto-fallback if not installed)
    """

    # ---------- Safe import of swifter ----------
    try:
        import swifter
        use_swifter = True
        print("‚ö° Using swifter for parallel processing")
    except ImportError:
        use_swifter = False
        print("‚ÑπÔ∏è swifter not installed ‚Äî using normal .apply() (slower)")

    df = df.copy()

    # ---------- Inner per-essay feature extractor ----------
    def _extract_features(text: str):
        text = str(text)
        sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
        n_sent = len(sentences)
        paragraphs = [p.strip() for p in re.split(r"(?:\r?\n\s*\r?\n)+", text.strip()) if p.strip()]
        n_par = len(paragraphs)
        words = re.findall(r"\b[a-zA-Z]+\b", text)
        n_words = len(words)

        per100_tokens = lambda n: (n / n_words * 100.0) if n_words > 0 else 0.0
        per100_sents  = lambda n: (n / n_sent * 100.0) if n_sent > 0 else 0.0
        tok_count = lambda s: len(re.findall(r"\b[a-zA-Z]+\b", s))
        internal_punct_count = lambda s: s.count(",") + s.count(";") + s.count(":")

        # ---------- Basic counts ----------
        num_words, num_sentences, num_paragraphs = n_words, n_sent, n_par

        if n_sent:
            lengths = [tok_count(s) for s in sentences]
            mean_sentence_len = np.mean(lengths)
            std_sentence_len  = np.std(lengths)
            cv_sentence_len   = std_sentence_len / mean_sentence_len if mean_sentence_len > 0 else 0
            short_sent_share  = np.mean(np.array(lengths) <= 5)
            long_sent_share   = np.mean(np.array(lengths) >= 20)
        else:
            mean_sentence_len = std_sentence_len = cv_sentence_len = short_sent_share = long_sent_share = 0.0

        # ---------- Paragraph structure ----------
        if n_par:
            sent_per_para = [len([s for s in re.split(r"[.!?]+", p) if s.strip()]) for p in paragraphs]
            word_counts = [tok_count(p) for p in paragraphs]
            avg_sent_per_para = np.mean(sent_per_para)
            var_sent_per_para = np.var(sent_per_para)
            intro_para_len = word_counts[0]
            body_para_mean_len = np.mean(word_counts[1:-1]) if n_par > 2 else 0
            conclusion_para_len = word_counts[-1] if n_par > 1 else 0
        else:
            avg_sent_per_para = var_sent_per_para = intro_para_len = body_para_mean_len = conclusion_para_len = 0.0

        # ---------- Comma density ----------
        num_commas = text.count(",")
        commas_per_sentence = num_commas / n_sent if n_sent else 0
        commas_per_100_words = per100_tokens(num_commas)
        multi_clause_sent_share = np.mean([s.count(",") >= 2 for s in sentences]) if n_sent else 0

        # ---------- Semicolon / colon ----------
        num_semis, num_colons = text.count(";"), text.count(":")
        semicolons_per_100_tokens = per100_tokens(num_semis)
        colons_per_100_tokens = per100_tokens(num_colons)
        share_sents_with_semicolon = (sum(";" in s for s in sentences) / n_sent) if n_sent else 0
        share_sents_with_colon = (sum(":" in s for s in sentences) / n_sent) if n_sent else 0

        # ---------- Parentheses / quotes / dashes ----------
        SINGLE_QUOTES = ["'", "‚Äò", "‚Äô", "‚Äö", "‚Äõ"]
        DOUBLE_QUOTES = ['"', "‚Äú", "‚Äù", "‚Äû", "‚Äü"]
        DASHES = ["-", "‚Äì", "‚Äî"]
        left_paren, right_paren = text.count("("), text.count(")")
        parentheses = left_paren + right_paren
        single_q = sum(text.count(ch) for ch in SINGLE_QUOTES)
        double_q = sum(text.count(ch) for ch in DOUBLE_QUOTES)
        dashes = sum(text.count(ch) for ch in DASHES)
        counts = [parentheses, single_q, double_q, dashes]
        total = sum(counts)
        if total:
            p = [c / total for c in counts if c > 0]
            H = -sum(pi * math.log(pi, 2) for pi in p)
            H_norm = H / math.log(4, 2)
        else:
            H = H_norm = 0.0

        # ---------- Mechanics consistency ----------
        unmatched_parens_open = max(0, left_paren - right_paren)
        unmatched_parens_close = max(0, right_paren - left_paren)
        mismatched_parens_total = unmatched_parens_open + unmatched_parens_close

        text_no_apos = re.sub(r"(?<=\w)[\'‚Äô](?=\w)", "", text)
        straight_single = text_no_apos.count("'")
        straight_double = text_no_apos.count('"')
        unmatched_straight_single = straight_single % 2
        unmatched_straight_double = straight_double % 2
        left_single = text_no_apos.count("‚Äò")
        right_single = text_no_apos.count("‚Äô")
        left_double = text_no_apos.count("‚Äú")
        right_double = text_no_apos.count("‚Äù")
        mismatched_curly_single = abs(left_single - right_single)
        mismatched_curly_double = abs(left_double - right_double)
        mismatched_quotes_total = unmatched_straight_single + unmatched_straight_double + mismatched_curly_single + mismatched_curly_double

        def count_repeats(ch): return len(re.findall(re.escape(ch) + r"{2,}", text))
        repeated_commas = count_repeats(",")
        repeated_periods = len(re.findall(r"\.{2,}", text))
        repeated_semis = count_repeats(";")
        repeated_colons = count_repeats(":")
        repeated_qmarks = count_repeats(r"\?")
        repeated_exclaims = count_repeats("!")
        repeated_dashes = sum(count_repeats(ch) for ch in DASHES)
        repeated_punct_sequences_total = (
            repeated_commas + repeated_periods + repeated_semis + repeated_colons +
            repeated_qmarks + repeated_exclaims + repeated_dashes
        )
        repeated_punct_sequences_per_100_tokens = per100_tokens(repeated_punct_sequences_total)
        spaces_before_comma = len(re.findall(r"\s+,", text))
        spaces_before_punct_total = len(re.findall(r"\s+[,\.;:\?\!)]", text))
        spaces_before_punct_per_100_tokens = per100_tokens(spaces_before_punct_total)
        double_spaces_after_eos = len(re.findall(r"[.!?]\s{2,}", text))
        double_spaces_after_eos_per_100_sentences = per100_sents(double_spaces_after_eos)

        # ---------- Multi-clause proxy ----------
        if n_sent:
            counts_int = [internal_punct_count(s) for s in sentences]
            multi_clause_proxy_share = np.mean(np.array(counts_int) >= 2)
            any_internal_punct_share = np.mean(np.array(counts_int) >= 1)
            avg_internal_punct_per_sentence = np.mean(counts_int)
        else:
            multi_clause_proxy_share = any_internal_punct_share = avg_internal_punct_per_sentence = 0.0

        # ---------- Rhythm variety ----------
        if n_sent:
            sent_lengths = np.array([tok_count(s) for s in sentences], dtype=float)
            mean_len = sent_lengths.mean()
            std_len  = sent_lengths.std(ddof=0)
            cv_global = std_len / mean_len if mean_len > 0 else 0
            WINDOW = 5
            if n_sent < WINDOW:
                cvs = [cv_global]
            else:
                cvs = [(sent_lengths[i:i+WINDOW].std(ddof=0) /
                        sent_lengths[i:i+WINDOW].mean()) if sent_lengths[i:i+WINDOW].mean() > 0 else 0
                        for i in range(n_sent - WINDOW + 1)]
            cvs = np.array(cvs)
            cv_mw_mean   = cvs.mean()   if cvs.size else 0
            cv_mw_median = np.median(cvs) if cvs.size else 0
            cv_mw_max    = cvs.max()    if cvs.size else 0
            cv_mw_std    = cvs.std(ddof=0) if cvs.size else 0
        else:
            mean_len = std_len = cv_global = cv_mw_mean = cv_mw_median = cv_mw_max = cv_mw_std = 0.0

        # ---------- Local continuity / segmentation ----------
        para_sents = [[s.strip() for s in re.split(r"[.!?]+", p) if s.strip()] for p in paragraphs]
        if n_par == 0:
            single_sentence_paragraph_ratio = bridge_sentence_share = bridge_sentences_per_100_sentences = \
            heavy_internal_punct_sentence_share = heavy_at_paragraph_edges_share = heavy_sentence_mean_normalized_position = \
            semicolon_sentence_share = semicolon_at_paragraph_edges_share = colon_sentence_share = colon_at_paragraph_edges_share = 0.0
        else:
            single_sentence_paragraph_ratio = sum(len(ps) == 1 for ps in para_sents) / n_par
            all_sents = [s for ps in para_sents for s in ps]
            n_sent_total = len(all_sents)
            sent_lengths_all = [tok_count(s) for s in all_sents]
            bridge_flags = np.array(sent_lengths_all) <= 5
            bridge_sentence_share = bridge_flags.mean() if bridge_flags.size else 0
            bridge_sentences_per_100_sentences = bridge_sentence_share * 100
            heavy_flags = [internal_punct_count(s) >= 2 or ";" in s or ":" in s for s in all_sents]
            heavy_internal_punct_sentence_share = np.mean(heavy_flags) if n_sent_total else 0
            sent_meta = [(p_idx, i, len(ps)) for p_idx, ps in enumerate(para_sents) for i, _ in enumerate(ps)]
            heavy_idx = [i for i, h in enumerate(heavy_flags) if h]
            heavy_edges = sum(1 for gi in heavy_idx if sent_meta[gi][1] in (0, sent_meta[gi][2]-1))
            heavy_at_paragraph_edges_share = heavy_edges / len(heavy_idx) if heavy_idx else 0
            heavy_sentence_mean_normalized_position = np.mean([i/(n_sent_total-1) for i,h in enumerate(heavy_flags) if h]) if n_sent_total>1 else 0
            semi_flags = [";" in s for s in all_sents]
            colon_flags = [":" in s for s in all_sents]
            def edge_share(mask):
                idxs = [i for i,f in enumerate(mask) if f]
                return sum(1 for gi in idxs if sent_meta[gi][1] in (0, sent_meta[gi][2]-1)) / len(idxs) if idxs else 0
            semicolon_sentence_share = np.mean(semi_flags) if n_sent_total else 0
            semicolon_at_paragraph_edges_share = edge_share(semi_flags)
            colon_sentence_share = np.mean(colon_flags) if n_sent_total else 0
            colon_at_paragraph_edges_share = edge_share(colon_flags)

        # ---------- List / explanation patterns ----------
        colon_sents = [s for s in sentences if ":" in s]
        n_colon = len(colon_sents)
        if n_sent == 0:
            colon_sentence_share2 = list_like_all = list_like_among = semi_tail_share = avg_trailing = \
            items_mean = items_median = items_max = items_ge3 = 0.0
        else:
            colon_sentence_share2 = n_colon / n_sent
            list_like_flags, semi_tail_flags, trailing_counts, items_counts = [], [], [], []
            for s in colon_sents:
                _, tail = s.split(":", 1)
                commas, semis = tail.count(","), tail.count(";")
                total_internal = commas + semis
                trailing_counts.append(total_internal)
                semi_tail_flags.append(semis > 0)
                list_like_flags.append(total_internal >= 2)
                segments = [seg.strip() for seg in re.split(r"[;,]", tail)]
                items = [seg for seg in segments if re.search(r"\b[a-zA-Z]+\b", seg)]
                items_counts.append(len(items))
            list_like_all   = np.mean(list_like_flags) if n_sent else 0
            list_like_among = np.mean(list_like_flags) if n_colon else 0
            semi_tail_share = np.mean(semi_tail_flags) if n_colon else 0
            avg_trailing = np.mean(trailing_counts) if trailing_counts else 0
            items_mean   = np.mean(items_counts) if items_counts else 0
            items_median = np.median(items_counts) if items_counts else 0
            items_max    = np.max(items_counts) if items_counts else 0
            items_ge3    = np.mean(np.array(items_counts) >= 3) if items_counts else 0

        return pd.Series([
            num_words,num_sentences,num_paragraphs,
            mean_sentence_len,std_sentence_len,cv_sentence_len,short_sent_share,long_sent_share,
            avg_sent_per_para,var_sent_per_para,intro_para_len,body_para_mean_len,conclusion_para_len,
            commas_per_sentence,commas_per_100_words,multi_clause_sent_share,
            semicolons_per_100_tokens,colons_per_100_tokens,share_sents_with_semicolon,share_sents_with_colon,
            parentheses,left_paren,right_paren,single_q,double_q,dashes,H,H_norm,
            unmatched_parens_open,unmatched_parens_close,mismatched_parens_total,
            unmatched_straight_single,unmatched_straight_double,mismatched_curly_single,mismatched_curly_double,mismatched_quotes_total,
            repeated_commas,repeated_periods,repeated_semis,repeated_colons,repeated_qmarks,repeated_exclaims,repeated_dashes,
            repeated_punct_sequences_total,repeated_punct_sequences_per_100_tokens,
            spaces_before_comma,spaces_before_punct_total,spaces_before_punct_per_100_tokens,
            double_spaces_after_eos,double_spaces_after_eos_per_100_sentences,
            multi_clause_proxy_share,any_internal_punct_share,avg_internal_punct_per_sentence,
            mean_len,std_len,cv_global,cv_mw_mean,cv_mw_median,cv_mw_max,cv_mw_std,
            num_paragraphs,single_sentence_paragraph_ratio,bridge_sentence_share,bridge_sentences_per_100_sentences,
            heavy_internal_punct_sentence_share,heavy_at_paragraph_edges_share,heavy_sentence_mean_normalized_position,
            semicolon_sentence_share,semicolon_at_paragraph_edges_share,colon_sentence_share,colon_at_paragraph_edges_share,
            colon_sentence_share2,list_like_all,list_like_among,semi_tail_share,avg_trailing,
            items_mean,items_median,items_max,items_ge3
        ])

    # ---------- Column names ----------
    cols = [
        'num_words','num_sentences','num_paragraphs',
        'mean_sentence_len','std_sentence_len','cv_sentence_len','short_sent_share','long_sent_share',
        'avg_sent_per_para','var_sent_per_para','intro_para_len','body_para_mean_len','conclusion_para_len',
        'commas_per_sentence','commas_per_100_words','multi_clause_sent_share',
        'semicolons_per_100_tokens','colons_per_100_tokens','share_sents_with_semicolon','share_sents_with_colon',
        'parentheses_count','left_parentheses_count','right_parentheses_count','single_quotes_count','double_quotes_count','dashes_count','punct_diversity_shannon','punct_diversity_shannon_norm',
        'unmatched_parens_open','unmatched_parens_close','mismatched_parens_total',
        'unmatched_quotes_straight_single','unmatched_quotes_straight_double','mismatched_quotes_curly_single','mismatched_quotes_curly_double','mismatched_quotes_total',
        'repeated_commas_seq','repeated_periods_seq','repeated_semicolons_seq','repeated_colons_seq','repeated_qmarks_seq','repeated_exclaims_seq','repeated_dashes_seq',
        'repeated_punct_sequences_total','repeated_punct_sequences_per_100_tokens',
        'spaces_before_comma','spaces_before_punct_total','spaces_before_punct_per_100_tokens',
        'double_spaces_after_eos','double_spaces_after_eos_per_100_sentences',
        'multi_clause_proxy_share','any_internal_punct_share','avg_internal_punct_per_sentence',
        'sent_len_tokens_mean','sent_len_tokens_std','sent_len_tokens_cv_global',
        'sent_len_tokens_cv_mw_mean','sent_len_tokens_cv_mw_median','sent_len_tokens_cv_mw_max','sent_len_tokens_cv_mw_std',
        'num_paragraphs','single_sentence_paragraph_ratio','bridge_sentence_share','bridge_sentences_per_100_sentences',
        'heavy_internal_punct_sentence_share','heavy_at_paragraph_edges_share','heavy_sentence_mean_normalized_position',
        'semicolon_sentence_share','semicolon_at_paragraph_edges_share','colon_sentence_share','colon_at_paragraph_edges_share',
        'colon_sentence_share_2','list_like_colon_sentence_share_all','list_like_colon_sentence_share_among_colon',
        'semicolon_in_tail_share_among_colon','avg_trailing_commas_semis_per_colon_sent',
        'items_after_colon_mean','items_after_colon_median','items_after_colon_max','items_ge3_share_among_colon'
    ]

    # ---------- Parallel apply ----------
    if use_swifter:
        feature_df = df["essay_text"].swifter.progress_bar(True).apply(_extract_features)
    else:
        feature_df = df["essay_text"].apply(_extract_features)

    feature_df.columns = cols
    df_out = pd.concat([df.reset_index(drop=True), feature_df], axis=1)

    if show_preview:
        print(f"‚úÖ Feature enrichment complete: {len(cols)} new columns added for {len(df_out)} essays.")
        display(df_out.head(2)[['num_words','mean_sentence_len','multi_clause_proxy_share','items_after_colon_mean']])

    return df_out

## 3.3 Output df_train_recon_logs

In [13]:
## !pip install swifter --quiet

In [14]:
# ==========================================================
# üöÄ FULL ESSAY RECONSTRUCTION + FEATURE ENRICHMENT PIPELINE
# ==========================================================

# ‚úÖ Step 0: Start from original logs
df_train_logs_copy = df_train_logs.copy()
print("üìò Step 0: Original df_train_logs shape:", df_train_logs_copy.shape)

# ‚úÖ Step 1: Essay reconstruction
df_train_recon_logs_raw = getEssays(df_train_logs_copy.copy()).to_frame(name='essay_text')
df_train_recon_logs_raw.index.name = 'id'
df_train_recon_logs_raw.reset_index(inplace=True)  # ensure 'id' is a proper column
print("‚úÖ Step 1: Essays reconstructed ‚Äî shape:", df_train_recon_logs_raw.shape)

# ==========================================================
# ‚úÖ Step 2: Unified feature enrichment (parallelized)
# ==========================================================
print("‚öôÔ∏è Step 2: Extracting full linguistic + structural + mechanics features (parallelized)...")
df_train_recon_logs = enrich_full_text_features_parallel(df_train_recon_logs_raw.copy(), show_preview=False)
print("‚úÖ Step 2: Feature enrichment complete ‚Äî shape:", df_train_recon_logs.shape)

# ==========================================================
# ‚úÖ Step 3: Check for duplicate columns
# ==========================================================
dupes = df_train_recon_logs.columns[df_train_recon_logs.columns.duplicated()]

if len(dupes) > 0:
    from collections import Counter
    dupe_counts = Counter(dupes)
    print(f"\n‚ö†Ô∏è Found {len(dupe_counts)} duplicate column names:")
    for name, count in list(dupe_counts.items())[:15]:
        print(f"   üß© {name} ‚Üí appears {count} times")
    if len(dupe_counts) > 15:
        print("   ... (truncated)")
    
    # Drop duplicates (keep first occurrence)
    before = df_train_recon_logs.shape[1]
    df_train_recon_logs = df_train_recon_logs.loc[:, ~df_train_recon_logs.columns.duplicated()]
    after = df_train_recon_logs.shape[1]
    print(f"üßπ Removed {before - after} duplicate columns. Final shape: {df_train_recon_logs.shape}")
else:
    print("\n‚úÖ No duplicate columns detected in df_train_recon_logs.")

# ==========================================================
# ‚úÖ SUMMARY
# ==========================================================
print("\nüéØ Pipeline complete! Final dataset ‚Üí df_train_recon_logs")
print(f"üß© Step 0: df_train_logs_copy shape: {df_train_logs_copy.shape}")
print(f"üß© Step 1: df_train_recon_logs_raw shape: {df_train_recon_logs_raw.shape}")
print(f"üß© Step 2: df_train_recon_logs (final) shape: {df_train_recon_logs.shape}")

# üß† Sanity check
print("üß† Total essays:", df_train_recon_logs.shape[0])
print("üß© Total new features:", df_train_recon_logs.shape[1] - 2)  # exclude id + essay_text

# ‚úÖ Optional preview
display(df_train_recon_logs.head(2)[['id', 'essay_text'] + df_train_recon_logs.columns[2:12].tolist()])

# (Optional) Save for reuse
# df_train_recon_logs.to_csv("/kaggle/working/df_train_recon_logs.csv", index=False)
# print("üíæ Saved df_train_recon_logs.csv")

üìò Step 0: Original df_train_logs shape: (8405898, 11)
üß† Reconstructing 2471 essays...


Processing essays: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2471/2471 [00:10<00:00, 242.43it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



üìù First reconstructed essay (ID: 001519c8):

qqqqqqqqq qq qqqqq qq qqqq qqqq.  qqqqqq qqq qqqq qqqqqq qq qq qqqqq qq qqqq qqqqq qq qqqqqqqqq
qqqqq qqqq qqqqq qqq qqqqqqqqq qqqqqqqqq qqqq.  qqqqqq qqq qqqqq qqq qqqqqqqqqqq qq qqq qqqqqqqqqq
qqqqq, qqq qqqqq qqqqqq qq qq qqqq qqq qqqqqq qqqqqqq qq qqq qqqqqqqqqqq.  qqqqqqqq qq qqqqqqqqqq
qqqq qqqq qqqqqqqqq qqq qqqqqqq qq qqqqqq qqqq qqq qqq qq qqqqqqqqq qq qq qqq qqqqq qqqqq qq qqq.
qq qq qqqq qqqq qqq qqqqqqqqq qqq qqqqqqq qq qqq qqqqq qqqqq, qq qq qqqqqq qqq qqq qqqqqqqq qqqqq qq
qqq qqqqqqqqqqq qq qqqqqqqqq.  qqqqqqqqq qq qqq qqqqqqqq qqqq qq qqqq qq qqqqqqq qqqqq qqqqq, qqq
qqqqqq qqqqq qqqqq qqq qqq qq qqq qqqqqqq qqqqqqq qqqq.  qqqq qqqqq qqqqq qqqq qqqq'qq qqqqq
qqqqqqqqq qqqqq qqqqqqq qqqqqqq qqqqqqqqqq, qqqq qq qqqqqqqqqq qqqqqqq qqq qqqqqqq; qqqqqqq, qqqqq
qqqqqqqq qqqqqq qqqqqqq qqqqqqq qqq qqqqq qqq qqq qqq qqqqqqq.  qqqq qqqqqqqqq qqqq qqq qqqq qqqq
qqqqq qqqqqqqqqq qqqq qqqqq qqqqq.  qqq qqqqqqqqqq qq qqqqqqqq q qqqqqq

Unnamed: 0,id,essay_text,num_words,num_sentences,num_paragraphs,mean_sentence_len,std_sentence_len,cv_sentence_len,short_sent_share,long_sent_share,avg_sent_per_para,var_sent_per_para
0,001519c8,qqqqqqqqq qq qqqqq qq qqqq qqqq. qqqqqq qqq q...,257.0,14.0,3.0,18.357143,6.387568,0.347961,0.0,0.571429,4.666667,0.888889
1,0022f953,"qqqq qq qqqqqqqqqqq ? qq qq qqq qqq qqq, qqqqq...",324.0,15.0,1.0,21.6,12.408599,0.574472,0.066667,0.533333,15.0,0.0


# 4 DeBERTa Metrics

## 4.1 DeBERTa Obtain Embeddings (Function)

In [15]:
import os, torch, numpy as np, pandas as pd
from transformers import AutoTokenizer, AutoModel
from math import ceil
from pathlib import Path

def add_deberta_embeddings(
    df,
    model_dir="/kaggle/input/deberta-v3-fast-tokenizer/deb-v3",
    text_col="essay_text",
    id_col="id",
    max_len=256,
    batch_size=8,
    show_preview=True
):
    """
    Adds mean-pooled DeBERTa-v3 embeddings as new columns (deb_emb_0 ... deb_emb_767)
    to a DataFrame containing essays.

    Internal behavior:
      - Replaces all standalone 'q' tokens with 'i' *only for embedding computation*
      - Original text in df is NOT modified or returned altered
    """

    assert {id_col, text_col}.issubset(df.columns), f"Missing {id_col} or {text_col}"
    df = df.copy()  # prevent in-place mutation

    # =====================
    # ‚öôÔ∏è Setup
    # =====================
    os.environ["HF_HUB_OFFLINE"] = "1"
    os.environ["TRANSFORMERS_OFFLINE"] = "1"

    print(f"\nüîß Loading DeBERTa model from: {model_dir}")
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    print("üíª Device:", DEVICE)

    tok = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
    model = AutoModel.from_pretrained(model_dir, local_files_only=True).to(DEVICE).eval()

    # =====================
    # üß† Internal helper: q ‚Üí i replacement
    # =====================
    @torch.inference_mode()
    def preprocess_texts(texts):
        """
        Replace lowercase standalone 'q' with 'i' before embedding.
        This change is temporary and not persisted to the DataFrame.
        """
        return [pd.Series(t).astype(str).str.replace(r'\bq\b', 'i', regex=True).iloc[0].strip() for t in texts]

    @torch.inference_mode()
    def masked_mean_pool(last_hidden_state, mask):
        mask = mask.unsqueeze(-1)
        summed = (last_hidden_state * mask).sum(1)
        count = mask.sum(1).clamp(min=1e-9)
        return summed / count

    @torch.inference_mode()
    def embed_texts(texts):
        all_embs = []
        n = len(texts)
        for b in range(ceil(n / batch_size)):
            batch = texts[b * batch_size:(b + 1) * batch_size]
            enc = tok(batch, padding=True, truncation=True, max_length=max_len, return_tensors="pt").to(DEVICE)
            out = model(**enc)
            pooled = masked_mean_pool(out.last_hidden_state, enc["attention_mask"])
            all_embs.append(pooled.cpu().numpy())
        return np.vstack(all_embs).astype("float32")

    # =====================
    # üß© Embed essays (with temporary cleaned text)
    # =====================
    texts_original = df[text_col].astype(str).tolist()
    texts_cleaned  = preprocess_texts(texts_original)  # temporary replacement
    ids = df[id_col].values

    print(f"\nüìù Embedding {len(texts_cleaned)} essays | max_len={max_len}, batch_size={batch_size}")

    X_emb = embed_texts(texts_cleaned)
    emb_df = pd.DataFrame(X_emb, columns=[f"deb_emb_{i}" for i in range(X_emb.shape[1])])
    emb_df[id_col] = ids

    df_out = df.merge(emb_df, on=id_col, how="left")

    # =====================
    # ‚úÖ Verification output
    # =====================
    print(f"‚úÖ Done! Added {X_emb.shape[1]} embedding columns.")
    print(f"üìä Output shape: {df_out.shape}")
    if show_preview:
        print("\nüîç Preview of first 2 rows and first 5 embedding dims:")
        display(df_out[[id_col, text_col] + [f"deb_emb_{i}" for i in range(5)]].head(2))

    return df_out

## 4.2 Output df_train_recon_D_logs

In [16]:
# ==========================================================
#  STEP: Add DeBERTa Embeddings to Reconstructed Essays
# ==========================================================

print("‚öôÔ∏è  Applying DeBERTa embeddings to training essays...")

# Apply to df_train_recon_logs
df_train_recon_D_logs = add_deberta_embeddings(df_train_recon_logs.copy())

# ‚úÖ Verification
print("\n‚úÖ Embedding process complete!")
print("üìä Final DataFrame shape:", df_train_recon_D_logs.shape)
print("üß© Sample of new columns added:")
print([col for col in df_train_recon_D_logs.columns if col.startswith("deb_emb_")][:10])

# Optional ‚Äî sanity check for alignment
id_check = df_train_recon_D_logs["id"].equals(df_train_recon_logs["id"])
print(f"üîç ID alignment check passed? {id_check}")

# Preview
display(df_train_recon_D_logs.head(2))

‚öôÔ∏è  Applying DeBERTa embeddings to training essays...

üîß Loading DeBERTa model from: /kaggle/input/deberta-v3-fast-tokenizer/deb-v3
üíª Device: cuda


2025-11-08 18:22:36.937141: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762626157.116450      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762626157.166150      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



üìù Embedding 2471 essays | max_len=256, batch_size=8
‚úÖ Done! Added 768 embedding columns.
üìä Output shape: (2471, 849)

üîç Preview of first 2 rows and first 5 embedding dims:


Unnamed: 0,id,essay_text,deb_emb_0,deb_emb_1,deb_emb_2,deb_emb_3,deb_emb_4
0,001519c8,qqqqqqqqq qq qqqqq qq qqqq qqqq. qqqqqq qqq q...,0.375177,-0.083705,0.11933,-0.391738,-0.018285
1,0022f953,"qqqq qq qqqqqqqqqqq ? qq qq qqq qqq qqq, qqqqq...",0.274797,-0.040442,0.05093,-0.388414,-0.066552



‚úÖ Embedding process complete!
üìä Final DataFrame shape: (2471, 849)
üß© Sample of new columns added:
['deb_emb_0', 'deb_emb_1', 'deb_emb_2', 'deb_emb_3', 'deb_emb_4', 'deb_emb_5', 'deb_emb_6', 'deb_emb_7', 'deb_emb_8', 'deb_emb_9']
üîç ID alignment check passed? True


Unnamed: 0,id,essay_text,num_words,num_sentences,num_paragraphs,mean_sentence_len,std_sentence_len,cv_sentence_len,short_sent_share,long_sent_share,...,deb_emb_758,deb_emb_759,deb_emb_760,deb_emb_761,deb_emb_762,deb_emb_763,deb_emb_764,deb_emb_765,deb_emb_766,deb_emb_767
0,001519c8,qqqqqqqqq qq qqqqq qq qqqq qqqq. qqqqqq qqq q...,257.0,14.0,3.0,18.357143,6.387568,0.347961,0.0,0.571429,...,-0.020243,0.145127,-0.257789,0.342048,0.134046,-0.179942,-0.188461,1.465361,0.258102,-0.217699
1,0022f953,"qqqq qq qqqqqqqqqqq ? qq qq qqq qqq qqq, qqqqq...",324.0,15.0,1.0,21.6,12.408599,0.574472,0.066667,0.533333,...,-0.074062,0.071422,-0.232852,0.328209,0.219393,-0.20593,-0.182424,1.405206,0.224473,-0.254079


# 5 Finalising Phase

## 5.1 Merge Outputs of 2 and 4 (Function)

In [17]:
def merge_agg_and_deberta(df_agg, df_recon_D):
    """
    General-purpose merge of aggregated event-level logs and reconstructed essay
    datasets (with DeBERTa embeddings).

    Parameters
    ----------
    df_agg : pd.DataFrame
        Aggregated logs DataFrame (e.g., df_train_agg_logs or df_test_agg_logs)
    df_recon_D : pd.DataFrame
        Reconstructed essay DataFrame with linguistic + DeBERTa embeddings
        (e.g., df_train_recon_D_logs or df_test_recon_D_logs)

    Returns
    -------
    pd.DataFrame
        Merged dataset (e.g., df_train_full or df_test_full)
    """

    print("\n==========================================================")
    print("üöÄ FINAL MERGE: Aggregated Logs + Reconstructed DeBERTa Essays")
    print("==========================================================\n")

    # --- Make copies to avoid in-place modification ---
    df1 = df_agg.copy()
    df2 = df_recon_D.copy()

    # --- Step 0: Sanity check for 'id' column existence ---
    for name, df in zip(["Aggregated logs", "Reconstructed + DeBERTa"], [df1, df2]):
        if "id" not in df.columns:
            raise KeyError(f"‚ùå '{name}' missing 'id' column!")
        print(f"‚úÖ {name} shape: {df.shape}")
    print()

    # ==========================================================
    # STEP 1: Check essay_text alignment (optional, if exists)
    # ==========================================================
    if "essay_text" in df1.columns and "essay_text" in df2.columns:
        mismatch_mask = df1.set_index("id")["essay_text"] != df2.set_index("id")["essay_text"]
        mismatch_count = mismatch_mask.sum()
        if mismatch_count == 0:
            print("‚úÖ Essay text perfectly aligned ‚Äî using ['id', 'essay_text'] as merge keys.")
            join_cols = ["id", "essay_text"]
        else:
            print(f"‚ö†Ô∏è Essay text mismatch in {mismatch_count} rows ‚Äî using 'id' only.")
            join_cols = ["id"]
    else:
        print("‚öôÔ∏è Using 'id' as merge key (no essay_text overlap).")
        join_cols = ["id"]

    # ==========================================================
    # STEP 2: Perform merge
    # ==========================================================
    try:
        df_full = pd.merge(df1, df2, on=join_cols, how="left", validate="1:1")
        print(f"üìé Merge successful on {join_cols}. Shape: {df_full.shape}")
    except Exception as e:
        print(f"‚ùå Merge on {join_cols} failed: {e}")
        print("üîÅ Retrying merge on 'id' only...")
        df_full = pd.merge(df1, df2, on="id", how="left", validate="1:1")
        print(f"‚úÖ Fallback merge succeeded. Shape: {df_full.shape}")

    # ==========================================================
    # STEP 3: Drop duplicate columns automatically
    # ==========================================================
    dupes = df_full.columns[df_full.columns.duplicated()]
    if len(dupes) > 0:
        print(f"\n‚ö†Ô∏è Found {len(dupes)} duplicate column names:")
        print("   üß©", list(dupes[:10]), "..." if len(dupes) > 10 else "")
        df_full = df_full.loc[:, ~df_full.columns.duplicated()]
        print(f"üßπ Duplicates removed. Final shape: {df_full.shape}")
    else:
        print("\n‚úÖ No duplicate columns detected in merged dataset.")

    # ==========================================================
    # STEP 4: Alignment check
    # ==========================================================
    same_ids = df_full["id"].equals(df1["id"])
    missing_from_merge = df1[~df1["id"].isin(df_full["id"])]

    print("\nüîç Alignment verification:")
    print(f" - ID alignment maintained? {same_ids}")
    print(f" - Missing IDs after merge: {len(missing_from_merge)}")

    # ==========================================================
    # ‚úÖ Final summary
    # ==========================================================
    print("\nüéØ Merge completed successfully!")
    print(f"üìä Final merged dataset shape: {df_full.shape}")
    print(f"üîó Merge keys used: {join_cols}")

    # --- Optional preview ---
    display(df_full.head(2))

    # --- Optional essay text check ---
    essay_cols = [col for col in df_full.columns if "essay_text" in col]
    print(f"\nüß© Essay text-related columns: {essay_cols}")

    return df_full

## 5.2 Perform Sanity Check (outputs df_train_full_checked)

In [18]:
import numpy as np
import pandas as pd

def prepare_dataframe(df: pd.DataFrame, target_col: str = None):
    """
    General-purpose DataFrame preparation function.
    Performs:
      - Ensures 'id' is a column (not index)
      - Checks duplicate columns and duplicate IDs
      - Optionally coerces target to numeric (if provided)
      - Detects which columns have NaNs or Infs
      - Replaces inf / -inf with NaN and fills NaN with 0 (features only)
      - Leaves 'id' and 'essay_text' untouched

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame (train or test)
    target_col : str or None
        Target column name (e.g. 'score' for train).
        If None, skips target-related checks.

    Returns
    -------
    pd.DataFrame
        Cleaned and ready DataFrame.
    """
    print("üßπ Stage 0: Data Preparation & Sanity Checks")

    df = df.copy()

    # ======================================================
    # üÜî Ensure 'id' column exists
    # ======================================================
    if df.index.name == "id" or "id" not in df.columns:
        if df.index.name == "id":
            df = df.reset_index()
            print("‚Ü™Ô∏è  Reset index: moved 'id' from index to column.")
    if "id" not in df.columns:
        raise KeyError("‚ùå Missing required column: 'id'")

    # ======================================================
    # ‚ö†Ô∏è Handle duplicates
    # ======================================================
    dup_cols = df.columns[df.columns.duplicated()].tolist()
    if dup_cols:
        print(f"‚ö†Ô∏è Found duplicate columns (kept first occurrence): {dup_cols}")
        df = df.loc[:, ~df.columns.duplicated()]

    dup_ids = df["id"][df["id"].duplicated()].unique()
    if len(dup_ids) > 0:
        print(f"‚ö†Ô∏è Found {len(dup_ids)} duplicated IDs. Keeping first occurrence.")
        df = df.drop_duplicates(subset=["id"], keep="first")

    # ======================================================
    # üéØ Target column (optional)
    # ======================================================
    if target_col:
        if target_col not in df.columns:
            raise KeyError(f"‚ùå Missing target column: '{target_col}'")

        before_non_numeric = df[target_col].dtype
        df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
        if str(before_non_numeric) != str(df[target_col].dtype):
            print(f"‚ÑπÔ∏è  Coerced '{target_col}' from {before_non_numeric} ‚Üí {df[target_col].dtype}")

    # ======================================================
    # üß© Feature subset (exclude protected columns)
    # ======================================================
    protect_cols = {"id", "essay_text"}
    if target_col:
        protect_cols.add(target_col)
    feature_cols = [c for c in df.columns if c not in protect_cols]

    # ======================================================
    # üîç Detect NaNs and Infs before cleaning
    # ======================================================
    inf_mask = np.isinf(df[feature_cols].to_numpy())
    inf_cols = df[feature_cols].columns[np.any(inf_mask, axis=0)].tolist()

    nan_mask = df[feature_cols].isna()
    nan_cols = nan_mask.columns[nan_mask.any()].tolist()

    inf_count = np.isinf(df[feature_cols].to_numpy()).sum()
    nan_count = df[feature_cols].isna().sum().sum()

    if inf_count or nan_count:
        print(f"‚ö†Ô∏è Detected issues in feature columns:")
        if inf_count:
            print(f"   ‚àû Infs: {inf_count} total, in {len(inf_cols)} columns.")
            print(f"      ‚Ü≥ Columns with inf values: {inf_cols[:10]}{' ...' if len(inf_cols) > 10 else ''}")
        if nan_count:
            print(f"   üï≥Ô∏è NaNs: {nan_count} total, in {len(nan_cols)} columns.")
            nan_counts_per_col = df[feature_cols].isna().sum()
            nan_counts_top = nan_counts_per_col[nan_counts_per_col > 0].sort_values(ascending=False).head(10)
            print("      ‚Ü≥ Top NaN columns (count):")
            for col, cnt in nan_counts_top.items():
                print(f"         - {col}: {cnt}")
        print("‚Ü™Ô∏è  Cleaning features: replacing inf ‚Üí NaN ‚Üí 0")
    else:
        print("‚úÖ No NaN or inf values detected in feature columns.")

    # ======================================================
    # üßπ Replace inf and NaN
    # ======================================================
    df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

    # ======================================================
    # üìä Summary
    # ======================================================
    print(f"\n‚úÖ Data ready. Shape: {df.shape}")
    print(f"üî¢ Features (excl. protected cols): {len(feature_cols)}")
    print(f"üÜî Unique IDs: {df['id'].nunique()}  |  Rows: {len(df)}")

    if target_col:
        print(f"üéØ Target '{target_col}' ‚Äî min: {df[target_col].min():.4f}, max: {df[target_col].max():.4f}")

    return df

## 5.3 Output df_train_full

In [19]:
# Merge the two DataFrames on 'id' first
df_train_full_unchecked = pd.merge(df_train_agg_logs, df_train_recon_D_logs, on="id", how="left")

# Apply the prepare_dataframe function to the merged DataFrame
df_train_full = prepare_dataframe(df_train_full_unchecked.copy(), target_col=None)  # No target column for merged features

# ‚úÖ Final check and summary
print(f"\nüéØ Final merged and checked df_train_full shape: {df_train_full.shape}")
display(df_train_full.head(2))

üßπ Stage 0: Data Preparation & Sanity Checks
‚ö†Ô∏è Detected issues in feature columns:
   üï≥Ô∏è NaNs: 2678 total, in 2 columns.
      ‚Ü≥ Top NaN columns (count):
         - list_like_colon_sentence_share_all: 2216
         - heavy_sentence_mean_normalized_position: 462
‚Ü™Ô∏è  Cleaning features: replacing inf ‚Üí NaN ‚Üí 0

‚úÖ Data ready. Shape: (2471, 886)
üî¢ Features (excl. protected cols): 884
üÜî Unique IDs: 2471  |  Rows: 2471

üéØ Final merged and checked df_train_full shape: (2471, 886)


Unnamed: 0,id,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_nonproduction,...,deb_emb_758,deb_emb_759,deb_emb_760,deb_emb_761,deb_emb_762,deb_emb_763,deb_emb_764,deb_emb_765,deb_emb_766,deb_emb_767
0,001519c8,2557,4526,1801969,1797443,116.246774,297243,2554,3,120,...,-0.020243,0.145127,-0.257789,0.342048,0.134046,-0.179942,-0.188461,1.465361,0.258102,-0.217699
1,0022f953,2454,30623,1788969,1758346,112.221271,275391,2454,0,254,...,-0.074062,0.071422,-0.232852,0.328209,0.219393,-0.20593,-0.182424,1.405206,0.224473,-0.254079


# 6 Training Phase

## 6.1 Define LGB, XGB, CB

### LGB

In [20]:
def run_lightgbm_multi_seed(
    df,
    seeds=range(5),
    n_splits=5,
    n_top=25,
    verbose=True
):
    """
    ‚ö° LightGBM Multi-Seed CV Trainer (Clean Output Version)
    --------------------------------------------------------
    - Uses StratifiedKFold with discrete score bins (0.5‚Äì6.0)
    - Produces Out-Of-Fold predictions for stacking
    - Silences internal LightGBM C++ warnings + Python warnings
    - Returns models, OOF preds, feature importances, and CV stats
    """

    import lightgbm as lgb
    import numpy as np, pandas as pd, gc, torch, time, warnings, sys, os
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import mean_squared_error
    from IPython.display import display

    # =========================================================
    # ‚öôÔ∏è Silence noisy warnings globally
    # =========================================================
    warnings.filterwarnings("ignore", category=UserWarning)
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", category=FutureWarning)

    # Temporarily redirect stdout (LightGBM C++ logs)
    class suppress_stdout_stderr:
        def __enter__(self):
            self.null_fds = [os.open(os.devnull, os.O_RDWR) for _ in range(2)]
            self.save_fds = [os.dup(1), os.dup(2)]
            os.dup2(self.null_fds[0], 1)
            os.dup2(self.null_fds[1], 2)
        def __exit__(self, *_):
            os.dup2(self.save_fds[0], 1)
            os.dup2(self.save_fds[1], 2)
            for fd in self.null_fds + self.save_fds:
                os.close(fd)

    start_time = time.time()

    # =========================================================
    # ‚öôÔ∏è Detect device (GPU or CPU)
    # =========================================================
    device_type = "gpu" if torch.cuda.is_available() else "cpu"
    print(f"üíª Using {device_type.upper()} ({torch.cuda.get_device_name(0) if device_type=='gpu' else 'CPU only'})\n")

    # =========================================================
    # üßπ Prepare data
    # =========================================================
    df = df.copy()
    y = df["score"].astype(float).values
    X = (
        df.drop(columns=["id", "score", "essay_text"], errors="ignore")
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )
    features = X.columns.tolist()
    n_samples, n_features = X.shape
    print(f"üìä Loaded data: {n_samples:,} samples √ó {n_features:,} features\n")

    if n_features == 0:
        raise ValueError("‚ùå No valid features found for LightGBM training.")

    # =========================================================
    # üîß LightGBM hyperparameters
    # =========================================================
    params = dict(
        objective="regression",
        metric="rmse",
        learning_rate=0.01,
        num_leaves=31,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=5,
        lambda_l1=0.1,
        lambda_l2=0.1,
        verbosity=-1,     # <- disable internal logs
        n_jobs=-1,
        device_type=device_type,
    )

    if device_type == "gpu":
        params.update(dict(max_bin=255, gpu_platform_id=0, gpu_device_id=0))

    # =========================================================
    # üöÄ Multi-seed Stratified CV
    # =========================================================
    all_rmse, all_models, all_importances = [], [], []
    oof_preds = np.zeros(len(X))

    print(f"üöÄ Starting LightGBM CV: {len(seeds)} seeds √ó {n_splits}-folds (stratified by score)\n")

    y_bins = (y * 2).astype(int)

    for s_i, seed in enumerate(seeds, 1):
        seed_start = time.time()
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

        fold_rmse, fold_models = [], []
        print(f"üå± Seed {seed} ({s_i}/{len(seeds)})")

        for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y_bins), 1):
            fold_start = time.time()
            train_set = lgb.Dataset(X.iloc[tr_idx], label=y[tr_idx])
            val_set   = lgb.Dataset(X.iloc[va_idx], label=y[va_idx])

            with suppress_stdout_stderr():  # silence LightGBM's internal C++ logs
                model = lgb.train(
                    params,
                    train_set,
                    num_boost_round=5000,
                    valid_sets=[val_set],
                    callbacks=[
                        lgb.early_stopping(100, verbose=False),
                        lgb.log_evaluation(0)  # <- disable eval logging
                    ]
                )

            preds = model.predict(X.iloc[va_idx], num_iteration=model.best_iteration)
            oof_preds[va_idx] += preds / len(seeds)
            rmse = mean_squared_error(y[va_idx], preds, squared=False)
            fold_rmse.append(rmse)
            fold_models.append(model)

            print(f"   ‚úÖ Fold {fold}/{n_splits}: RMSE={rmse:.4f} | BestIter={model.best_iteration} | ‚è± {(time.time()-fold_start):.1f}s")

        mean_rmse, std_rmse = np.mean(fold_rmse), np.std(fold_rmse)
        all_rmse.append(mean_rmse)
        all_models.extend(fold_models)

        # Aggregate feature importances across folds
        imp = np.mean([m.feature_importance("gain") for m in fold_models], axis=0)
        all_importances.append(pd.DataFrame({"feature": features, "importance": imp, "seed": seed}))

        print(f"üåæ Seed {seed} complete ‚Üí RMSE={mean_rmse:.4f} ¬± {std_rmse:.4f} | ‚è± {(time.time()-seed_start)/60:.2f} min\n")
        gc.collect()

    # =========================================================
    # üßÆ Aggregate feature importances
    # =========================================================
    avg_imp = (
        pd.concat(all_importances)
        .groupby("feature", as_index=False)["importance"]
        .mean()
        .sort_values("importance", ascending=False)
        .reset_index(drop=True)
    )

    total_min = (time.time() - start_time) / 60
    mean_rmse, std_rmse = np.mean(all_rmse), np.std(all_rmse)
    print(f"üèÅ Completed {len(seeds)} seeds in {total_min:.2f} min")
    print(f"üìâ Overall CV RMSE: {mean_rmse:.4f} ¬± {std_rmse:.4f}\n")

    if verbose:
        print(f"üèÖ Top {n_top} Averaged Features:")
        display(avg_imp.head(n_top))

    # =========================================================
    # ‚úÖ Return structured results
    # =========================================================
    return {
        "all_models": all_models,
        "oof_preds": oof_preds,
        "all_rmse": all_rmse,
        "mean_rmse": mean_rmse,
        "std_rmse": std_rmse,
        "feature_importance_avg": avg_imp,
        "features": features,
        "runtime_min": total_min,
        "device_used": device_type,
    }

### XGB

In [21]:
def run_xgboost_multi_seed(
    df,
    seeds=range(5),
    n_splits=5,
    n_top=25,
    verbose=True
):
    """
    ‚ö° XGBoost Multi-Seed CV Trainer (GPU-adaptive, Stratified, OOF-enabled)
    ------------------------------------------------------------------------
    - StratifiedKFold using discrete essay score bins (0.5‚Äì6.0)
    - Produces Out-Of-Fold predictions for stacking
    - Automatically detects and uses GPU if available
    - Returns models, OOF preds, importances, and stats
    """
    import xgboost as xgb
    import numpy as np, pandas as pd, gc, torch, time, warnings
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import mean_squared_error
    from IPython.display import display

    warnings.filterwarnings("ignore")
    start_time = time.time()

    # =========================================================
    # ‚öôÔ∏è Device detection
    # =========================================================
    device_type = "gpu" if torch.cuda.is_available() else "cpu"
    tree_method = "gpu_hist" if device_type == "gpu" else "hist"
    print(f"üíª Using {device_type.upper()} ({torch.cuda.get_device_name(0) if device_type=='gpu' else 'CPU only'})\n")

    # =========================================================
    # üßπ Data prep
    # =========================================================
    df = df.copy()
    y = df["score"].astype(float).values
    X = (
        df.drop(columns=["id", "score", "essay_text"], errors="ignore")
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )
    features = X.columns.tolist()
    n_samples, n_features = X.shape
    print(f"üìä Loaded: {n_samples:,} samples √ó {n_features:,} features\n")

    if n_features == 0:
        raise ValueError("‚ùå No valid features found for XGBoost training.")

    # =========================================================
    # üîß Parameters
    # =========================================================
    params = dict(
        objective="reg:squarederror",
        eval_metric="rmse",
        learning_rate=0.01,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        reg_alpha=0.1,
        tree_method=tree_method,
        verbosity=0,
    )

    # =========================================================
    # üöÄ Multi-seed Stratified CV
    # =========================================================
    all_rmse, all_models, all_importances = [], [], []
    oof_preds = np.zeros(len(X))
    print(f"üöÄ Starting XGBoost CV: {len(seeds)} seeds √ó {n_splits}-folds (stratified)\n")

    # Discrete score bins
    y_bins = (y * 2).astype(int)

    for s_i, seed in enumerate(seeds, 1):
        seed_start = time.time()
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        fold_rmse, fold_models = [], []
        print(f"üå± Seed {seed} ({s_i}/{len(seeds)})")

        for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y_bins), 1):
            fold_start = time.time()

            dtrain = xgb.DMatrix(X.iloc[tr_idx], label=y[tr_idx], feature_names=features)
            dval   = xgb.DMatrix(X.iloc[va_idx], label=y[va_idx], feature_names=features)

            model = xgb.train(
                params,
                dtrain,
                num_boost_round=5000,
                evals=[(dval, "valid")],
                early_stopping_rounds=100,
                verbose_eval=200 if verbose else False
            )

            preds = model.predict(dval, iteration_range=(0, model.best_iteration))
            oof_preds[va_idx] += preds / len(seeds)

            rmse = mean_squared_error(y[va_idx], preds, squared=False)
            fold_rmse.append(rmse)
            fold_models.append(model)

            print(f"   ‚úÖ Fold {fold}/{n_splits}: RMSE={rmse:.4f} | BestIter={model.best_iteration} | ‚è± {(time.time()-fold_start):.1f}s")

        mean_rmse, std_rmse = np.mean(fold_rmse), np.std(fold_rmse)
        all_rmse.append(mean_rmse)
        all_models.extend(fold_models)

        # Aggregate feature importances
        fold_importance = {}
        for m in fold_models:
            for f, imp in m.get_score(importance_type="gain").items():
                fold_importance[f] = fold_importance.get(f, 0) + imp
        imp_df = pd.DataFrame(
            {"feature": list(fold_importance.keys()),
             "importance": np.array(list(fold_importance.values())) / len(fold_models),
             "seed": seed}
        )
        all_importances.append(imp_df)

        print(f"üåæ Seed {seed} done ‚Üí RMSE={mean_rmse:.4f} ¬± {std_rmse:.4f} | ‚è± {(time.time()-seed_start)/60:.2f} min\n")
        gc.collect()

    # =========================================================
    # üßÆ Aggregate importances
    # =========================================================
    avg_imp = (
        pd.concat(all_importances)
        .groupby("feature", as_index=False)["importance"]
        .mean()
        .sort_values("importance", ascending=False)
        .reset_index(drop=True)
    )

    total_min = (time.time() - start_time) / 60
    mean_rmse, std_rmse = np.mean(all_rmse), np.std(all_rmse)
    print(f"üèÅ Completed {len(seeds)} seeds in {total_min:.2f} min")
    print(f"üìâ Overall CV RMSE: {mean_rmse:.4f} ¬± {std_rmse:.4f}\n")

    if verbose:
        print(f"üèÖ Top {n_top} Averaged Features:")
        display(avg_imp.head(n_top))

    return {
        "all_models": all_models,
        "oof_preds": oof_preds,
        "all_rmse": all_rmse,
        "mean_rmse": mean_rmse,
        "std_rmse": std_rmse,
        "feature_importance_avg": avg_imp,
        "features": features,
        "runtime_min": total_min,
        "device_used": device_type,
    }

### CB

In [22]:
def run_catboost_multi_seed(
    df,
    seeds=range(5),
    n_splits=5,
    n_top=25,
    verbose=True
):
    """
    üêà CatBoost Multi-Seed CV Trainer (GPU-adaptive, Stratified, OOF-enabled)
    ------------------------------------------------------------------------
    - StratifiedKFold on discrete essay score bins (0.5‚Äì6.0)
    - Out-Of-Fold predictions for stacking
    - Auto GPU/CPU detection
    - Aggregates feature importances across seeds
    """
    from catboost import CatBoostRegressor, Pool
    import numpy as np, pandas as pd, gc, torch, time, warnings
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import mean_squared_error
    from IPython.display import display

    warnings.filterwarnings("ignore")
    start_time = time.time()

    # =========================================================
    # ‚öôÔ∏è Detect device
    # =========================================================
    device_type = "GPU" if torch.cuda.is_available() else "CPU"
    print(f"üíª Using {device_type}")

    # =========================================================
    # üßπ Data prep
    # =========================================================
    df = df.copy()
    y = df["score"].astype(float).values
    X = (
        df.drop(columns=["id", "score", "essay_text"], errors="ignore")
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )
    features = X.columns.tolist()
    n_samples, n_features = X.shape
    print(f"üìä Loaded: {n_samples:,} samples √ó {n_features:,} features\n")

    if n_features == 0:
        raise ValueError("‚ùå No valid features found for CatBoost training.")

    # Bin labels for stratified folds
    y_bins = (y * 2).astype(int)

    # =========================================================
    # üöÄ Multi-seed Stratified CV
    # =========================================================
    all_rmse, all_models, all_importances = [], [], []
    oof_preds = np.zeros(len(X))

    print(f"üöÄ Starting CatBoost CV: {len(seeds)} seeds √ó {n_splits}-folds (stratified)\n")

    for s_i, seed in enumerate(seeds, 1):
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        fold_rmse, fold_models = [], []
        print(f"üå± Seed {seed} ({s_i}/{len(seeds)})")

        for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y_bins), 1):
            fold_start = time.time()

            train_pool = Pool(X.iloc[tr_idx], label=y[tr_idx])
            val_pool   = Pool(X.iloc[va_idx], label=y[va_idx])

            # ‚úÖ Fixed: removed rsm & subsample (not GPU-supported for RMSE)
            model = CatBoostRegressor(
                task_type=device_type,
                loss_function="RMSE",
                learning_rate=0.01,
                depth=6,
                iterations=5000,
                l2_leaf_reg=3,
                random_seed=seed,
                early_stopping_rounds=100,
                verbose=0
            )

            model.fit(train_pool, eval_set=val_pool, verbose=200 if verbose else False)

            preds = model.predict(val_pool)
            oof_preds[va_idx] += preds / len(seeds)

            rmse = mean_squared_error(y[va_idx], preds, squared=False)
            fold_rmse.append(rmse)
            fold_models.append(model)

            print(f"   ‚úÖ Fold {fold}/{n_splits}: RMSE={rmse:.4f} | BestIter={model.get_best_iteration()} | ‚è± {(time.time()-fold_start):.1f}s")

        mean_rmse, std_rmse = np.mean(fold_rmse), np.std(fold_rmse)
        all_rmse.append(mean_rmse)
        all_models.extend(fold_models)

        imp_df = pd.DataFrame({
            "feature": features,
            "importance": model.get_feature_importance(),
            "seed": seed
        })
        all_importances.append(imp_df)
        gc.collect()

        print(f"üåæ Seed {seed} done ‚Üí RMSE={mean_rmse:.4f} ¬± {std_rmse:.4f}\n")

    # =========================================================
    # üßÆ Aggregate importances
    # =========================================================
    avg_imp = (
        pd.concat(all_importances)
        .groupby("feature", as_index=False)["importance"]
        .mean()
        .sort_values("importance", ascending=False)
        .reset_index(drop=True)
    )

    total_min = (time.time() - start_time) / 60
    mean_rmse, std_rmse = np.mean(all_rmse), np.std(all_rmse)
    print(f"üèÅ Completed {len(seeds)} seeds in {total_min:.2f} min")
    print(f"üìâ Overall CV RMSE: {mean_rmse:.4f} ¬± {std_rmse:.4f}\n")

    if verbose:
        print(f"üèÖ Top {n_top} Averaged Features:")
        display(avg_imp.head(n_top))

    return {
        "all_models": all_models,
        "oof_preds": oof_preds,
        "all_rmse": all_rmse,
        "mean_rmse": mean_rmse,
        "std_rmse": std_rmse,
        "feature_importance_avg": avg_imp,
        "features": features,
        "runtime_min": total_min,
        "device_used": device_type,
    }

## 6.2 Multi-Seed Training

In [23]:
'''
import time

SEEDS, FOLDS = range(1), 2
df_train = df_train_full.copy()

print("‚ö° Starting multi-model training...\n")
start_all = time.time()

res_lgb = run_lightgbm_multi_seed(df_train, seeds=SEEDS, n_splits=FOLDS, n_top=15)
res_xgb = run_xgboost_multi_seed(df_train,   seeds=SEEDS, n_splits=FOLDS, n_top=15)
res_cat = run_catboost_multi_seed(df_train,  seeds=SEEDS, n_splits=FOLDS, n_top=15)

print("\nüèÅ Training complete!")
print(f"üìâ LGBM RMSE: {res_lgb['mean_rmse']:.4f}")
print(f"üìâ XGB  RMSE: {res_xgb['mean_rmse']:.4f}")
print(f"üìâ CAT  RMSE: {res_cat['mean_rmse']:.4f}")
print(f"‚è± Total time: {(time.time() - start_all)/60:.2f} min")

imp_lgb = res_lgb['feature_importance_avg']
imp_xgb = res_xgb['feature_importance_avg']
imp_cat = res_cat['feature_importance_avg']

display(imp_lgb.head(10))
display(imp_xgb.head(10))
display(imp_cat.head(10))

TOP_N = 25
top25_features_lgbm = imp_lgb.head(TOP_N)['feature'].tolist()
top25_features_xgb  = imp_xgb.head(TOP_N)['feature'].tolist()
top25_features_cb   = imp_cat.head(TOP_N)['feature'].tolist()

print("LGB top25:", len(top25_features_lgbm))
print("XGB top25:", len(top25_features_xgb))
print("CAT top25:", len(top25_features_cb))
'''

'\nimport time\n\nSEEDS, FOLDS = range(1), 2\ndf_train = df_train_full.copy()\n\nprint("‚ö° Starting multi-model training...\n")\nstart_all = time.time()\n\nres_lgb = run_lightgbm_multi_seed(df_train, seeds=SEEDS, n_splits=FOLDS, n_top=15)\nres_xgb = run_xgboost_multi_seed(df_train,   seeds=SEEDS, n_splits=FOLDS, n_top=15)\nres_cat = run_catboost_multi_seed(df_train,  seeds=SEEDS, n_splits=FOLDS, n_top=15)\n\nprint("\nüèÅ Training complete!")\nprint(f"üìâ LGBM RMSE: {res_lgb[\'mean_rmse\']:.4f}")\nprint(f"üìâ XGB  RMSE: {res_xgb[\'mean_rmse\']:.4f}")\nprint(f"üìâ CAT  RMSE: {res_cat[\'mean_rmse\']:.4f}")\nprint(f"‚è± Total time: {(time.time() - start_all)/60:.2f} min")\n\nimp_lgb = res_lgb[\'feature_importance_avg\']\nimp_xgb = res_xgb[\'feature_importance_avg\']\nimp_cat = res_cat[\'feature_importance_avg\']\n\ndisplay(imp_lgb.head(10))\ndisplay(imp_xgb.head(10))\ndisplay(imp_cat.head(10))\n\nTOP_N = 25\ntop25_features_lgbm = imp_lgb.head(TOP_N)[\'feature\'].tolist()\ntop25_featur

## Save top 25 features for each (running purposes)

In [24]:
# ===========================================
# üíæ Save top-25 feature names for each model
# ===========================================

top25_features_lgbm_saved = [
    'final_word_count', 'max_cursor', 'num_words', 'count_input', 'q_tc_count',
    'mean_cursor', 'commas_per_sentence', 'any_internal_punct_share',
    'commas_per_100_words', 'spaces_before_punct_per_100_tokens', 'max_word_count',
    'q_overall_delta', 'body_para_mean_len', 'avg_internal_punct_per_sentence',
    'words_per_second', 'deb_emb_424', 'time_per_word', 'std_cursor',
    'multi_clause_sent_share', 'deb_emb_609', 'double_spaces_after_eos_per_100_sentences',
    'non_move_count', 'std_word_count', 'mean_sentence_len', 'dashes_count'
]

top25_features_xgb_saved = [
    'final_word_count', 'num_words', 'max_cursor', 'max_word_count', 'q_tc_count',
    'count_input', 'std_cursor', 'commas_per_sentence', 'any_internal_punct_share',
    'commas_per_100_words', 'avg_internal_punct_per_sentence',
    'spaces_before_punct_per_100_tokens', 'time_per_word', 'mean_cursor',
    'spaces_before_punct_total', 'q_overall_delta', 'std_word_count',
    'deb_emb_125', 'deb_emb_424', 'deb_emb_736', 'words_per_second',
    'spaces_before_comma', 'time_per_event', 'deb_emb_609', 'deb_emb_695'
]

top25_features_cb_saved = [
    'max_cursor', 'final_word_count', 'max_word_count', 'commas_per_sentence',
    'q_tc_count', 'num_words', 'num_paragraphs', 'q_overall_delta',
    'any_internal_punct_share', 'std_cursor', 'count_input',
    'spaces_before_punct_per_100_tokens', 'commas_per_100_words', 'deb_emb_424',
    'body_para_mean_len', 'total_events', 'avg_internal_punct_per_sentence',
    'words_per_second', 'time_per_word', 'spaces_before_comma',
    'mean_sentence_len', 'heavy_internal_punct_sentence_share', 'dashes_count',
    'multi_clause_sent_share', 'non_move_count'
]

print(f"‚úÖ Saved top-25 feature lists:")
print(f"   LGBM ‚Üí {len(top25_features_lgbm_saved)} features")
print(f"   XGB  ‚Üí {len(top25_features_xgb_saved)} features")
print(f"   CAT  ‚Üí {len(top25_features_cb_saved)} features")

‚úÖ Saved top-25 feature lists:
   LGBM ‚Üí 25 features
   XGB  ‚Üí 25 features
   CAT  ‚Üí 25 features


## 6.3 Retrain Final Models

In [25]:
# ============================================================
# STAGE 5 ‚Äî RETRAIN FINAL MODELS
# ============================================================

import joblib, gc
import numpy as np, pandas as pd
from sklearn.metrics import mean_squared_error

print("üöÄ Retraining final (top-N) models on full training data...")

df_train = df_train_full.copy()
# --- Helper to restrict columns safely
def select_features(df_train, feats):
    return df_train.loc[:, [f for f in feats if f in df_train.columns]]

# --- Common clean-up
X_full = df_train.drop(columns=['id','score','essay_text'], errors='ignore')
X_full = X_full.replace([np.inf,-np.inf], np.nan).fillna(0)
y_full = df_train['score'].values

# ============================================================
# ‚úÖ LightGBM Final
# ============================================================
from lightgbm import LGBMRegressor

X_lgb = select_features(X_full, top25_features_lgbm_saved)
final_lgb = LGBMRegressor(
    objective='regression',
    learning_rate=0.01,
    num_leaves=31,
    n_estimators=3000,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    random_state=42,
    device='gpu' if torch.cuda.is_available() else 'cpu'
)
final_lgb.fit(X_lgb, y_full)
joblib.dump(final_lgb, "/kaggle/working/final_lgbm.pkl")
print("üíæ Saved final_lgbm.pkl")

# ============================================================
# ‚úÖ XGBoost Final
# ============================================================
import xgboost as xgb

X_xgb = select_features(X_full, top25_features_xgb_saved)
final_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.01,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.9,
    n_estimators=3000,
    random_state=42,
    tree_method='gpu_hist' if torch.cuda.is_available() else 'hist'
)
final_xgb.fit(X_xgb, y_full)
joblib.dump(final_xgb, "/kaggle/working/final_xgb.pkl")
print("üíæ Saved final_xgb.pkl")

# ============================================================
# ‚úÖ CatBoost Final
# ============================================================
from catboost import CatBoostRegressor

X_cb = select_features(X_full, top25_features_cb_saved)
final_cb = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.01,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=False,
    task_type='GPU' if torch.cuda.is_available() else 'CPU'
)
final_cb.fit(X_cb, y_full)
joblib.dump(final_cb, "/kaggle/working/final_cb.pkl")
print("üíæ Saved final_cb.pkl")

gc.collect()
print("\n‚úÖ Stage 5 complete ‚Äî all three final models retrained and saved.\n")

üöÄ Retraining final (top-N) models on full training data...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 5895
[LightGBM] [Info] Number of data points in the train set: 2471, number of used features: 25
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 23 dense feature groups (0.06 MB) transferred to GPU in 0.000703 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.711251
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 23 dense feature groups (0.04 MB) transferred to GPU in 0.000672 secs. 1 sparse feature groups
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 23 dense feature groups (0.04 MB) transferred to GPU in 0.000727 secs. 1 sparse feature groups
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 23 dense feature groups (0.04 MB) transferred to GPU in 0.000752 secs. 1 sparse feature groups
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 23 dense feature groups (0.05 MB) transferred to GPU in 0.000722 secs. 1 sparse feature groups
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 23 dense feature groups


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



üíæ Saved final_xgb.pkl
üíæ Saved final_cb.pkl

‚úÖ Stage 5 complete ‚Äî all three final models retrained and saved.



## 6.4 Evaluate Each Model

In [26]:
# ============================================================
# STAGE 6A ‚Äî EVALUATE FINAL MODELS
# ============================================================

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

def cross_val_rmse(model, X, y, folds=5):
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    rmses = []
    for train_idx, val_idx in kf.split(X):
        model.fit(X.iloc[train_idx], y[train_idx])
        preds = model.predict(X.iloc[val_idx])
        rmses.append(mean_squared_error(y[val_idx], preds, squared=False))
    return np.mean(rmses), np.std(rmses)

for name, model, feats in [
    ("LightGBM", final_lgb, top25_features_lgbm_saved),
    ("XGBoost",  final_xgb, top25_features_xgb_saved),
    ("CatBoost", final_cb,  top25_features_cb_saved)
]:
    X_eval = select_features(X_full, feats)
    mean_rmse, std_rmse = cross_val_rmse(model, X_eval, y_full)
    print(f"{name}: RMSE = {mean_rmse:.4f} ¬± {std_rmse:.4f}")

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 5866
[LightGBM] [Info] Number of data points in the train set: 1976, number of used features: 25
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 23 dense feature groups (0.05 MB) transferred to GPU in 0.000657 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.709008
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 23 dense feature groups (0.04 MB) transferred to GPU in 0.000633 secs. 1 sparse feature groups
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 23 dense feature groups (0.04 MB) transferred to GPU in 0.000666 secs. 1 sparse feature groups
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 23 dense feature groups (0.04 MB)


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



XGBoost: RMSE = 0.6690 ¬± 0.0344
CatBoost: RMSE = 0.6315 ¬± 0.0377


## 6.5 Blend Ensemble

In [27]:
# ============================================================
# STAGE 6B ‚Äî BLEND ENSEMBLE (RIDGE BLENDER)
# ============================================================

from sklearn.linear_model import RidgeCV

print("\nüöÄ Building RidgeCV meta-blender...")

# Generate out-of-sample predictions for full train
p_l = final_lgb.predict(select_features(X_full, top25_features_lgbm_saved))
p_x = final_xgb.predict(select_features(X_full, top25_features_xgb_saved))
p_c = final_cb.predict(select_features(X_full, top25_features_cb_saved))

X_blend = np.vstack([p_l, p_x, p_c]).T
ridge = RidgeCV(alphas=np.logspace(-3, 3, 20), cv=5)
ridge.fit(X_blend, y_full)

print("‚úÖ Meta-blender weights:")
for m, w in zip(["LGBM","XGB","CAT"], ridge.coef_):
    print(f"   {m}: {w:.4f}")
print(f"Intercept: {ridge.intercept_:.4f}")

joblib.dump(ridge, "/kaggle/working/final_blender.pkl")
print("üíæ Saved final_blender.pkl\n")


üöÄ Building RidgeCV meta-blender...
‚úÖ Meta-blender weights:
   LGBM: 0.5226
   XGB: 0.5498
   CAT: -0.0672
Intercept: -0.0225
üíæ Saved final_blender.pkl



## 6.6 Save Models

In [28]:
import os
print("üìÇ Models saved:")
for f in os.listdir("/kaggle/working"):
    if f.endswith(".pkl"):
        print("  ", f)

üìÇ Models saved:
   final_xgb.pkl
   final_lgbm.pkl
   final_blender.pkl
   final_cb.pkl


# 7 Testing

## 7.1 Overview

In [29]:
print("üßæ Inspecting df_test_logs:")
display(df_test_logs.head())
display(df_test_logs.shape)
display(df_test_logs.dtypes)

with pd.option_context('display.float_format', lambda x: f'{x:.3f}'):
    display(df_test_logs.describe())

üßæ Inspecting df_test_logs:


Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,0
1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,0
2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,1
3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,1
4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,0


(6, 11)

id                 object
event_id            int64
down_time           int64
up_time             int64
action_time         int64
activity           object
down_event         object
up_event           object
text_change        object
cursor_position     int64
word_count          int64
dtype: object

Unnamed: 0,event_id,down_time,up_time,action_time,cursor_position,word_count
count,6.0,6.0,6.0,6.0,6.0,6.0
mean,1.5,486917.833,486990.333,72.5,0.5,0.5
std,0.548,244593.651,244605.302,19.149,0.548,0.548
min,1.0,184996.0,185052.0,46.0,0.0,0.0
25%,1.0,302484.75,302540.5,58.75,0.0,0.0
50%,1.5,486990.0,487079.5,76.0,0.5,0.5
75%,2.0,692853.75,692927.5,86.5,1.0,1.0
max,2.0,760073.0,760160.0,94.0,1.0,1.0


## 7.2 Section 2

In [30]:
# ==========================================================
#  STEP 1 ‚Üí ANALYSE RAW DATA (Test Set)
# ==========================================================
print("Step 1: Analysing data...")
df_test_logs_analysis = df_test_logs.copy()
analyse_data(df_test_logs_analysis)
print(f"‚Üí Shape after Step 1: {df_test_logs_analysis.shape}")

# ==========================================================
#  STEP 2 ‚Üí TRANSFORM DATA (Test Set)
# ==========================================================
print("\nStep 2: Transforming columns...")
df_test_logs_transformed = transform_data(df_test_logs_analysis.copy())
print(f"‚Üí Shape after Step 2: {df_test_logs_transformed.shape}")

# ==========================================================
#  STEP 3 ‚Üí CLEAN DATA (Test Set)
# ==========================================================
print("\nStep 3: Cleaning data...")
df_test_logs_cleaned = clean_data(df_test_logs_transformed.copy())
print(f"‚Üí Shape after Step 3: {df_test_logs_cleaned.shape}")

# ==========================================================
#  STEP 4 ‚Üí AGGREGATE EVENT-LEVEL FEATURES (Essay-Level) (Test Set)
# ==========================================================
print("\nStep 4: Aggregating event-level features...")
df_test_agg_logs = aggregate_data(df_test_logs_cleaned.copy())
print(f"‚Üí Shape after Step 4: {df_test_agg_logs.shape}")

# ==========================================================
#  FINAL SUMMARY (Test Set)
# ==========================================================
print("\n‚úÖ Preprocessing pipeline completed successfully for the test set!")
print(f"Final dataset shape: {df_test_agg_logs.shape}")
display(df_test_agg_logs.head(3))

Step 1: Analysing data...
üìä ANALYSING DATAFRAME

‚úÖ No missing values found.

üß© Checking for inconsistent data types...
‚úÖ All columns have consistent data types.

‚úÖ No negative values in numeric columns.

üì¶ Distinct values summary:
min    1.0
max    6.0
dtype: float64
üîπ Top 10 columns by unique count:
down_time          6
action_time        6
up_time            6
id                 3
event_id           2
down_event         2
up_event           2
cursor_position    2
text_change        2
word_count         2
dtype: int64

üìù Sample entries from text columns:
‚Ä¢ id: ['0000aaaa' '2222bbbb' '4444cccc']
‚Ä¢ activity: ['Input']
‚Ä¢ down_event: ['Space' 'q']
‚Ä¢ up_event: ['Space' 'q']
‚Ä¢ text_change: [' ' 'q']

üìã Summary:
Rows: 6 | Columns: 11
Analysis complete.

‚Üí Shape after Step 1: (6, 11)

Step 2: Transforming columns...
üîß Transforming dataset...

‚úÖ Sample transformed 'activity_trf':


Unnamed: 0,activity,activity_trf
0,Input,Input


‚úÖ Sample transformed 'down_event' & 'up_event':


Unnamed: 0,down_event,down_event_trf,up_event,up_event_trf
0,Space,Space,Space,Space


Distinct transformed event types:
{'down_event_trf': 2, 'up_event_trf': 2}

‚úÖ Sample transformed 'text_change_trf':


Unnamed: 0,text_change,text_change_trf
0,,space



üìã Transformation complete!
Rows: 6 | Columns: 15
New columns added: activity_trf, down_event_trf, up_event_trf, text_change_trf

‚Üí Shape after Step 2: (6, 15)

Step 3: Cleaning data...
üßπ Cleaning data...
‚úÖ Cleaned 9 text columns.
üìã Sample after cleaning:


Unnamed: 0,id,activity,down_event,up_event,text_change,activity_trf,down_event_trf,up_event_trf,text_change_trf
0,0000aaaa,input,space,space,,input,space,space,space


Rows: 6 | Columns: 15

‚Üí Shape after Step 3: (6, 15)

Step 4: Aggregating event-level features...
üßÆ Aggregating essay-level behavioral features...
‚úÖ Data sorted and grouped by 'id'.
Rows: 6 | Columns: 15
üïí Computed basic timing and event features.


Unnamed: 0_level_0,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000aaaa,2,338433,760160,421727,86.0,172


üß≠ Added move/non-move activity stats (1 activity types).


Unnamed: 0_level_0,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_input
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0000aaaa,2,338433,760160,421727,86.0,172,2,0,2


üìè Computed move distance statistics.


Unnamed: 0_level_0,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_input,sum_move_distance,mean_move_distance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0000aaaa,2,338433,760160,421727,86.0,172,2,0,2,0.0,0.0


‚úèÔ∏è Extracted text-change and q-delta features.


Unnamed: 0_level_0,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_input,sum_move_distance,mean_move_distance,non_q_tc_count,q_tc_count,q_overall_delta
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0000aaaa,2,338433,760160,421727,86.0,172,2,0,2,0.0,0.0,2,0,0


üñ±Ô∏è Added cursor and word count stats.


Unnamed: 0_level_0,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_input,sum_move_distance,...,non_q_tc_count,q_tc_count,q_overall_delta,mean_cursor,std_cursor,max_cursor,final_word_count,max_word_count,min_word_count,std_word_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000aaaa,2,338433,760160,421727,86.0,172,2,0,2,0.0,...,2,0,0,0.5,0.707107,1,0,0,0,0.0


‚öôÔ∏è Derived higher-level behavioral ratios.


Unnamed: 0,id,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_input,...,words_per_second,edit_intensity,revision_ratio,net_char_change_ratio,q_activity_ratio,cursor_movement_intensity,avg_move_distance,word_var_ratio,time_per_word,time_per_event
0,0000aaaa,2,338433,760160,421727,86.0,172,2,0,2,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,421727.0,210863.5



‚úÖ Aggregation complete!
Final shape: 3 rows √ó 33 columns
‚Üí Shape after Step 4: (3, 33)

‚úÖ Preprocessing pipeline completed successfully for the test set!
Final dataset shape: (3, 33)


Unnamed: 0,id,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_input,...,words_per_second,edit_intensity,revision_ratio,net_char_change_ratio,q_activity_ratio,cursor_movement_intensity,avg_move_distance,word_var_ratio,time_per_word,time_per_event
0,0000aaaa,2,338433,760160,421727,86.0,172,2,0,2,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,421727.0,210863.5
1,2222bbbb,2,290502,712023,421521,56.5,113,2,0,2,...,2e-06,0.0,0.0,2.0,1.0,0.0,0.0,0.0,421521.0,210760.5
2,4444cccc,2,184996,635641,450645,75.0,150,2,0,2,...,2e-06,0.0,0.0,1.0,1.0,0.0,0.0,0.707107,450645.0,225322.5


## 7.3 Section 3

In [31]:
# ==========================================================
# üöÄ FULL ESSAY RECONSTRUCTION + FEATURE ENRICHMENT PIPELINE (Test)
# ==========================================================

# ‚úÖ Step 0: Start from original logs
df_test_logs_copy = df_test_logs.copy()
print("üìò Step 0: Original df_test_logs shape:", df_test_logs_copy.shape)

# ‚úÖ Step 1: Essay reconstruction
df_test_recon_logs_raw = getEssays(df_test_logs_copy.copy()).to_frame(name='essay_text')
df_test_recon_logs_raw.index.name = 'id'
df_test_recon_logs_raw.reset_index(inplace=True)  # ensure 'id' is a proper column
print("‚úÖ Step 1: Essays reconstructed ‚Äî shape:", df_test_recon_logs_raw.shape)

# ==========================================================
# ‚úÖ Step 2: Unified feature enrichment (parallelized)
# ==========================================================
print("‚öôÔ∏è Step 2: Extracting full linguistic + structural + mechanics features (parallelized)...")
df_test_recon_logs = enrich_full_text_features_parallel(df_test_recon_logs_raw.copy(), show_preview=False)
print("‚úÖ Step 2: Feature enrichment complete ‚Äî shape:", df_test_recon_logs.shape)

# ==========================================================
# ‚úÖ Step 3: Check for duplicate columns
# ==========================================================
dupes = df_test_recon_logs.columns[df_test_recon_logs.columns.duplicated()]

if len(dupes) > 0:
    from collections import Counter
    dupe_counts = Counter(dupes)
    print(f"\n‚ö†Ô∏è Found {len(dupe_counts)} duplicate column names:")
    for name, count in list(dupe_counts.items())[:15]:
        print(f"   üß© {name} ‚Üí appears {count} times")
    if len(dupe_counts) > 15:
        print("   ... (truncated)")
    
    # Drop duplicates (keep first occurrence)
    before = df_test_recon_logs.shape[1]
    df_test_recon_logs = df_test_recon_logs.loc[:, ~df_test_recon_logs.columns.duplicated()]
    after = df_test_recon_logs.shape[1]
    print(f"üßπ Removed {before - after} duplicate columns. Final shape: {df_test_recon_logs.shape}")
else:
    print("\n‚úÖ No duplicate columns detected in df_test_recon_logs.")

# ==========================================================
# ‚úÖ SUMMARY
# ==========================================================
print("\nüéØ Pipeline complete! Final dataset ‚Üí df_test_recon_logs")
print(f"üß© Step 0: df_test_logs_copy shape: {df_test_logs_copy.shape}")
print(f"üß© Step 1: df_test_recon_logs_raw shape: {df_test_recon_logs_raw.shape}")
print(f"üß© Step 2: df_test_recon_logs (final) shape: {df_test_recon_logs.shape}")

# üß† Sanity check
print("üß† Total essays:", df_test_recon_logs.shape[0])
print("üß© Total new features:", df_test_recon_logs.shape[1] - 2)  # exclude id + essay_text

# ‚úÖ Optional preview
display(df_test_recon_logs.head(2)[['id', 'essay_text'] + df_test_recon_logs.columns[2:12].tolist()])

# (Optional) Save for reuse
# df_test_recon_logs.to_csv("/kaggle/working/df_test_recon_logs.csv", index=False)
# print("üíæ Saved df_test_recon_logs.csv")

üìò Step 0: Original df_test_logs shape: (6, 11)
üß† Reconstructing 3 essays...


Processing essays: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 1045.87it/s]


üìù First reconstructed essay (ID: 0000aaaa):



-----------------------------------------------

‚úÖ Step 1: Essays reconstructed ‚Äî shape: (3, 2)
‚öôÔ∏è Step 2: Extracting full linguistic + structural + mechanics features (parallelized)...
‚ÑπÔ∏è swifter not installed ‚Äî using normal .apply() (slower)
‚úÖ Step 2: Feature enrichment complete ‚Äî shape: (3, 82)

‚ö†Ô∏è Found 1 duplicate column names:
   üß© num_paragraphs ‚Üí appears 1 times
üßπ Removed 1 duplicate columns. Final shape: (3, 81)

üéØ Pipeline complete! Final dataset ‚Üí df_test_recon_logs
üß© Step 0: df_test_logs_copy shape: (6, 11)
üß© Step 1: df_test_recon_logs_raw shape: (3, 2)
üß© Step 2: df_test_recon_logs (final) shape: (3, 81)
üß† Total essays: 3
üß© Total new features: 79



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,id,essay_text,num_words,num_sentences,num_paragraphs,mean_sentence_len,std_sentence_len,cv_sentence_len,short_sent_share,long_sent_share,avg_sent_per_para,var_sent_per_para
0,0000aaaa,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2222bbbb,qq,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


## 7.4 Section 4

In [32]:
# ==========================================================
#  STEP: Add DeBERTa Embeddings to Reconstructed Essays (Test)
# ==========================================================

print("‚öôÔ∏è  Applying DeBERTa embeddings to test essays...")

# Apply to df_test_recon_logs
df_test_recon_D_logs = add_deberta_embeddings(df_test_recon_logs.copy())

# ‚úÖ Verification
print("\n‚úÖ Embedding process complete!")
print("üìä Final DataFrame shape:", df_test_recon_D_logs.shape)
print("üß© Sample of new columns added:")
print([col for col in df_test_recon_D_logs.columns if col.startswith("deb_emb_")][:10])

# Optional ‚Äî sanity check for alignment
id_check = df_test_recon_D_logs["id"].equals(df_test_recon_logs["id"])
print(f"üîç ID alignment check passed? {id_check}")

# Preview
display(df_test_recon_D_logs.head(2))

‚öôÔ∏è  Applying DeBERTa embeddings to test essays...

üîß Loading DeBERTa model from: /kaggle/input/deberta-v3-fast-tokenizer/deb-v3
üíª Device: cuda

üìù Embedding 3 essays | max_len=256, batch_size=8
‚úÖ Done! Added 768 embedding columns.
üìä Output shape: (3, 849)

üîç Preview of first 2 rows and first 5 embedding dims:


Unnamed: 0,id,essay_text,deb_emb_0,deb_emb_1,deb_emb_2,deb_emb_3,deb_emb_4
0,0000aaaa,,0.109927,0.361136,0.028249,-0.09224,0.037945
1,2222bbbb,qq,0.487315,0.380985,-0.114131,-0.442624,-0.417851



‚úÖ Embedding process complete!
üìä Final DataFrame shape: (3, 849)
üß© Sample of new columns added:
['deb_emb_0', 'deb_emb_1', 'deb_emb_2', 'deb_emb_3', 'deb_emb_4', 'deb_emb_5', 'deb_emb_6', 'deb_emb_7', 'deb_emb_8', 'deb_emb_9']
üîç ID alignment check passed? True


Unnamed: 0,id,essay_text,num_words,num_sentences,num_paragraphs,mean_sentence_len,std_sentence_len,cv_sentence_len,short_sent_share,long_sent_share,...,deb_emb_758,deb_emb_759,deb_emb_760,deb_emb_761,deb_emb_762,deb_emb_763,deb_emb_764,deb_emb_765,deb_emb_766,deb_emb_767
0,0000aaaa,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.05591,0.053146,-0.015212,0.133863,0.064338,-0.236399,0.026995,-0.039859,0.210456,-0.001108
1,2222bbbb,qq,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.31783,-0.103455,0.135854,-0.104323,-0.014378,-0.054632,0.019773,-0.146049,0.088022,-0.199441


## 7.5 Section 5

In [33]:
# Merge the two DataFrames on 'id' first
df_test_full_unchecked = pd.merge(df_test_agg_logs, df_test_recon_D_logs, on="id", how="left")

# Apply the prepare_dataframe function to the merged DataFrame
df_test_full = prepare_dataframe(df_test_full_unchecked.copy(), target_col=None)  # No target column for merged features

# ‚úÖ Final check and summary
print(f"\nüéØ Final merged and checked df_test_full shape: {df_test_full.shape}")
display(df_test_full.head(2))

üßπ Stage 0: Data Preparation & Sanity Checks
‚ö†Ô∏è Detected issues in feature columns:
   üï≥Ô∏è NaNs: 2 total, in 1 columns.
      ‚Ü≥ Top NaN columns (count):
         - list_like_colon_sentence_share_all: 2
‚Ü™Ô∏è  Cleaning features: replacing inf ‚Üí NaN ‚Üí 0

‚úÖ Data ready. Shape: (3, 881)
üî¢ Features (excl. protected cols): 879
üÜî Unique IDs: 3  |  Rows: 3

üéØ Final merged and checked df_test_full shape: (3, 881)


Unnamed: 0,id,total_events,writing_start,writing_end,total_time_spent_on_essay,mean_action_time,sum_action_time,non_move_count,move_count,count_input,...,deb_emb_758,deb_emb_759,deb_emb_760,deb_emb_761,deb_emb_762,deb_emb_763,deb_emb_764,deb_emb_765,deb_emb_766,deb_emb_767
0,0000aaaa,2,338433,760160,421727,86.0,172,2,0,2,...,-0.05591,0.053146,-0.015212,0.133863,0.064338,-0.236399,0.026995,-0.039859,0.210456,-0.001108
1,2222bbbb,2,290502,712023,421521,56.5,113,2,0,2,...,0.31783,-0.103455,0.135854,-0.104323,-0.014378,-0.054632,0.019773,-0.146049,0.088022,-0.199441


In [34]:
import os
import numpy as np
import pandas as pd

def make_submission_final(model_type="lgbm"):
    """
    Generate the final submission file for a selected model type.
    
    Parameters
    ----------
    model_type : str
        One of ['lgbm', 'xgb', 'cat', 'blend'].
        - 'lgbm'  ‚Üí uses final_lgb + top25_features_lgbm_saved
        - 'xgb'   ‚Üí uses final_xgb + top25_features_xgb_saved
        - 'cat'   ‚Üí uses final_cb  + top25_features_cb_saved
        - 'blend' ‚Üí blends the three (ridge-based)
    
    Produces:
        /kaggle/working/submission.csv
    """

    valid_models = ["lgbm", "xgb", "cat", "blend"]
    if model_type not in valid_models:
        raise ValueError(f"‚ùå Invalid model_type '{model_type}'. Must be one of {valid_models}.")

    print(f"\nüöÄ Generating submission for model: **{model_type.upper()}**")

    # ==========================================================
    # 1Ô∏è‚É£ Individual models
    # ==========================================================
    if model_type in ["lgbm", "xgb", "cat"]:
        model_map = {
            "lgbm": (final_lgb, top25_features_lgbm_saved),
            "xgb":  (final_xgb, top25_features_xgb_saved),
            "cat":  (final_cb,  top25_features_cb_saved),
        }
        model, feats = model_map[model_type]

        # Prepare test data
        Xtest = df_test_full.drop(columns=['id', 'essay_text'], errors='ignore')
        Xtest = select_features(Xtest, feats).replace([np.inf, -np.inf], np.nan).fillna(0)

        # Get predictions
        preds = model.predict(Xtest)

        # Save submission as submission.csv
        sub = pd.DataFrame({
            "id": df_test_full['id'],
            "score": np.clip(preds, 0.5, 6.0)
        })
        sub.to_csv(f"/kaggle/working/submission.csv", index=False)
        print(f"üíæ Saved submission as /kaggle/working/submission.csv")

    # ==========================================================
    # 2Ô∏è‚É£ Ensemble blend
    # ==========================================================
    elif model_type == "blend":
        print("‚öôÔ∏è Generating ensemble blend...")

        # Check if the individual submission files exist, if not, generate them
        blend_files = [
            "/kaggle/working/submission_lgbm.csv",
            "/kaggle/working/submission_xgb.csv",
            "/kaggle/working/submission_cat.csv"
        ]

        # Check and create individual submissions if they don't exist
        for i, file in enumerate(blend_files):
            model_name = ['lgbm', 'xgb', 'cat'][i]
            if not os.path.exists(file):
                print(f"‚ö†Ô∏è {file} does not exist. Creating {file}...")
                make_submission_final(model_type=model_name)  # Recurse to generate the missing submission file

        # After ensuring the individual submissions exist, load them
        p_l = pd.read_csv("/kaggle/working/submission_lgbm.csv")['score'].values
        p_x = pd.read_csv("/kaggle/working/submission_xgb.csv")['score'].values
        p_c = pd.read_csv("/kaggle/working/submission_cat.csv")['score'].values

        # Combine individual predictions for blending
        P = np.vstack([p_l, p_x, p_c]).T
        p_blend = ridge.predict(P)

        # Create and save blended submission as submission.csv
        sub_blend = pd.DataFrame({
            "id": df_test_full['id'],
            "score": np.clip(p_blend, 0.5, 6.0)
        })
        sub_blend.to_csv("/kaggle/working/submission.csv", index=False)
        print(f"üíæ Saved blended submission as /kaggle/working/submission.csv")

    # ==========================================================
    # 3Ô∏è‚É£ Invalid model_type
    # ==========================================================
    else:
        print(f"‚ùå Invalid model_type: {model_type}. Please choose from 'lgbm', 'xgb', 'cat', or 'blend'.")

    print(f"\nüíæ Submission process completed for {model_type.upper()}.")

In [35]:
make_submission_final(model_type="lgbm")  # For LGBM model


üöÄ Generating submission for model: **LGBM**
üíæ Saved submission as /kaggle/working/submission.csv

üíæ Submission process completed for LGBM.


In [36]:
'''# --- Build submission (id must be string) ---
submission = pd.DataFrame({
    "id": X_test.index.astype(str),
    "score": test_preds
})

# --- Sanity checks ---
assert submission["id"].nunique() == len(submission), "Duplicate IDs in submission."
assert np.isfinite(submission["score"]).all(), "Found NaN/inf in scores."
# Optional: clip if your target is bounded
# submission["score"] = submission["score"].clip(1.0, 6.0)

print(submission.head())
print("‚úÖ submission shape:", submission.shape)

# --- Save for Kaggle grader ---
# submission.to_csv("/kaggle/working/submission.csv", index=False)
# print("üíæ wrote /kaggle/working/submission.csv")'''

'# --- Build submission (id must be string) ---\nsubmission = pd.DataFrame({\n    "id": X_test.index.astype(str),\n    "score": test_preds\n})\n\n# --- Sanity checks ---\nassert submission["id"].nunique() == len(submission), "Duplicate IDs in submission."\nassert np.isfinite(submission["score"]).all(), "Found NaN/inf in scores."\n# Optional: clip if your target is bounded\n# submission["score"] = submission["score"].clip(1.0, 6.0)\n\nprint(submission.head())\nprint("‚úÖ submission shape:", submission.shape)\n\n# --- Save for Kaggle grader ---\n# submission.to_csv("/kaggle/working/submission.csv", index=False)\n# print("üíæ wrote /kaggle/working/submission.csv")'