In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/enigma26/Engima26_Dataset/test.xlsx
/kaggle/input/enigma26/Engima26_Dataset/train.xlsx
/kaggle/input/enigma26/Engima26_Dataset/target.csv


In [2]:
# =============================================================================
# 🏆 ENIGMA 2027 — BUCKET + KNN HYBRID GENERATOR (FINAL CHAMPIONSHIP BUILD)
# =============================================================================
# GOAL:
#   • Near-zero MSE on training
#   • Strong private LB generalization
#   • Reverse discrete generator behavior
#
# STRATEGY:
#   1) Learn quantized score buckets from training data
#   2) Memory-based KNN with distance weighting
#   3) Hybrid confidence-based blending
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree
import os, re, time, warnings, gc

# Suppress every warning for the rest of the session (keeps the output clean,
# but also hides deprecation notices).
warnings.filterwarnings("ignore")
# Seed NumPy's legacy global random state.
np.random.seed(42)

# =============================================================================
# CONFIG
# =============================================================================
# Directory holding train.xlsx, test.xlsx and target.csv.
DATA_DIR = "/kaggle/input/enigma26/Engima26_Dataset"

KNN_K = 3  # number of neighbours requested from the KDTree per query
# Per-feature multipliers applied before building / querying the KDTree.
FEATURE_WEIGHTS = np.array([1.0, 0.7, 0.7, 0.5])  # ALL, BI, BO, CO
BUCKET_DECIMALS = 4   # j_all rounding for bucket key
HYBRID_ALPHA = 0.7   # Bucket weight (0.7 bucket, 0.3 KNN)

print("=" * 80)
print("üèÜ ENIGMA 2027 ‚Äî BUCKET + KNN HYBRID GENERATOR")
print("=" * 80)
print("Using dataset:", DATA_DIR)

# =============================================================================
# HELPERS
# =============================================================================
def normalize_token(x):
    """Canonicalise one raw token so equivalent spellings compare equal.

    Steps: lowercase, expand ``&`` to the word ``and``, drop every character
    that is neither a word character nor whitespace, then collapse runs of
    whitespace and trim the ends.

    Args:
        x: Raw token; converted with ``str()`` first.

    Returns:
        The normalised token as a ``str``.
    """
    x = str(x).lower()
    # Expand '&' BEFORE the punctuation filter. Previously the replacement ran
    # last, after '&' had already been deleted, so it never had any effect.
    x = x.replace("&", " and ")
    x = re.sub(r"[^\w\s]", "", x)
    # Collapse whitespace last so gaps left by removed punctuation do not
    # survive as double spaces.
    return re.sub(r"\s+", " ", x).strip()

def parse_set(val):
    """Split a ';'-separated cell into a frozenset of normalised tokens.

    Missing values, blank strings and the literal text "nan" produce an empty
    frozenset. Pieces that are blank after stripping are skipped.
    """
    text = str(val)
    if pd.isna(val) or text.strip() in ("", "nan"):
        return frozenset()
    tokens = []
    for piece in text.split(";"):
        if piece.strip():
            tokens.append(normalize_token(piece))
    return frozenset(tokens)

def jaccard(a, b):
    """Return |a & b| / |a | b|, or 0.0 when both inputs are empty."""
    if not (a or b):
        return 0.0
    union = a | b
    if not union:
        return 0.0
    return len(a & b) / len(union)

# =============================================================================
# LOAD DATA
# =============================================================================
# Profiles are read from the Excel sheets; target.csv supplies the scored pairs.
train_df = pd.read_excel(f"{DATA_DIR}/train.xlsx")
test_df = pd.read_excel(f"{DATA_DIR}/test.xlsx")
target_df = pd.read_csv(f"{DATA_DIR}/target.csv")

print(f"Train: {len(train_df)} | Test: {len(test_df)} | Pairs: {len(target_df)}")

# =============================================================================
# PREPROCESS
# =============================================================================
# Adds the parsed set columns to both frames in place.
for df in [train_df, test_df]:
    df["BI"] = df["Business_Interests"].apply(parse_set)
    df["BO"] = df["Business_Objectives"].apply(parse_set)
    df["CO"] = df["Constraints"].apply(parse_set)
    # ALL is the union of the three parsed sets of the same row.
    df["ALL"] = df.apply(lambda r: r["BI"] | r["BO"] | r["CO"], axis=1)

# Profile_ID -> row tuple, so the pair loops can fetch a profile by id.
train_lookup = {r.Profile_ID: r for r in train_df.itertuples()}
test_lookup = {r.Profile_ID: r for r in test_df.itertuples()}

# =============================================================================
# BUILD TRAIN FEATURE MEMORY
# =============================================================================
print("\nBuilding feature memory...")

# Only pairs of two different users are memorised here.
pairs = target_df[target_df.src_user_id != target_df.dst_user_id]
N = len(pairs)

# One row per pair: Jaccard similarity of the ALL, BI, BO and CO sets.
X = np.zeros((N, 4), dtype=np.float32)
Y = np.zeros(N, dtype=np.float32)

for row_idx, pair in enumerate(pairs.itertuples()):
    left = train_lookup[pair.src_user_id]
    right = train_lookup[pair.dst_user_id]

    X[row_idx] = [
        jaccard(getattr(left, field), getattr(right, field))
        for field in ("ALL", "BI", "BO", "CO")
    ]
    Y[row_idx] = pair.compatibility_score

# =============================================================================
# KNN MEMORY
# =============================================================================
# Scale every feature column by its weight, then index the scaled points.
XW = X * FEATURE_WEIGHTS
print("Building KDTree...")
tree = KDTree(XW)

# =============================================================================
# BUCKET SYSTEM
# =============================================================================
print("Learning score buckets...")

# Collect the training targets that share the same rounded j_all value.
bucket_members = {}
for feats, target in zip(X, Y):
    bucket_members.setdefault(round(feats[0], BUCKET_DECIMALS), []).append(target)

# Each bucket is represented by the mean of the targets collected for it.
bucket_map = {
    bucket: float(np.mean(members)) for bucket, members in bucket_members.items()
}

bucket_keys = np.array(sorted(bucket_map))
BUCKET_EPS = np.std(Y) * 0.1  # Adaptive tolerance

# =============================================================================
# LOOKUPS
# =============================================================================
def bucket_lookup(j_all):
    """Return (bucket mean, True) for the bucket key nearest to ``j_all``.

    When the nearest key is farther than BUCKET_EPS away, (None, False) is
    returned instead.
    """
    nearest = np.argmin(np.abs(bucket_keys - j_all))
    nearest_key = bucket_keys[nearest]
    if not (abs(nearest_key - j_all) <= BUCKET_EPS):
        return None, False
    return bucket_map[nearest_key], True

def knn_lookup(f_vec):
    """Inverse-distance weighted mean of the targets of the KNN_K nearest points.

    ``f_vec`` is reshaped to a single row and scaled by FEATURE_WEIGHTS before
    the tree is queried; 1e-6 is added to every distance so an exact match does
    not divide by zero.
    """
    query = f_vec.reshape(1, -1) * FEATURE_WEIGHTS
    distances, neighbours = tree.query(query, k=KNN_K)

    weights = 1 / (distances[0] + 1e-6)
    neighbour_scores = Y[neighbours[0]]
    return float(np.sum(weights * neighbour_scores) / np.sum(weights))

# =============================================================================
# HYBRID GENERATOR
# =============================================================================
def hybrid_generator(u1, u2):
    """Predict the score of a profile pair from the bucket map and the KNN memory.

    The four Jaccard features (ALL, BI, BO, CO) are computed from the two
    profiles. If a bucket is found for the ALL similarity, the result blends
    bucket and KNN scores with HYBRID_ALPHA; otherwise the KNN score is returned.
    """
    overall = jaccard(u1.ALL, u2.ALL)
    per_field = [jaccard(getattr(u1, name), getattr(u2, name)) for name in ("BI", "BO", "CO")]
    features = np.array([overall] + per_field)

    bucket_score, found = bucket_lookup(overall)
    knn_score = knn_lookup(features)

    if not found:
        return knn_score
    return HYBRID_ALPHA * bucket_score + (1 - HYBRID_ALPHA) * knn_score

# =============================================================================
# SELF SCORE
# =============================================================================
# Score given to a user paired with itself: taken from the first such row in
# the targets, or 0.0 when the targets contain none.
self_pairs = target_df[target_df.src_user_id == target_df.dst_user_id]
if len(self_pairs):
    SELF_SCORE = float(self_pairs.compatibility_score.iloc[0])
else:
    SELF_SCORE = 0.0

# =============================================================================
# TRAIN MSE CHECK
# =============================================================================
print("\nVerifying TRAIN MSE...")

# Re-predict every memorised pair and compare against its label.
train_preds = [
    hybrid_generator(train_lookup[pair.src_user_id], train_lookup[pair.dst_user_id])
    for pair in pairs.itertuples()
]
train_true = [pair.compatibility_score for pair in pairs.itertuples()]

train_mse = np.mean((np.array(train_preds) - np.array(train_true)) ** 2)
print("üèÜ TRAIN MSE:", train_mse)

# =============================================================================
# GENERATE SUBMISSION
# =============================================================================
print("\nGenerating submission...")

# Every ordered pair (src, dst) of test profile ids gets one row, including src == dst.
ids = sorted(test_df.Profile_ID.unique())
results = []

start = time.time()

for i, src in enumerate(ids):
    u1 = test_lookup[src]
    for dst in ids:
        if src == dst:
            # Self-pairs are not predicted; they reuse the score read from the targets.
            score = SELF_SCORE
        else:
            u2 = test_lookup[dst]
            score = hybrid_generator(u1, u2)

        results.append({
            "ID": f"{src}_{dst}",
            "compatibility_score": float(score)
        })

    # Progress report every 50 source profiles; ETA extrapolates the mean time per source.
    if (i + 1) % 50 == 0:
        elapsed = time.time() - start
        eta = elapsed / (i + 1) * (len(ids) - i - 1)
        print(f"Progress: {i+1}/{len(ids)} | ETA: {eta:.1f}s")

submission = pd.DataFrame(results)
submission.to_csv("submission_hybrid.csv", index=False)

print("\nüèÜ DONE!")
print("File: submission_hybrid.csv")
print("Rows:", len(submission))
print("Time:", round(time.time() - start, 1), "seconds")

# As the last expression of the cell, the return value of gc.collect() is displayed as the cell output.
gc.collect()

üèÜ ENIGMA 2027 ‚Äî BUCKET + KNN HYBRID GENERATOR
Using dataset: /kaggle/input/enigma26/Engima26_Dataset
Train: 600 | Test: 400 | Pairs: 360000

Building feature memory...
Building KDTree...
Learning score buckets...

Verifying TRAIN MSE...
üèÜ TRAIN MSE: 0.0020865913598023043

Generating submission...
Progress: 50/400 | ETA: 122.3s
Progress: 100/400 | ETA: 101.0s
Progress: 150/400 | ETA: 85.1s
Progress: 200/400 | ETA: 67.9s
Progress: 250/400 | ETA: 49.9s
Progress: 300/400 | ETA: 33.4s
Progress: 350/400 | ETA: 16.6s
Progress: 400/400 | ETA: 0.0s

üèÜ DONE!
File: submission_hybrid.csv
Rows: 160000
Time: 133.2 seconds


1329

In [3]:
"""
======================================================================================
ENIGMA 2027 - ULTRA CHAMPIONSHIP SOLUTION v3.0
======================================================================================
🔥 MAXIMUM OPTIMIZATION FOR LOWEST POSSIBLE MSE 🔥

UPGRADES FROM v2.0:
  1. Multi-Model Ensemble (LightGBM + XGBoost + CatBoost + GradientBoosting + Ridge + MLP + ExtraTrees)
  2. Stacking with Meta-Learner
  3. 50+ Enhanced Features
  4. Hyperparameter Optimization
  5. Advanced Feature Engineering
  6. K-Fold Stacking for Robustness

======================================================================================
"""

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
import os
import re
import warnings
warnings.filterwarnings('ignore')

# Import all available ML libraries
# Records which optional boosting libraries could be imported. Each import is
# best-effort: a failure only marks the library as unavailable.
# `except Exception` replaces the former bare `except:` so that
# KeyboardInterrupt / SystemExit are no longer swallowed, while any import-time
# failure (missing package, broken native library, ...) is still tolerated.
MODELS_AVAILABLE = {}

try:
    import lightgbm as lgb
    MODELS_AVAILABLE['lgb'] = True
except Exception:
    MODELS_AVAILABLE['lgb'] = False

try:
    import xgboost as xgb
    MODELS_AVAILABLE['xgb'] = True
except Exception:
    MODELS_AVAILABLE['xgb'] = False

try:
    from catboost import CatBoostRegressor
    MODELS_AVAILABLE['catboost'] = True
except Exception:
    MODELS_AVAILABLE['catboost'] = False

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor

print("=" * 70)
print("üî• ENIGMA 2027 - ULTRA CHAMPIONSHIP SOLUTION v3.0 üî•")
print("=" * 70)
print(f"\nModels available: {MODELS_AVAILABLE}")

# =============================================================================
# STEP 1: LOAD DATA
# =============================================================================
print("\n[1] Loading data...")

# Use the Kaggle dataset directory when it exists, otherwise the current directory.
DATA_DIR = '/kaggle/input/enigma26/Engima26_Dataset'
if not os.path.exists(DATA_DIR):
    DATA_DIR = '.'
print(f"  ‚Üí Using: {DATA_DIR}")

# NOTE: this rebinds train_df / test_df / target_df that the previous cell also defines.
train_df = pd.read_excel(f'{DATA_DIR}/train.xlsx')
test_df = pd.read_excel(f'{DATA_DIR}/test.xlsx')
target_df = pd.read_csv(f'{DATA_DIR}/target.csv')

print(f"  Train: {len(train_df)} users | Test: {len(test_df)} users | Pairs: {len(target_df)}")

# =============================================================================
# STEP 2: DOMAIN KNOWLEDGE MATRICES
# =============================================================================
print("\n[2] Building domain knowledge...")

# Hand-written score for a pair of role keywords. role_score() matches the two
# keywords as substrings of the two roles, in either order, and returns the
# score of the first entry that matches.
ROLE_SYNERGY = {
    ('founder', 'investor'): 1.0, ('founder', 'mentor'): 0.95, ('founder', 'advisor'): 0.9,
    ('founder', 'engineer'): 0.8, ('founder', 'developer'): 0.8, ('investor', 'ceo'): 0.95,
    ('engineer', 'manager'): 0.85, ('developer', 'manager'): 0.85, ('cto', 'engineer'): 0.9,
    ('cto', 'developer'): 0.9, ('ceo', 'cto'): 0.9, ('ceo', 'cfo'): 0.85,
    ('sales', 'marketing'): 0.9, ('sales', 'product'): 0.8, ('marketing', 'product'): 0.85,
    ('consultant', 'executive'): 0.85, ('consultant', 'manager'): 0.8,
    ('analyst', 'manager'): 0.8, ('analyst', 'consultant'): 0.85,
    ('data', 'engineer'): 0.9, ('data', 'analyst'): 0.9,
    ('hr', 'manager'): 0.8, ('hr', 'executive'): 0.75,
}

# Cluster name -> keywords. industry_score() tests each keyword as a substring
# and keeps the LAST cluster that matches, so the order of this dict matters.
# 'fintech' and 'healthtech' are listed under two clusters each.
# NOTE(review): short keywords such as 'ai' and 'it' also occur inside unrelated
# words (e.g. "retail", "digital") - confirm substring matching is intended.
INDUSTRY_CLUSTERS = {
    'tech': ['technology', 'software', 'saas', 'ai', 'fintech', 'edtech', 'healthtech', 'tech', 'it'],
    'finance': ['finance', 'banking', 'investment', 'fintech', 'insurance', 'financial'],
    'healthcare': ['healthcare', 'medical', 'biotech', 'pharma', 'healthtech', 'health'],
    'media': ['media', 'entertainment', 'content', 'gaming', 'digital'],
    'retail': ['retail', 'e-commerce', 'ecommerce', 'consumer', 'cpg'],
    'manufacturing': ['manufacturing', 'industrial', 'supply chain', 'logistics'],
}

def role_score(r1, r2):
    """Score how well two roles complement each other.

    Returns 0.5 when either role is empty or missing, 0.7 when both roles are
    equal after lowercasing and stripping, the score of the first ROLE_SYNERGY
    entry whose two keywords occur in the two roles (in either order), and 0.4
    when nothing matches.
    """
    if not (r1 and r2) or pd.isna(r1) or pd.isna(r2):
        return 0.5
    left = str(r1).lower().strip()
    right = str(r2).lower().strip()
    if left == right:
        return 0.7
    matching_scores = (
        score
        for (first, second), score in ROLE_SYNERGY.items()
        if (first in left and second in right) or (second in left and first in right)
    )
    return next(matching_scores, 0.4)

def industry_score(i1, i2):
    """Score how closely two industry labels align.

    Returns 0.5 when either label is empty or missing and 1.0 when both labels
    are equal after lowercasing and stripping. Otherwise each label is assigned
    the last INDUSTRY_CLUSTERS entry with a keyword occurring in it: 0.85 when
    both fall in the same cluster, 0.35 when they fall in different clusters,
    and 0.3 when at least one label matches no cluster.
    """
    if not i1 or not i2 or pd.isna(i1) or pd.isna(i2): return 0.5
    # Strip surrounding whitespace as role_score() does, so labels such as
    # "Finance " and "finance" count as the same industry.
    i1, i2 = str(i1).lower().strip(), str(i2).lower().strip()
    if i1 == i2: return 1.0
    c1 = c2 = None
    for c, lst in INDUSTRY_CLUSTERS.items():
        # No break: a later matching cluster overrides an earlier one.
        if any(x in i1 for x in lst): c1 = c
        if any(x in i2 for x in lst): c2 = c
    if c1 and c2: return 0.85 if c1 == c2 else 0.35
    return 0.3

print("  ‚úì Enhanced role synergy & industry clusters")

# =============================================================================
# STEP 3: PREPROCESSING (ULTRA TEXT NORMALIZATION)
# =============================================================================
print("\n[3] Ultra text normalization...")

# Long-form phrase -> abbreviation. Applied after punctuation removal, which is
# why e.g. "r&d" (-> "rd") and "research and development" end up identical.
# Built once at import time instead of on every call.
_TOKEN_SYNONYMS = {
    'artificial intelligence': 'ai', 'machine learning': 'ml',
    'deep learning': 'dl', 'natural language processing': 'nlp',
    'computer vision': 'cv', 'data science': 'ds',
    'software as a service': 'saas', 'platform as a service': 'paas',
    'infrastructure as a service': 'iaas',
    'business to business': 'b2b', 'business to consumer': 'b2c',
    'research and development': 'rd', 'mergers and acquisitions': 'ma',
    'chief executive officer': 'ceo', 'chief technology officer': 'cto',
    'chief financial officer': 'cfo', 'chief operating officer': 'coo',
    'venture capital': 'vc', 'private equity': 'pe',
    'initial public offering': 'ipo',
}


def normalize_token(x):
    """Canonicalise one raw token so equivalent spellings compare equal.

    Steps: lowercase; turn a standalone ``&`` into the word ``and``; remove
    every character that is neither a word character nor whitespace; collapse
    whitespace; finally replace known long-form phrases by their abbreviation.

    Args:
        x: Raw token; converted with ``str()`` first.

    Returns:
        The normalised token as a ``str``.
    """
    x = str(x).lower()
    # A standalone '&' ("research & development") becomes "and" so the phrase
    # can hit the synonym table. An embedded '&' ("r&d") is left for the
    # punctuation filter, giving "rd" - the same abbreviation the table uses.
    # The former trailing replace('&', 'and') ran after '&' had already been
    # removed and therefore never did anything.
    x = re.sub(r'(?<!\S)&(?!\S)', ' and ', x)
    x = re.sub(r'[^\w\s]', '', x)  # Remove punctuation
    # Collapse whitespace AFTER punctuation removal so no double spaces remain
    # to block a synonym match.
    x = re.sub(r'\s+', ' ', x).strip()
    for phrase, short in _TOKEN_SYNONYMS.items():
        x = x.replace(phrase, short)
    return x

def parse_set(val):
    """Split a ';'-separated cell into a set of normalised tokens.

    Missing values and the literal text 'nan' give an empty set; pieces that
    are blank or equal to 'nan' after stripping are skipped.
    """
    text = str(val)
    if pd.isna(val) or text == 'nan':
        return set()
    tokens = set()
    for piece in text.split(';'):
        trimmed = piece.strip()
        if trimmed and trimmed != 'nan':
            tokens.add(normalize_token(piece))
    return tokens

def preprocess(df):
    """Return a copy of ``df`` with the parsed set columns added.

    Adds BI, BO and CO (parsed from Business_Interests, Business_Objectives and
    Constraints), ALL (union of the three) and BI_BO (union of BI and BO).
    The input frame is not modified.
    """
    out = df.copy()
    for target_col, source_col in (
        ('BI', 'Business_Interests'),
        ('BO', 'Business_Objectives'),
        ('CO', 'Constraints'),
    ):
        out[target_col] = out[source_col].apply(parse_set)
    out['ALL'] = out.apply(lambda row: row['BI'] | row['BO'] | row['CO'], axis=1)
    out['BI_BO'] = out.apply(lambda row: row['BI'] | row['BO'], axis=1)
    return out

train_p = preprocess(train_df)
test_p = preprocess(test_df)
print(f"  ‚úì Parsed {len(train_p)} train + {len(test_p)} test profiles")

# =============================================================================
# STEP 4: SIMILARITY FUNCTIONS
# =============================================================================
print("\n[4] Defining similarity functions...")

def jaccard(s1, s2):
    """Jaccard similarity |s1 & s2| / |s1 | s2|; 0.0 when both sets are empty."""
    if not (s1 or s2):
        return 0.0
    union_size = len(s1 | s2)
    if union_size > 0:
        return len(s1 & s2) / union_size
    return 0.0

def dice(s1, s2):
    """Dice coefficient 2*|s1 & s2| / (|s1| + |s2|); 0.0 when both sets are empty."""
    if not (s1 or s2):
        return 0.0
    total_size = len(s1) + len(s2)
    if total_size > 0:
        return 2 * len(s1 & s2) / total_size
    return 0.0

def overlap_coef(s1, s2):
    """Overlap coefficient |s1 & s2| / min(|s1|, |s2|); 0.0 when either set is empty."""
    if not (s1 and s2):
        return 0.0
    smaller = min(len(s1), len(s2))
    if smaller > 0:
        return len(s1 & s2) / smaller
    return 0.0

def union_jaccard(r1, r2):
    """Jaccard similarity of the 'ALL' sets of two profile rows."""
    left_all, right_all = r1['ALL'], r2['ALL']
    return jaccard(left_all, right_all)

print("  ‚úì Jaccard, Dice, Overlap coefficient")

# =============================================================================
# STEP 5: FORMULA DISCOVERY
# =============================================================================
print("\n[5] Discovering optimal formula...")

# Rows of the targets where a user is paired with itself.
self_pairs = target_df[target_df['src_user_id'] == target_df['dst_user_id']]
# Only the first self-pair row is read; 1.0 is the fallback when there is none.
SELF_SCORE = self_pairs['compatibility_score'].iloc[0] if len(self_pairs) > 0 else 1.0
print(f"  Self-pairs: {len(self_pairs)}, score={SELF_SCORE}")

# Profile_ID -> preprocessed profile row (a pandas Series from iterrows()).
train_lookup = {r['Profile_ID']: r for _, r in train_p.iterrows()}

def test_formula(func):
    """Evaluate ``func(row_a, row_b)`` on every non-self training pair.

    Returns a tuple (MSE against the labels, predictions array, labels array).
    """
    scored_pairs = [
        row for row in target_df.itertuples()
        if not (row.src_user_id == row.dst_user_id)
    ]
    preds = [
        func(train_lookup[row.src_user_id], train_lookup[row.dst_user_id])
        for row in scored_pairs
    ]
    acts = [row.compatibility_score for row in scored_pairs]
    return mean_squared_error(acts, preds), np.array(preds), np.array(acts)

# Track the candidate formula with the lowest MSE on the training pairs.
best_mse, best_func, best_name = float('inf'), None, ""
# Only one candidate is registered, so it is selected whenever its MSE is finite.
formulas = [("Union Jaccard", union_jaccard)]
for name, func in formulas:
    mse, preds, acts = test_formula(func)
    print(f"    {name}: MSE={mse:.10f}")
    if mse < best_mse:
        best_mse, best_func, best_name = mse, func, name
        # Keep the winner's predictions and the matching labels.
        formula_preds, y_train = preds, acts

print(f"\n  ‚òÖ WINNER: {best_name} (MSE={best_mse:.12f})")

# =============================================================================
# STEP 6: ULTRA FEATURE ENGINEERING (50+ Features)
# =============================================================================
print("\n[6] Ultra feature engineering (50+ features)...")

def extract_ultra_features(r1, r2):
    """Build the feature dict describing one ordered pair of profile rows.

    Both rows must expose the set-valued entries 'BI', 'BO', 'CO', 'ALL' and
    'BI_BO'. Demographic features are added only when the corresponding key is
    present in both rows. Keys are inserted in a fixed order, so DataFrames
    built from these dicts share one column order.
    """
    bi1, bi2 = r1['BI'], r2['BI']
    bo1, bo2 = r1['BO'], r2['BO']
    co1, co2 = r1['CO'], r2['CO']
    all1, all2 = r1['ALL'], r2['ALL']

    n_bi1, n_bi2 = len(bi1), len(bi2)
    n_all1, n_all2 = len(all1), len(all2)
    n_all_common = len(all1 & all2)

    def _in_both(key):
        # True when the key exists in both rows.
        return key in r1 and key in r2

    def _filled(row, key, default):
        # Value stored under key, or the default when that value is missing.
        value = row.get(key)
        return value if pd.notna(value) else default

    def _same_text(key):
        # 1 when both rows hold the same text (case-insensitive) under key, else 0.
        return 1 if str(r1.get(key, '')).lower() == str(r2.get(key, '')).lower() else 0

    f = {
        # --- Jaccard similarities ---
        'j_all': jaccard(all1, all2),
        'j_bi': jaccard(bi1, bi2),
        'j_bo': jaccard(bo1, bo2),
        'j_co': jaccard(co1, co2),
        'j_bi_bo': jaccard(r1['BI_BO'], r2['BI_BO']),
        # --- Dice coefficients ---
        'dice_all': dice(all1, all2),
        'dice_bi': dice(bi1, bi2),
        'dice_bo': dice(bo1, bo2),
        # --- Overlap coefficients ---
        'overlap_all': overlap_coef(all1, all2),
        'overlap_bi': overlap_coef(bi1, bi2),
        # --- Set sizes ---
        'bi_size_1': n_bi1,
        'bi_size_2': n_bi2,
        'bo_size_1': len(bo1),
        'bo_size_2': len(bo2),
        'co_size_1': len(co1),
        'co_size_2': len(co2),
        'all_size_1': n_all1,
        'all_size_2': n_all2,
        # --- Intersection sizes ---
        'bi_inter': len(bi1 & bi2),
        'bo_inter': len(bo1 & bo2),
        'co_inter': len(co1 & co2),
        'all_inter': n_all_common,
        # --- Union sizes ---
        'bi_union': len(bi1 | bi2),
        'bo_union': len(bo1 | bo2),
        'co_union': len(co1 | co2),
        'all_union': len(all1 | all2),
        # --- Ratios (1e-6 keeps the denominators non-zero) ---
        'overlap_ratio_1': n_all_common / (n_all1 + 1e-6),
        'overlap_ratio_2': n_all_common / (n_all2 + 1e-6),
        'size_ratio': min(n_all1, n_all2) / (max(n_all1, n_all2) + 1e-6),
        'size_diff': abs(n_all1 - n_all2),
        'size_sum': n_all1 + n_all2,
        'size_product': n_all1 * n_all2,
        # --- Cross-category overlaps ---
        'bi_bo_cross': len(bi1 & bo2) + len(bo1 & bi2),
        'bi_co_cross': len(bi1 & co2) + len(co1 & bi2),
        'bo_co_cross': len(bo1 & co2) + len(co1 & bo2),
        # --- Size asymmetry ---
        'asymmetry_bi': abs(n_bi1 - n_bi2) / (n_bi1 + n_bi2 + 1e-6),
        'asymmetry_all': abs(n_all1 - n_all2) / (n_all1 + n_all2 + 1e-6),
    }

    # --- Demographic features (only for keys present in both rows) ---
    if _in_both('Age'):
        age1 = _filled(r1, 'Age', 30)
        age2 = _filled(r2, 'Age', 30)
        f['age_diff'] = abs(age1 - age2)
        f['age_sum'] = age1 + age2
        f['age_min'] = min(age1, age2)
        f['age_max'] = max(age1, age2)
        f['age_ratio'] = min(age1, age2) / (max(age1, age2) + 1e-6)

    if _in_both('Role'):
        f['role_synergy'] = role_score(r1.get('Role'), r2.get('Role'))
        f['same_role'] = _same_text('Role')

    if _in_both('Industry'):
        f['industry_align'] = industry_score(r1.get('Industry'), r2.get('Industry'))
        f['same_industry'] = _same_text('Industry')

    if _in_both('Location_City'):
        f['same_city'] = _same_text('Location_City')

    if _in_both('Seniority_Level'):
        f['same_seniority'] = _same_text('Seniority_Level')
        # Levels not listed in the map are treated as 2.
        seniority_rank = {'junior': 1, 'mid': 2, 'senior': 3, 'executive': 4}
        rank1 = seniority_rank.get(str(r1.get('Seniority_Level', '')).lower(), 2)
        rank2 = seniority_rank.get(str(r2.get('Seniority_Level', '')).lower(), 2)
        f['seniority_diff'] = abs(rank1 - rank2)

    if _in_both('Gender'):
        f['same_gender'] = _same_text('Gender')

    if _in_both('Company_Size_Employees'):
        size1 = _filled(r1, 'Company_Size_Employees', 100)
        size2 = _filled(r2, 'Company_Size_Employees', 100)
        f['company_size_ratio'] = min(size1, size2) / (max(size1, size2) + 1e-6)
        f['company_size_diff'] = abs(size1 - size2)

    # --- Polynomial terms of the key similarities ---
    j_all, j_bi, j_bo = f['j_all'], f['j_bi'], f['j_bo']
    f['j_all_sq'] = j_all ** 2
    f['j_bi_sq'] = j_bi ** 2
    f['j_all_j_bi'] = j_all * j_bi
    f['j_all_j_bo'] = j_all * j_bo
    f['j_bi_j_bo'] = j_bi * j_bo

    return f

# Build training data
print("  Building training features...")
# One feature dict and one label per non-self pair, in target_df row order.
X_list, y_list = [], []
for row in target_df.itertuples():
    if row.src_user_id == row.dst_user_id: continue
    X_list.append(extract_ultra_features(train_lookup[row.src_user_id], train_lookup[row.dst_user_id]))
    y_list.append(row.compatibility_score)

X_train = pd.DataFrame(X_list)
# Rebinds y_train, which the formula-search loop above also assigns.
y_train = np.array(y_list)
print(f"  ‚úì Features: {X_train.shape[1]} | Samples: {len(y_train)}")

# Scale features for neural network
# The scaler is fitted on the full training matrix; the same fitted scaler is
# applied to the test features later in this cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# =============================================================================
# STEP 7: MULTI-MODEL TRAINING WITH STACKING
# =============================================================================
print("\n[7] Training multi-model ensemble with stacking...")

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_predictions = {}
models = {}


def _fit_with_oof(key, label, model, features):
    """Run the shared train/evaluate routine for one base model.

    Produces out-of-fold predictions with ``kf``, refits the model on all of
    ``features``, prints the OOF MSE and registers both the predictions and the
    fitted model under ``key``. Returns ``(oof_predictions, oof_mse)``.
    """
    print(f"  Training {label}...")
    oof = cross_val_predict(model, features, y_train, cv=kf)
    model.fit(features, y_train)
    mse = mean_squared_error(y_train, oof)
    print(f"    {label} OOF MSE: {mse:.10f}")
    oof_predictions[key] = oof
    models[key] = model
    return oof, mse


# --- Model 1: LightGBM ---
if MODELS_AVAILABLE.get('lgb', False):
    lgb_params = dict(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        num_leaves=31,
        min_child_samples=20,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        verbose=-1,
    )
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    oof_lgb, lgb_mse = _fit_with_oof('lgb', 'LightGBM', lgb_model, X_train)

# --- Model 2: XGBoost ---
if MODELS_AVAILABLE.get('xgb', False):
    xgb_params = dict(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        verbosity=0,
    )
    xgb_model = xgb.XGBRegressor(**xgb_params)
    oof_xgb, xgb_mse = _fit_with_oof('xgb', 'XGBoost', xgb_model, X_train)

# --- Model 3: CatBoost ---
if MODELS_AVAILABLE.get('catboost', False):
    cat_model = CatBoostRegressor(
        iterations=500,
        depth=8,
        learning_rate=0.05,
        random_state=42,
        verbose=0,
    )
    oof_cat, cat_mse = _fit_with_oof('catboost', 'CatBoost', cat_model, X_train)

# --- Model 4: Gradient Boosting (sklearn) ---
gb_model = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    random_state=42,
)
oof_gb, gb_mse = _fit_with_oof('gb', 'GradientBoosting', gb_model, X_train)

# --- Model 5: Ridge Regression (uses the standardised features) ---
ridge_model = Ridge(alpha=1.0)
oof_ridge, ridge_mse = _fit_with_oof('ridge', 'Ridge', ridge_model, X_train_scaled)

# --- Model 6: MLP Neural Network (uses the standardised features) ---
mlp_model = MLPRegressor(
    hidden_layer_sizes=(128, 64, 32),
    activation='relu',
    solver='adam',
    alpha=0.001,
    learning_rate='adaptive',
    max_iter=500,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
)
oof_mlp, mlp_mse = _fit_with_oof('mlp', 'MLP', mlp_model, X_train_scaled)

# --- Model 7: Extra Trees ---
et_model = ExtraTreesRegressor(
    n_estimators=300,
    max_depth=12,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)
oof_et, et_mse = _fit_with_oof('et', 'ExtraTrees', et_model, X_train)

# =============================================================================
# STEP 8: META-LEARNER STACKING
# =============================================================================
print("\n[8] Training meta-learner for stacking...")

# Create stacking features from OOF predictions
# One column per base model, in the insertion order of oof_predictions.
stack_train = np.column_stack([oof_predictions[k] for k in oof_predictions.keys()])
print(f"  Stacking features shape: {stack_train.shape}")

# Meta-learner: Ridge regression on OOF predictions
meta_model = Ridge(alpha=0.5)
oof_meta = cross_val_predict(meta_model, stack_train, y_train, cv=kf)
meta_model.fit(stack_train, y_train)
meta_mse = mean_squared_error(y_train, oof_meta)
print(f"  Meta-learner OOF MSE: {meta_mse:.10f}")

# Also try simple weighted average
print("\n  Finding optimal ensemble weights...")
# NOTE(review): best_ensemble_mse and best_weights are assigned here but not
# read anywhere in the visible code, and no grid search follows.
best_ensemble_mse = float('inf')
best_weights = None

# Grid search for weights
model_names = list(oof_predictions.keys())
n_models = len(model_names)

# Simple averaging first
avg_pred = np.mean([oof_predictions[k] for k in model_names], axis=0)
avg_mse = mean_squared_error(y_train, avg_pred)
print(f"  Simple average MSE: {avg_mse:.10f}")

# Weighted by inverse MSE
# Each weight is proportional to 1 / (OOF MSE + 1e-10); the weights sum to 1.
model_mses = {k: mean_squared_error(y_train, oof_predictions[k]) for k in model_names}
inv_mses = {k: 1/(v + 1e-10) for k, v in model_mses.items()}
total_inv = sum(inv_mses.values())
opt_weights = {k: v/total_inv for k, v in inv_mses.items()}

weighted_pred = np.zeros(len(y_train))
for k, w in opt_weights.items():
    weighted_pred += w * oof_predictions[k]
weighted_mse = mean_squared_error(y_train, weighted_pred)
print(f"  Inverse-MSE weighted MSE: {weighted_mse:.10f}")

# Choose best
# The meta-learner wins only when strictly better than both averages;
# otherwise the better of the two averages is used (ties go to the simple average).
if meta_mse < min(avg_mse, weighted_mse):
    USE_META = True
    final_train_mse = meta_mse
    print(f"\n  ‚òÖ Using META-LEARNER (MSE={meta_mse:.10f})")
elif weighted_mse < avg_mse:
    USE_META = False
    final_train_mse = weighted_mse
    print(f"\n  ‚òÖ Using WEIGHTED ENSEMBLE (MSE={weighted_mse:.10f})")
else:
    USE_META = False
    # Replace the inverse-MSE weights with uniform ones.
    opt_weights = {k: 1/n_models for k in model_names}
    final_train_mse = avg_mse
    print(f"\n  ‚òÖ Using SIMPLE AVERAGE (MSE={avg_mse:.10f})")

# When the meta-learner was chosen, opt_weights still holds the inverse-MSE
# weights, which the test prediction step does not use in that case.
print(f"\n  Model weights: {opt_weights}")

# =============================================================================
# STEP 9: GENERATE TEST PREDICTIONS
# =============================================================================
print("\n[9] Generating test predictions...")

# Profile_ID -> preprocessed test profile row.
test_lookup = {r['Profile_ID']: r for _, r in test_p.iterrows()}
test_ids = sorted(test_p['Profile_ID'].unique().tolist())
print(f"  Test users: {len(test_ids)} | Pairs: {len(test_ids)**2}")

# Build test features
# `results` holds one row per ordered pair; `test_features` holds one feature
# dict per NON-self pair, in the same relative order.
results = []
test_features = []

for i, src in enumerate(test_ids):
    if i % 100 == 0: print(f"  Processing {i+1}/{len(test_ids)}...")
    src_row = test_lookup[src]
    for dst in test_ids:
        if src == dst:
            # FIX: self-pairs take the score observed for self-pairs in the
            # training targets (SELF_SCORE). The previous
            # max(SELF_SCORE, best_func(row, row)) evaluated the similarity of a
            # profile with itself - 1.0 for any non-empty profile - and so
            # overrode the observed value (0.0 in the captured run).
            results.append({'ID': f"{src}_{dst}", 'is_self': True, 'formula': float(SELF_SCORE)})
        else:
            dst_row = test_lookup[dst]
            test_features.append(extract_ultra_features(src_row, dst_row))
            results.append({'ID': f"{src}_{dst}", 'is_self': False, 'formula': 0})

print("  Computing model predictions...")
X_test = pd.DataFrame(test_features)
X_test_scaled = scaler.transform(X_test)

# Get predictions from all models
# Ridge and MLP were trained on standardised features; the tree models were not.
test_predictions = {}
for name, model in models.items():
    if name in ['ridge', 'mlp']:
        test_predictions[name] = model.predict(X_test_scaled)
    else:
        test_predictions[name] = model.predict(X_test)

# Stack test predictions
# Column order follows oof_predictions so it matches the meta-learner's training input.
stack_test = np.column_stack([test_predictions[k] for k in oof_predictions.keys()])

# Final prediction
if USE_META:
    final_test_preds = meta_model.predict(stack_test)
else:
    final_test_preds = np.zeros(len(X_test))
    for k, w in opt_weights.items():
        final_test_preds += w * test_predictions[k]

# Clip predictions
final_test_preds = np.clip(final_test_preds, 0, 1)

# =============================================================================
# STEP 10: CREATE SUBMISSION
# =============================================================================
print("\n[10] Creating submission...")

sub_df = pd.DataFrame(results)
non_self = ~sub_df['is_self']
# Model predictions fill the non-self rows; this relies on final_test_preds
# being in the same order in which those rows were appended to `results`.
sub_df.loc[non_self, 'compatibility_score'] = final_test_preds.round(4)
# Self rows take the value stored in the 'formula' column.
sub_df.loc[sub_df['is_self'], 'compatibility_score'] = sub_df.loc[sub_df['is_self'], 'formula'].round(4)

sub_df = sub_df[['ID', 'compatibility_score']]

# Validate
# One row per ordered pair, every score inside [0, 1].
assert len(sub_df) == len(test_ids)**2
assert sub_df['compatibility_score'].between(0, 1).all()

print(f"  ‚úì Shape: {sub_df.shape}")
print(f"  ‚úì Score range: [{sub_df['compatibility_score'].min():.4f}, {sub_df['compatibility_score'].max():.4f}]")

# Check self-pairs
# NOTE(review): splitting the ID on '_' assumes profile ids contain no
# underscore themselves - confirm against the data.
self_check = sub_df[sub_df['ID'].apply(lambda x: x.split('_')[0] == x.split('_')[1])]
print(f"  ‚úì Self-pairs: {len(self_check)}, score={self_check['compatibility_score'].iloc[0]}")

print("\n  Sample:")
print(sub_df.head(10).to_string(index=False))

sub_df.to_csv('submission.csv', index=False)
print("\n  ‚úì Saved: submission.csv")

# =============================================================================
# DONE
# =============================================================================
# Final run summary. The box is generated from BOX_WIDTH so every row has the
# same width, and the mis-encoded emoji / box-drawing characters are restored.
print("\n" + "=" * 70)
print("🔥 ULTRA CHAMPIONSHIP SOLUTION COMPLETE! 🔥")
print("=" * 70)

# Name the strategy that was actually selected. The simple-average branch sets
# final_train_mse to avg_mse, while the weighted branch is only taken when
# weighted_mse is strictly smaller than avg_mse, so the comparison is exact.
if USE_META:
    stacking_label = 'META-LEARNER'
elif final_train_mse == avg_mse:
    stacking_label = 'SIMPLE AVERAGE'
else:
    stacking_label = 'WEIGHTED ENSEMBLE'

BOX_WIDTH = 70
box_rows = [
    f"  • Models: {len(models)} (LGB/XGB/CAT/GB/Ridge/MLP/ET)",
    f"  • Features: {X_train.shape[1]}",
    f"  • Stacking: {stacking_label}",
    f"  • Final Train MSE: {final_train_mse:.10f}",
]
box_lines = [
    "╔" + "═" * BOX_WIDTH + "╗",
    f"║{'  ULTRA WINNING CONFIGURATION':<{BOX_WIDTH}}║",
    "╠" + "═" * BOX_WIDTH + "╣",
]
box_lines += [f"║{row:<{BOX_WIDTH}}║" for row in box_rows]
box_lines.append("╚" + "═" * BOX_WIDTH + "╝")

print("\n" + "\n".join(box_lines) + "\n\nMODEL PERFORMANCE:\n")
for name, mse in model_mses.items():
    print(f"   • {name:<12}: MSE={mse:.10f}")

print("\nENSEMBLE WEIGHTS:\n")
for name, w in opt_weights.items():
    print(f"   • {name:<12}: {w:.4f}")

print(f"""
OUTPUT: submission.csv ({len(sub_df):,} rows)

🏆 Good luck! This is maximum optimization! 🏆
""")


üî• ENIGMA 2027 - ULTRA CHAMPIONSHIP SOLUTION v3.0 üî•

Models available: {'lgb': True, 'xgb': True, 'catboost': True}

[1] Loading data...
  ‚Üí Using: /kaggle/input/enigma26/Engima26_Dataset
  Train: 600 users | Test: 400 users | Pairs: 360000

[2] Building domain knowledge...
  ‚úì Enhanced role synergy & industry clusters

[3] Ultra text normalization...
  ‚úì Parsed 600 train + 400 test profiles

[4] Defining similarity functions...
  ‚úì Jaccard, Dice, Overlap coefficient

[5] Discovering optimal formula...
  Self-pairs: 600, score=0.0
    Union Jaccard: MSE=0.0025631469

  ‚òÖ WINNER: Union Jaccard (MSE=0.002563146948)

[6] Ultra feature engineering (50+ features)...
  Building training features...
  ‚úì Features: 57 | Samples: 359400

[7] Training multi-model ensemble with stacking...
  Training LightGBM...
    LightGBM OOF MSE: 0.0004368912
  Training XGBoost...
    XGBoost OOF MSE: 0.0003885447
  Training CatBoost...
    CatBoost OOF MSE: 0.0004403534
  Training GradientBoo