In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree lazily and echo the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/enigma26/Engima26_Dataset/test.xlsx
/kaggle/input/enigma26/Engima26_Dataset/train.xlsx
/kaggle/input/enigma26/Engima26_Dataset/target.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import HistGradientBoostingRegressor

# 1. Load Data
# Assuming files are in the specified Kaggle paths
train_df = pd.read_excel('/kaggle/input/enigma26/Engima26_Dataset/train.xlsx')
test_df = pd.read_excel('/kaggle/input/enigma26/Engima26_Dataset/test.xlsx')
target_df = pd.read_csv('/kaggle/input/enigma26/Engima26_Dataset/target.csv')

# Combine profiles to ensure consistent encoding.
# ignore_index=True avoids duplicated row labels (both frames start at 0), so later
# column assignments on full_profiles never depend on aligning a non-unique index.
full_profiles = (
    pd.concat([train_df, test_df], axis=0, ignore_index=True)
    .drop_duplicates('Profile_ID')
    .reset_index(drop=True)
)

# 2. Advanced Preprocessing
def parse_list(x):
    """Parse a semicolon-delimited cell into a set of normalized tokens.

    Parameters
    ----------
    x : scalar
        Raw cell value; missing values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    set of str
        Lower-cased tokens with surrounding whitespace removed. Empty tokens
        (produced by blank cells, trailing or doubled semicolons) are dropped.
    """
    if pd.isna(x):
        return set()
    # Strip each token individually: "AI; ML" must yield {"ai", "ml"}, not {"ai", " ml"}.
    tokens = (token.strip() for token in str(x).lower().split(';'))
    # Discard empty tokens so two cells with a trailing ';' do not "share" the element "".
    return {token for token in tokens if token}

# Materialise each semicolon-delimited column as a companion "<name>_set" column of
# Python sets, so pairwise set operations do not have to re-parse strings
for list_col in ('Business_Interests', 'Business_Objectives', 'Constraints'):
    full_profiles[f'{list_col}_set'] = full_profiles[list_col].map(parse_list)

# Integer-code the categorical metadata; a fresh encoder is fitted for every column
cat_cols = ['Gender', 'Role', 'Seniority_Level', 'Industry', 'Location_City']
for cat_col in cat_cols:
    as_text = full_profiles[cat_col].astype(str)
    full_profiles[f'{cat_col}_enc'] = LabelEncoder().fit_transform(as_text)

# Index by Profile_ID so that pair features can be looked up directly by ID
profile_map = full_profiles.set_index('Profile_ID')

# 3. Enhanced Feature Engineering Function
def get_advanced_features(pairs, profiles, categorical_cols=None):
    """Build one row of pairwise features for every (src, dst) profile pair.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Must contain the columns ``src_user_id`` and ``dst_user_id``.
    profiles : pandas.DataFrame
        Indexed by profile ID. Must provide ``Age``, a ``<col>_enc`` column for
        every categorical column, and ``Business_Interests_set``,
        ``Business_Objectives_set`` and ``Constraints_set`` columns holding sets.
    categorical_cols : sequence of str, optional
        Base names of the encoded categorical columns to compare. When omitted,
        the notebook-level ``cat_cols`` list is used.

    Returns
    -------
    pandas.DataFrame
        Features in the same row order as ``pairs``, with a fresh RangeIndex.

    Raises
    ------
    ValueError
        If ``profiles`` has duplicate index labels (a lookup would then return
        more rows than there are pairs and misalign the features).
    KeyError
        If an ID in ``pairs`` is missing from ``profiles`` (raised by ``.loc``).
    """
    if categorical_cols is None:
        # Backward-compatible default: fall back to the notebook-level list.
        categorical_cols = cat_cols

    if not profiles.index.is_unique:
        raise ValueError("profiles index must be unique to align features with pairs")

    src_data = profiles.loc[pairs['src_user_id'].values]
    dst_data = profiles.loc[pairs['dst_user_id'].values]

    # Columns are collected in a dict (insertion-ordered) and framed once at the end.
    columns = {}

    # Basic numerical & categorical matches
    columns['age_diff'] = np.abs(src_data['Age'].values - dst_data['Age'].values)
    for col in categorical_cols:
        columns[f'same_{col}'] = (
            src_data[f'{col}_enc'].values == dst_data[f'{col}_enc'].values
        ).astype(int)

    # Set-based interactions on the multi-valued columns
    list_cols = ['Business_Interests', 'Business_Objectives', 'Constraints']
    for col in list_cols:
        s_sets = src_data[col + '_set'].values
        d_sets = dst_data[col + '_set'].values

        inter_size = np.array([len(s & d) for s, d in zip(s_sets, d_sets)], dtype=int)
        len_sum = np.array([len(s) + len(d) for s, d in zip(s_sets, d_sets)], dtype=int)
        # |s ∪ d| = |s| + |d| - |s ∩ d|, so the union never has to be materialised.
        union_size = len_sum - inter_size

        columns[f'{col}_inter'] = inter_size
        # Two empty sets give union 0; clamp to 1 so their Jaccard is 0 rather than NaN.
        columns[f'{col}_jaccard'] = inter_size / np.maximum(union_size, 1)
        columns[f'{col}_len_sum'] = len_sum

    # Cross-column logic: overlap of the source's objectives with the destination's interests
    s_obj = src_data['Business_Objectives_set'].values
    d_int = dst_data['Business_Interests_set'].values
    columns['obj_int_match'] = [len(s & d) for s, d in zip(s_obj, d_int)]

    # Overlap of the source's objectives with the destination's constraints
    d_con = dst_data['Constraints_set'].values
    columns['obj_con_match'] = [len(s & d) for s, d in zip(s_obj, d_con)]

    return pd.DataFrame(columns)

# 4. Train
print("Engineering features for training...")
y_train = target_df['compatibility_score'].values
X_train = get_advanced_features(target_df, profile_map)

print("Training high-precision model...")
# Histogram-based gradient boosting; hyper-parameters gathered in one place
hgb_params = dict(
    max_iter=1000,
    learning_rate=0.05,
    max_depth=8,
    l2_regularization=0.1,
    random_state=42,
)
model = HistGradientBoostingRegressor(**hgb_params)
model.fit(X_train, y_train)

# 5. Generate Test Submission (160,000 Pairs)
print("Generating 160,000 test pairs...")
test_ids = test_df['Profile_ID'].unique()
# Cartesian product of the test IDs with themselves: every ordered (src, dst) pair
pair_index = pd.MultiIndex.from_product(
    [test_ids, test_ids],
    names=['src_user_id', 'dst_user_id'],
)
id_grid = pair_index.to_frame(index=False)

print("Engineering features for test set...")
X_test = get_advanced_features(id_grid, profile_map)

print("Predicting scores...")
preds = model.predict(X_test)

# 6. Final Formatting
# IDs are rendered as "<src>_<dst>" and predictions are clamped to the [0, 1] interval
src_label = id_grid['src_user_id'].astype(int).astype(str)
dst_label = id_grid['dst_user_id'].astype(int).astype(str)
submission = pd.DataFrame({
    'ID': src_label + "_" + dst_label,
    'compatibility_score': np.clip(preds, 0, 1),
})

submission.to_csv('submission.csv', index=False)
print("Updated version complete. Total rows:", len(submission))

Engineering features for training...
Training high-precision model...
Generating 160,000 test pairs...
Engineering features for test set...
Predicting scores...
Updated version complete. Total rows: 160000
