In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA

# Load both train and test
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Drop unwanted columns
drop_cols = ['id','composition_label_0','composition_label_1','publication_timestamp',
             'lunar_phase','creator_collective','composition_label_2','track_identifier']

df_train.drop(columns=drop_cols, inplace=True)
df_test.drop(columns=drop_cols, inplace=True)

# Separate features and target in train
target = df_train['target']
df_train.drop(columns=['target'], inplace=True)

# Identify numeric and categorical columns from train (important: use train only for this)
numeric_cols = df_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = df_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Split numeric cols by missingness threshold on train data
threshold = int(0.10 * len(df_train))  # 10%
numeric_null_counts = df_train[numeric_cols].isna().sum()

low_null_numeric = numeric_null_counts[numeric_null_counts <= threshold].index.tolist()
high_null_numeric = numeric_null_counts[numeric_null_counts > threshold].index.tolist()

# ---------- Numeric imputation ----------
# Fit KNN imputer on train low-missing numeric, transform both train and test
knn_imputer = KNNImputer(n_neighbors=3)
train_low_num_imputed = pd.DataFrame(
    knn_imputer.fit_transform(df_train[low_null_numeric]),
    columns=low_null_numeric
)
test_low_num_imputed = pd.DataFrame(
    knn_imputer.transform(df_test[low_null_numeric]),
    columns=low_null_numeric
)

# Fit SimpleImputer on train high-missing numeric, transform both train and test
simple_imputer = SimpleImputer(strategy='mean')
train_high_num_imputed = pd.DataFrame(
    simple_imputer.fit_transform(df_train[high_null_numeric]),
    columns=high_null_numeric
)
test_high_num_imputed = pd.DataFrame(
    simple_imputer.transform(df_test[high_null_numeric]),
    columns=high_null_numeric
)

# Combine numeric imputations for train and test
train_num_imputed = pd.concat([train_low_num_imputed, train_high_num_imputed], axis=1)
test_num_imputed = pd.concat([test_low_num_imputed, test_high_num_imputed], axis=1)

# ---------- Categorical imputation ----------
cat_imputer = SimpleImputer(strategy='most_frequent')
train_cat_imputed = pd.DataFrame(
    cat_imputer.fit_transform(df_train[categorical_cols]),
    columns=categorical_cols
)
test_cat_imputed = pd.DataFrame(
    cat_imputer.transform(df_test[categorical_cols]),
    columns=categorical_cols
)

# OneHotEncoding
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_cat_encoded = pd.DataFrame(
    encoder.fit_transform(train_cat_imputed),
    columns=encoder.get_feature_names_out(categorical_cols)
)
test_cat_encoded = pd.DataFrame(
    encoder.transform(test_cat_imputed),
    columns=encoder.get_feature_names_out(categorical_cols)
)

# Combine numeric + categorical features for train and test
train_processed = pd.concat([train_num_imputed.reset_index(drop=True), train_cat_encoded.reset_index(drop=True)], axis=1)
test_processed = pd.concat([test_num_imputed.reset_index(drop=True), test_cat_encoded.reset_index(drop=True)], axis=1)

# ---------- Scaling ----------
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_processed)
test_scaled = scaler.transform(test_processed)



# Final DataFrames
df_train_final = pd.DataFrame(train_scaled, columns=[f"PC{i+1}" for i in range(train_scaled.shape[1])])
df_train_final['target'] = target.reset_index(drop=True)

df_test_final = pd.DataFrame(test_scaled, columns=[f"PC{i+1}" for i in range(test_scaled.shape[1])])

# Now df_train_final and df_test_final are ready for modeling!


In [2]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import pandas as pd
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

# -------------------------------
# Step 1: Split data for evaluation
# -------------------------------
X = df_train_final.drop(columns=['target'])
y = df_train_final['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# -----------Run only for hyperparameter optimization---------------
# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 500),
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
#         'subsample': trial.suggest_float('subsample', 0.6, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
#         'tree_method': 'gpu_hist'
#     }
    
#     model = XGBRegressor(**params, random_state=42)
#     rmse = -cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=3).mean()
#     return rmse

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=50)

# print("Best parameters:", study.best_params)


# -------------------------------
# Step 2: Define best hyperparameters and fit model
# -------------------------------
best_params = {
    'n_estimators': 405,
    'max_depth': 10,
    'learning_rate': 0.07748551072473618,
    'subsample': 0.8979101171513736,
    'colsample_bytree': 0.7371023660083749,
    'tree_method': 'gpu_hist',  # use 'gpu_hist' if you're using GPU
    'random_state': 42
}

xgb_model = XGBRegressor(**best_params)
xgb_model.fit(X, y)

# -------------------------------
# Step 3: Evaluate on validation set
# -------------------------------
val_preds = xgb_model.predict(X_val)
val_rmse = mean_squared_error(y_val, val_preds, squared=False)
val_mape = mean_absolute_percentage_error(y_val, val_preds)
val_r2 = r2_score(y_val, val_preds)

print("\n📊 XGBoost Validation Metrics:")
print(f" - RMSE: {val_rmse:.4f}")
print(f" - MAPE: {val_mape:.4f}")
print(f" - R²:   {val_r2:.4f}")

# -------------------------------
# Step 4: Predict on test set
# -------------------------------
test_preds = xgb_model.predict(df_test_final)

# -------------------------------
# Step 5: Save predictions
# -------------------------------
submission = pd.read_csv("sample_submission.csv")
submission['target'] = test_preds
submission.to_csv("xgb_submission.csv", index=False)


  from .autonotebook import tqdm as notebook_tqdm



📊 XGBoost Validation Metrics:
 - RMSE: 1.7990
 - MAPE: 0.0697
 - R²:   0.9931
