In [2]:
# ================
# 1. Libraries
# ================
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [3]:
# ================
# 2. Load Dataset
# ================
train_df = pd.read_csv('/kaggle/input/features-ai-vs-human/train_features_updated_SNR.csv')
test_df = pd.read_csv('/kaggle/input/features-ai-vs-human/test_features_updated_SNR.csv')

In [25]:
# ================
# 3. Feature Engineering
# ================
important_features = [
    'SNR', 'LaplacianVar', 'EdgeDensity',
    'SatVariance', 'HueVariance', 'HighFreqEnergy'
]

for df in [train_df, test_df]:
    df['SatEdgeRatio'] = df['SatVariance'] / (df['EdgeDensity'] + 1e-5)
    df['log_SNR'] = np.log(df['SNR'] + 1e-5)

feature_cols = important_features + ['SatEdgeRatio', 'log_SNR']

# Select features - keep as pandas DataFrame
X = train_df[feature_cols]
y = train_df['label'].astype(int)
X_test = test_df[feature_cols]

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [26]:
# ================
# 4. Scaling
# ================
X = X.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Fill NaNs with column means
X = X.fillna(X.mean())
X_test = X_test.fillna(X.mean()) 

scaler = RobustScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [27]:
# ================
# 5. Train-Validation Split
# ================
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [29]:
# ================
# 6. Logistic Regression
# ================
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train)
val_preds_logreg = logreg_model.predict_proba(X_val)[:, 1]

In [30]:
# ================
# 7. XGBoost
# ================
xgb_model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='auc'
)

xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=50)
val_preds_xgb = xgb_model.predict_proba(X_val)[:, 1]


[0]	validation_0-auc:0.72114




[50]	validation_0-auc:0.73424
[100]	validation_0-auc:0.73746
[150]	validation_0-auc:0.74057
[200]	validation_0-auc:0.74344
[250]	validation_0-auc:0.74619
[300]	validation_0-auc:0.74897
[350]	validation_0-auc:0.75087
[400]	validation_0-auc:0.75281
[450]	validation_0-auc:0.75444
[500]	validation_0-auc:0.75556
[550]	validation_0-auc:0.75653
[600]	validation_0-auc:0.75730
[650]	validation_0-auc:0.75798
[700]	validation_0-auc:0.75850
[750]	validation_0-auc:0.75896
[800]	validation_0-auc:0.75935
[850]	validation_0-auc:0.75974
[900]	validation_0-auc:0.76016
[950]	validation_0-auc:0.76041
[999]	validation_0-auc:0.76062


In [31]:
# ================
# 8. Neural Network
# ================
nn_model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train, y_train, epochs=30, batch_size=64, validation_data=(X_val, y_val), verbose=2)
val_preds_nn = nn_model.predict(X_val).flatten()

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


1000/1000 - 4s - 4ms/step - accuracy: 0.6717 - loss: 0.6511 - val_accuracy: 0.6782 - val_loss: 0.6090
Epoch 2/30
1000/1000 - 2s - 2ms/step - accuracy: 0.6803 - loss: 0.6204 - val_accuracy: 0.6837 - val_loss: 0.6008
Epoch 3/30
1000/1000 - 2s - 2ms/step - accuracy: 0.6813 - loss: 0.6107 - val_accuracy: 0.6855 - val_loss: 0.6181
Epoch 4/30
1000/1000 - 2s - 2ms/step - accuracy: 0.6839 - loss: 0.6141 - val_accuracy: 0.6886 - val_loss: 0.5935
Epoch 5/30
1000/1000 - 2s - 2ms/step - accuracy: 0.6856 - loss: 0.6244 - val_accuracy: 0.6906 - val_loss: 0.5988
Epoch 6/30
1000/1000 - 2s - 2ms/step - accuracy: 0.6886 - loss: 0.6029 - val_accuracy: 0.6891 - val_loss: 0.6333
Epoch 7/30
1000/1000 - 2s - 2ms/step - accuracy: 0.6876 - loss: 0.6310 - val_accuracy: 0.6919 - val_loss: 0.6639
Epoch 8/30
1000/1000 - 2s - 2ms/step - accuracy: 0.6884 - loss: 0.6020 - val_accuracy: 0.6942 - val_loss: 0.5876
Epoch 9/30
1000/1000 - 2s - 2ms/step - accuracy: 0.6877 - loss: 0.6094 - val_accuracy: 0.6944 - val_loss: 0

In [32]:
# ================
# 9. Optimize Blend Weights
# ================
blend_weights = np.linspace(0, 1, 101)
best_f1 = 0
best_w1, best_w2, best_w3 = 0, 0, 0

for w1 in blend_weights:
    for w2 in blend_weights:
        if w1 + w2 > 1:
            continue
        w3 = 1 - w1 - w2
        blended_preds = (w1 * val_preds_logreg) + (w2 * val_preds_xgb) + (w3 * val_preds_nn)
        score = f1_score(y_val, (blended_preds > 0.5).astype(int))
        if score > best_f1:
            best_f1 = score
            best_w1, best_w2, best_w3 = w1, w2, w3

print(f"✅ Best Blend Weights: Logistic {best_w1:.2f}, XGB {best_w2:.2f}, NN {best_w3:.2f}")

✅ Best Blend Weights: Logistic 0.09, XGB 0.07, NN 0.84


In [33]:
# ================
# 10. Optimize Final Threshold
# ================
final_blended_val_preds = (best_w1 * val_preds_logreg) + (best_w2 * val_preds_xgb) + (best_w3 * val_preds_nn)

best_thresh = 0.5
best_thresh_score = 0

for thresh in np.linspace(0.3, 0.7, 101):
    preds_binary = (final_blended_val_preds > thresh).astype(int)
    score = f1_score(y_val, preds_binary)
    if score > best_thresh_score:
        best_thresh_score = score
        best_thresh = thresh

print(f"✅ Best Final Threshold: {best_thresh:.4f} with F1: {best_thresh_score:.5f}")

✅ Best Final Threshold: 0.3600 with F1: 0.72322


In [34]:
# ================
# 11. Predict on Test Set
# ================
test_preds_logreg = logreg_model.predict_proba(X_test)[:, 1]
test_preds_xgb = xgb_model.predict_proba(X_test)[:, 1]
test_preds_nn = nn_model.predict(X_test).flatten()

blended_test_preds = (best_w1 * test_preds_logreg) + (best_w2 * test_preds_xgb) + (best_w3 * test_preds_nn)

final_test_labels = (blended_test_preds > best_thresh).astype(int)


[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [36]:
# ================
# 12. Create Final Submission
# ================
submission = pd.DataFrame({
    'id': test_df['id'],
    'label': final_test_labels
})

submission['id'] = submission['id'].apply(lambda x: f'test_data_v2/{x}')
submission.to_csv('/kaggle/working/final_submission_XG_NN_LG.csv', index=False)

print("🎯 Final Submission File Created Successfully!")

🎯 Final Submission File Created Successfully!
