## Import the libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import xgboost as xgb
import lightgbm as lgb

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by the
# modelling libraries — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

## Load the dataframes

In [2]:
# Read the training set, the test set and the sample submission file
# (all three CSVs are expected in the current working directory).
train, test, sample_sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'testSubmissionFile.csv')
)

# Preview the first rows of the training data
train.head()

Unnamed: 0,auctionId,timeStamp,placementId,websiteId,hashedRefererDeepThree,country,opeartingSystem,browser,browserVersion,device,environmentType,integrationType,articleSafenessCategorization,isSold
0,001ed16b-dd08-4599-b8ef-4f56a373c454_6e5f1087-...,1603815466,120706,68203,1ae7c2d3c28b711c072d8e2eb3869fa59090669bdc153e...,US,Windows,Chrome,86_0,PC,js-web,2,safe,False
1,0024b36a-4fb5-4070-88fb-fc0bfb1909ed,1603974586,69454,42543,df1108bf6ae49dbccf5eab60ff9d04a6a09dda60ec7290...,RO,Android,Facebook App,293_0,Phone,js-fbwv,1,unsafe,False
2,003630fa-ad63-4283-be1b-141670132d70_f37c2b23-...,1604229969,100170,57703,cc6957e8aec85a4d920991c53874c5d0780bbfbd469802...,UK,Android,Facebook App,294_0,Phone,js-web,2,safe,True
3,0048c65a-ce76-43ba-98d2-8e87607468f8,1604156610,100446,57797,7fc0bb7a65d074e003cce786cda2b070f80dd47179c4b9...,ES,Android,Chrome Mobile,86_0,Phone,js-ampsf,1,safe,True
4,0056b8a7-54f9-4ac8-8d50-f725bf377872,1604004493,119517,67613,3a6552ccbf66ad166aa9005c3e08f70716abd676cfd87b...,FR,Android,Facebook App,293_0,Phone,js-fbwv,1,unsafe,False


## Preprocessing

In [3]:
# Preprocessing: build the model-ready feature matrices from the raw frames.

# --- Test side: work on a copy, keep the auction ids for the submission file,
# then remove the identifier-like columns.
test_df = test.copy()
test_ids = test_df['auctionId']
test_df = test_df.drop(columns=['auctionId', 'hashedRefererDeepThree'])

# --- Train side: same identifier columns removed from a copy of the raw frame.
train_df = train.copy()
train_df = train_df.drop(columns=['auctionId', 'hashedRefererDeepThree'])

# Separate the features from the 'isSold' target.
X = train_df.drop(columns='isSold')
y = train_df['isSold']

# Stack train features on top of the test features so that one-hot encoding
# yields the same set of dummy columns for both.
combined = pd.concat((X, test_df), sort=False)

# Categorical columns to expand into indicator columns
# (names are spelled exactly as they appear in the data).
cols_to_encode = [
    'country', 'opeartingSystem',
    'browser', 'browserVersion',
    'device', 'environmentType',
    'articleSafenessCategorization',
]
combined_enc = pd.get_dummies(combined, columns=cols_to_encode, dummy_na=False)

# Undo the stacking: the first len(X) rows are train, the remainder is test.
X_enc = combined_enc.iloc[:len(X)].reset_index(drop=True)
test_enc = combined_enc.iloc[len(X):].reset_index(drop=True)

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X_enc,
    y,
    test_size=0.2,
    random_state=42,
)


## XGBoost

In [4]:
# NOTE: every line in this cell is commented out, so running it does nothing.
# It holds an XGBoost baseline without tuned hyperparameters: fit on (X_train, y_train),
# report ROC AUC on (X_val, y_val), then refit on (X_enc, y) and write
# 'submission_xgb.csv' — the same file name the tuned-model cell writes.
# NOTE(review): `verbose=0` in the first constructor is probably meant to be
# `verbosity=0` (the spelling used by the tuned-model cell) — confirm before re-enabling.
# # XGBoost Model
# xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='auc', random_state=42, verbose=0)
# xgb_model.fit(X_train, y_train)

# # Validation predictions
# xgb_val_pred = xgb_model.predict_proba(X_val)[:, 1]
# xgb_auc = roc_auc_score(y_val, xgb_val_pred)
# print(f'XGBoost ROC AUC: {xgb_auc:.4f}')

# # Train on full data
# xgb_model_full = xgb.XGBClassifier(use_label_encoder=False, eval_metric='auc', random_state=42)
# xgb_model_full.fit(X_enc, y)

# # Predictions for submission
# xgb_test_pred = xgb_model_full.predict(test_enc)

# # Prepare submission
# xgb_submission = pd.DataFrame({'auctionId': test_ids, 'isSold': xgb_test_pred})
# xgb_submission.to_csv('submission_xgb.csv', index=False)


In [None]:
# Fit the tuned XGBoost classifier on the full encoded training set
# (this cell does not use the train/validation split).
xgb_model_tuned = xgb.XGBClassifier(
    # Boosting schedule
    n_estimators=5000,          # many trees
    learning_rate=0.01,         # very small step size for fine-grained fitting
    # Tree construction
    tree_method='hist',         # or 'exact' for a smaller dataset
    max_bin=256,                # fine-grained feature binning
    max_depth=10,               # deeper trees
    min_child_weight=5,         # a leaf needs a hessian sum of at least 5
    gamma=0.2,                  # minimum gain required to split
    # Row / column sampling
    subsample=0.8,              # 80% of the rows per tree
    colsample_bytree=0.8,       # 80% of the columns per tree
    # Regularisation and class balance
    reg_alpha=5,                # fairly strong L1 regularisation
    reg_lambda=100,             # very strong L2 regularisation
    scale_pos_weight=1,         # or the negatives/positives ratio if imbalanced
    # Reproducibility and logging
    random_state=42,
    verbosity=0,
).fit(X_enc, y)

# Class-label predictions for the encoded test rows
xgb_test_pred = xgb_model_tuned.predict(test_enc)

# Assemble the submission (auction id + predicted label) and write it to disk
xgb_submission = pd.DataFrame({'auctionId': test_ids}).assign(isSold=xgb_test_pred)
xgb_submission.to_csv('submission_xgb.csv', index=False)

## LightGBM

In [None]:
# NOTE: every line in this cell is commented out, so running it does nothing.
# It holds a LightGBM baseline: fit on (X_train, y_train), report ROC AUC on
# (X_val, y_val), then refit on (X_enc, y) and write 'submission_lgb.csv'.
# NOTE(review): the ROC AUC line saved under this cell is presumably from an earlier
# run when the code was active; it will not be reproduced by the cell as it stands.
# # LightGBM Model
# lgb_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
# lgb_model.fit(X_train, y_train)

# # Validation predictions
# lgb_val_pred = lgb_model.predict_proba(X_val)[:, 1]
# lgb_auc = roc_auc_score(y_val, lgb_val_pred)
# print(f'LightGBM ROC AUC: {lgb_auc:.4f}')

# # Train on full data
# lgb_model_full = lgb.LGBMClassifier(random_state=42, verbose=-1)
# lgb_model_full.fit(X_enc, y)

# # Predictions for submission
# lgb_test_pred = lgb_model_full.predict(test_enc)

# # Prepare submission
# lgb_submission = pd.DataFrame({'auctionId': test_ids, 'isSold': lgb_test_pred})
# lgb_submission.to_csv('submission_lgb.csv', index=False)

LightGBM ROC AUC: 0.8245


## Random Forest

In [None]:
# NOTE: every line in this cell is commented out, so running it does nothing.
# It holds a 10-tree Random Forest baseline: fit on (X_train, y_train), report ROC AUC
# on (X_val, y_val), then refit on (X_enc, y) and write 'submission_rf.csv'.
# NOTE(review): the ROC AUC line saved under this cell is presumably from an earlier
# run when the code was active; it will not be reproduced by the cell as it stands.
# # Random Forest Model
# rf_model = RandomForestClassifier(n_estimators=10, random_state=42)
# rf_model.fit(X_train, y_train)

# # Validation predictions
# rf_val_pred = rf_model.predict_proba(X_val)[:, 1]
# rf_auc = roc_auc_score(y_val, rf_val_pred)
# print(f'Random Forest ROC AUC: {rf_auc:.4f}')

# # Train on full data
# rf_model_full = RandomForestClassifier(n_estimators=10, random_state=42)
# rf_model_full.fit(X_enc, y)

# # Predictions for submission
# rf_test_pred = rf_model_full.predict(test_enc)

# # Prepare submission
# rf_submission = pd.DataFrame({'auctionId': test_ids, 'isSold': rf_test_pred})
# rf_submission.to_csv('submission_rf.csv', index=False)


Random Forest ROC AUC: 0.8003
