In [1]:
drop_cols = [
    "binome_id",
    "binome_statut",
    "binome_date_proposition",
    "binome_date_creation",
    "binome_date_update_statut",
    "binome_cancellation_reason",
    "registration_date_x",
    "registration_date_y",
    "status_flag"  # target
]

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Load dataset
df = pd.read_csv("~/Team-4/output.csv")

# Target variable
y = df["status_flag"]

# Drop irrelevant columns
X = df.drop(columns=drop_cols)

# Identify categorical vs numeric columns
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

In [3]:
# Define preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols)
    ]
)

In [4]:
# Define model
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss"
)

In [5]:
# Build pipeline
clf = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])


In [6]:

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
clf.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [8]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

Accuracy: 0.7436077057793345
ROC AUC: 0.8126869188087201
              precision    recall  f1-score   support

           0       0.84      0.79      0.82      2039
           1       0.55      0.62      0.58       816

    accuracy                           0.74      2855
   macro avg       0.69      0.71      0.70      2855
weighted avg       0.76      0.74      0.75      2855

