In [1]:
import os

# Sanity check: confirm the training CSV is present at the expected location
# before attempting to read it.
training_csv_path = r"C:\Users\Anannya\demo-project\data\training_data.csv"
os.path.exists(training_csv_path)

True

The `True` output confirms that the dataset file exists at the expected path, so it can be loaded.

In [1]:
import pandas as pd

# Read the whole training table into a DataFrame. The raw-string prefix keeps
# the Windows backslashes in the path literal.
training_csv_path = r"C:\Users\Anannya\demo-project\data\training_data.csv"
train = pd.read_csv(training_csv_path)

In [3]:
# Dataset size, followed by the absolute and then the relative frequency of
# each target class.
target_column = train['target']
print(train.shape)
for as_fraction in (False, True):
    print(target_column.value_counts(normalize=as_fraction))

(476169, 52)
target
0    458814
1     17355
Name: count, dtype: int64
target
0    0.963553
1    0.036447
Name: proportion, dtype: float64


These values show that class 0 accounts for about 96.4% of the rows and class 1 for only about 3.6%, so the data is highly imbalanced.

In [10]:
# Missing values: count NaNs per column and show the ten most affected
# features, largest first.
# (The dtype tally that used to open this cell was a bare expression in the
# middle of the cell, so it was never displayed; the dtype summary is shown by
# its own `train.dtypes.value_counts()` cell instead.)
missing = train.isna().sum()
missing[missing > 0].sort_values(ascending=False).head(10)


feature_39    329010
feature_8     213303
feature_45     86301
feature_38     34128
feature_28      9242
feature_12      4638
feature_31       464
feature_34       173
feature_35        83
feature_15        67
dtype: int64

This output lists the ten features with the most missing values, sorted in descending order. feature_39 is the worst: 329,010 of its 476,169 values (about 69%) are missing.

In [12]:
# Tally how many columns there are of each dtype.
dtype_per_column = train.dtypes
dtype_per_column.value_counts()

int64      34
float64    17
object      1
Name: count, dtype: int64

In [13]:
# List the names of the object-dtype columns.
object_columns = train.select_dtypes(include='object').columns
object_columns.tolist()


['id']

All the columns are numeric (int or float); the only string (object) column is `id`, which we ignore.

In [2]:
from sklearn.model_selection import train_test_split

# Separate the label from the predictors. 'id' is the only object-dtype
# column and is not used as a feature.
y = train['target']
X = train.drop(columns=['target', 'id'])

# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Print the class proportions of each partition to confirm they match.
for split_target in (y_train, y_val):
    print(split_target.value_counts(normalize=True))


target
0    0.963553
1    0.036447
Name: proportion, dtype: float64
target
0    0.963553
1    0.036447
Name: proportion, dtype: float64


We split the data into an 80% training set and a 20% validation set, stratifying on the target so that the class proportions are identical in both sets, which matters for such highly imbalanced data.

The next step is to train a basic logistic regression baseline.
We are not trying to win yet.
We are checking “Is there signal in the data?”

Goal of this step
Train a simple, interpretable model
Handle imbalance using class weights
Evaluate using ROC-AUC

If this baseline is bad, complex models won’t magically fix it.

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


In [4]:
# Baseline model: fill NaNs with each column's median, then fit a logistic
# regression whose class weights compensate for the rare positive class.
median_imputer = SimpleImputer(strategy='median')
weighted_logreg = LogisticRegression(
    class_weight='balanced', max_iter=1000, n_jobs=-1
)

pipeline = Pipeline(steps=[
    ('imputer', median_imputer),
    ('model', weighted_logreg),
])


In [5]:
# Fit the baseline on the training split only.
pipeline.fit(X_train, y_train)

# Score the held-out split: column 1 of predict_proba is taken as the
# positive-class probability and fed to ROC-AUC.
val_proba = pipeline.predict_proba(X_val)
val_probs = val_proba[:, 1]
roc = roc_auc_score(y_true=y_val, y_score=val_probs)
print("Validation ROC-AUC:", roc)


Validation ROC-AUC: 0.6187174906152606


In [6]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score


In [9]:
import sys
!{sys.executable} -m pip install xgboost


Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/72.0 MB 11.2 MB/s eta 0:00:07
   --- ------------------------------------ 5.8/72.0 MB 20.7 MB/s eta 0:00:04
   ------ --------------------------------- 11.3/72.0 MB 23.5 MB/s eta 0:00:03
   ------- -------------------------------- 13.1/72.0 MB 19.6 MB/s eta 0:00:04
   ------- -------------------------------- 14.2/72.0 MB 15.9 MB/s eta 0:00:04
   -------- ------------------------------- 16.0/72.0 MB 14.4 MB/s eta 0:00:04
   ---------- ----------------------------- 18.6/72.0 MB 14.0 MB/s eta 0:00:04
   ----------- ---------------------------- 21.5/72.0 MB 13.9 MB/s eta 0:00:04
   ------------- -------------------------- 24.9/72.0 MB 14.1 MB/s eta 0:00:04
   --------------- ------------------------ 27.8/72.0 MB 14.3 MB/s eta 0:0

In [7]:
# Ratio of negatives (label 0) to positives (label 1) in the training split;
# passed to XGBoost below as scale_pos_weight.
train_class_counts = y_train.value_counts()
pos_weight = train_class_counts[0] / train_class_counts[1]
print("scale_pos_weight:", pos_weight)


scale_pos_weight: 26.4369778161913


In [8]:
# First gradient-boosted model. Hyper-parameters are collected in one dict so
# the configuration can be read at a glance.
xgb_baseline_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=pos_weight,  # negative/positive ratio of the training split
    random_state=42,
    n_jobs=-1,
)
xgb_model = xgb.XGBClassifier(**xgb_baseline_params)


In [10]:
# Train the first XGBoost model on the training split; the fitted estimator
# is the cell's displayed value.
xgb_model.fit(X=X_train, y=y_train)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [11]:
# Score the held-out split with the first XGBoost model: keep column 1 of
# predict_proba (positive class) and compute ROC-AUC on it.
val_proba = xgb_model.predict_proba(X_val)
val_probs = val_proba[:, 1]
roc = roc_auc_score(y_true=y_val, y_score=val_probs)
print("Validation ROC-AUC:", roc)

Validation ROC-AUC: 0.6306842593294735


The ROC-AUC did increase, but only slightly (0.619 → 0.631). So the non-linear model helps a little, but the current setup appears over-optimized (possibly overfitting), and heavy missingness may be limiting performance.

Hence the next step is to change the data representation: we will drop the columns with high missingness and retrain the same model to check the ROC-AUC.

In [12]:
# Fraction of NaNs in each training column; columns above 60% are flagged
# for removal. Computed on the training split only.
missing_ratio = X_train.isna().mean()
high_missing_cols = [
    column for column, ratio in missing_ratio.items() if ratio > 0.6
]

print("Columns to drop:", high_missing_cols)
print("Number of columns dropped:", len(high_missing_cols))


Columns to drop: ['feature_39']
Number of columns dropped: 1


In [13]:
# Remove the flagged high-missingness columns from both splits, leaving the
# original X_train / X_val untouched.
X_train_pruned, X_val_pruned = (
    frame.drop(columns=high_missing_cols) for frame in (X_train, X_val)
)


In [14]:
# Same hyper-parameters as the first XGBoost model, so any change in score
# comes from the pruned feature set rather than from the configuration.
xgb_pruned_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=pos_weight,
    random_state=42,
    n_jobs=-1,
)
xgb_model_pruned = xgb.XGBClassifier(**xgb_pruned_params)

# Train on the pruned training features; the fitted estimator is displayed.
xgb_model_pruned.fit(X=X_train_pruned, y=y_train)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [15]:
# Score the pruned model on the equally pruned validation features.
val_proba = xgb_model_pruned.predict_proba(X_val_pruned)
val_probs = val_proba[:, 1]
roc = roc_auc_score(y_true=y_val, y_score=val_probs)
print("Validation ROC-AUC (pruned):", roc)


Validation ROC-AUC (pruned): 0.6301662384673371


The ROC-AUC decreased slightly (0.6307 → 0.6302) after removing the feature with a high share of missing values. The difference is small, but it suggests that feature_39 is not junk: even with ~69% missing values it still carries some signal, and the tree model was using it effectively.

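So now we will:
Increase depth slightly (6 → 8)
Increase number of trees (300 → 500)
Lower the learning rate (0.05 → 0.03) and raise subsample / colsample_bytree (0.8 → 0.85); keep everything else stable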

In [16]:
# Larger-capacity variant: more and deeper trees, a smaller learning rate and
# slightly higher row/column sampling than the first XGBoost model.
xgb_tuned_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=500,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.85,
    colsample_bytree=0.85,
    scale_pos_weight=pos_weight,
    random_state=42,
    n_jobs=-1,
)
xgb_model_tuned = xgb.XGBClassifier(**xgb_tuned_params)
xgb_model_tuned.fit(X_train, y_train)

# Score the tuned model on the full (unpruned) validation features.
val_proba = xgb_model_tuned.predict_proba(X_val)
val_probs = val_proba[:, 1]
roc = roc_auc_score(y_true=y_val, y_score=val_probs)
print("Validation ROC-AUC (tuned):", roc)


Validation ROC-AUC (tuned): 0.6062271878573571


In [17]:
# The tuned variant scored lower on validation, so the configuration of the
# first XGBoost model (300 trees, depth 6, learning rate 0.05) is used again here.
best_xgb_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=pos_weight,
    random_state=42,
    n_jobs=-1,
)
best_xgb = xgb.XGBClassifier(**best_xgb_params)

# Train on the full training split; the fitted estimator is displayed.
best_xgb.fit(X=X_train, y=y_train)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [18]:
from sklearn.calibration import CalibratedClassifierCV

# Wrap best_xgb in probability calibration: sigmoid (Platt scaling) with
# 3-fold cross-validation on the training split.
platt_scaling_options = {'method': 'sigmoid', 'cv': 3}
calibrated_model = CalibratedClassifierCV(best_xgb, **platt_scaling_options)

# Fit the calibrated wrapper; the fitted estimator is displayed.
calibrated_model.fit(X=X_train, y=y_train)


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,method,'sigmoid'
,cv,3
,n_jobs,
,ensemble,'auto'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [19]:
# Score the calibrated model on the validation split.
val_proba = calibrated_model.predict_proba(X_val)
val_probs = val_proba[:, 1]
roc = roc_auc_score(y_true=y_val, y_score=val_probs)
print("Validation ROC-AUC (calibrated):", roc)


Validation ROC-AUC (calibrated): 0.6297333485379095


In [21]:
# Re-score the validation split with the first XGBoost model (xgb_model) so
# that val_probs no longer holds the calibrated model's probabilities.
val_proba = xgb_model.predict_proba(X_val)
val_probs = val_proba[:, 1]


The validation set is scored again with `xgb_model`, the first XGBoost model defined above, which had a ROC-AUC of 0.63.

In [22]:
# Turn the positive-class probabilities into hard labels with a 0.5 cut-off.
val_preds = (val_probs >= 0.5).astype(int)

from sklearn.metrics import accuracy_score
acc = accuracy_score(y_val, val_preds)
print("Validation Accuracy:", acc * 100, "%")

# With roughly 96% of rows in class 0, raw accuracy only means something next
# to the trivial "always predict the majority class" score, so report that
# reference point alongside it.
majority_baseline_acc = y_val.value_counts(normalize=True).max()
print("Majority-class baseline accuracy:", majority_baseline_acc * 100, "%")


Validation Accuracy: 70.05901253753912 %
