In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, in os.walk order,
# and print one full path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/winter-2025-machine-learning-competition-1-p/sample_submission.csv
/kaggle/input/winter-2025-machine-learning-competition-1-p/train.csv
/kaggle/input/winter-2025-machine-learning-competition-1-p/test.csv


In [2]:
# Labelled competition data; read with pandas' default integer index.
train = pd.read_csv(
    '/kaggle/input/winter-2025-machine-learning-competition-1-p/train.csv'
)

# Unlabelled competition data; the first CSV column becomes the index
# (it is written out later as the submission `id`).
test = pd.read_csv(
    '/kaggle/input/winter-2025-machine-learning-competition-1-p/test.csv',
    index_col=[0],
)

In [3]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split

import matplotlib.pyplot as plt

In [4]:
# Columns excluded from modelling, applied identically to train and test.
DROPPED_FEATURES = ['X3', 'X1']

# Work on a copy so the raw `train` frame stays intact for re-runs.
train_df = train.copy()
# pop() removes the target column from train_df and returns it.
y = train_df.pop("Label")
X = train_df.drop(columns=DROPPED_FEATURES)

# `test` is rebound to its own reduced version, so on a second execution of this
# cell the columns are already gone. errors="ignore" keeps the cell idempotent
# instead of raising KeyError on re-run.
test = test.drop(columns=DROPPED_FEATURES, errors="ignore")

In [5]:
# Hold out 20% of the labelled rows as a validation set; the fixed seed keeps
# the partition identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
### Random Forest

# Base model 1: a depth-limited random forest. random_state is pinned so the
# fitted forest (and everything stacked on top of it) is reproducible.
rf = RandomForestClassifier(
    n_estimators=240,
    max_depth=6,
    min_samples_split=4,
    min_samples_leaf=3,
    max_features="log2",
    n_jobs=-1,  # use all available cores
    random_state=0,
)
rf.fit(X_train, y_train)

In [7]:
# Random-forest probabilities (column 1 of predict_proba) on both splits;
# these arrays are reused later as stacking inputs.
train_probs1 = rf.predict_proba(X_train)[:, 1]
val_probs1 = rf.predict_proba(X_val)[:, 1]

# Displayed as (validation, training); max_fpr=.01 restricts the ROC AUC to the
# false-positive-rate range [0, 0.01].
(
    roc_auc_score(y_val, val_probs1, max_fpr=.01),
    roc_auc_score(y_train, train_probs1, max_fpr=.01),
)

(0.9360362850126562, 0.9367507668754085)

In [8]:
### XGBoost

# Base model 2: a small gradient-boosted ensemble (20 trees).
XGBC_model = XGBClassifier(
    enable_categorical=True,
    n_estimators=20,
    learning_rate=0.11,
    gamma=0.1,
)
XGBC_model.fit(X_train, y_train)

In [9]:
# XGBoost probabilities (column 1 of predict_proba) on both splits;
# these arrays are reused later as stacking inputs.
train_probs2 = XGBC_model.predict_proba(X_train)[:, 1]
val_probs2 = XGBC_model.predict_proba(X_val)[:, 1]

# Displayed as (validation, training); max_fpr=.01 restricts the ROC AUC to the
# false-positive-rate range [0, 0.01].
(
    roc_auc_score(y_val, val_probs2, max_fpr=.01),
    roc_auc_score(y_train, train_probs2, max_fpr=.01),
)

(0.9409462783003517, 0.9412219903589916)

In [10]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both splits so the validation data never influences it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [11]:
# Base model 3: logistic regression fitted on the standardized features.
# Anything passed to `log` afterwards must go through `scaler` first.
log = LogisticRegression(
    C=1,
    solver='saga',
    max_iter=3000,
    random_state=0,
)
log.fit(X_train_scaled, y_train)



In [12]:
# Logistic-regression probabilities (column 1 of predict_proba) on the scaled
# splits; these arrays are reused later as stacking inputs.
train_probs3 = log.predict_proba(X_train_scaled)[:, 1]
val_probs3 = log.predict_proba(X_val_scaled)[:, 1]

# Displayed as (validation, training); max_fpr=.01 restricts the ROC AUC to the
# false-positive-rate range [0, 0.01].
(
    roc_auc_score(y_val, val_probs3, max_fpr=.01),
    roc_auc_score(y_train, train_probs3, max_fpr=.01),
)

(0.9160063881486518, 0.8989094628780376)

In [13]:
# Base-model probabilities (column 1 of predict_proba) on the competition test set.
# The two tree models were fitted on the raw features, so they take `test` as is.
test_probs1 = rf.predict_proba(test)[:, 1]
test_probs2 = XGBC_model.predict_proba(test)[:, 1]

# `log` was fitted on StandardScaler output (X_train_scaled). The test features
# must go through the same fitted scaler; feeding raw features would put them on
# a different scale from what the coefficients were learned on and corrupt the
# probabilities passed to the stacked model.
test_scaled = scaler.transform(test)
test_probs3 = log.predict_proba(test_scaled)[:, 1]



In [14]:
# Meta-features: one column per base model, in the order
# (random forest, XGBoost, logistic regression).
# Mind the names: `training_data_stack` holds predictions on X_train, while
# `train_stack` holds predictions on X_val (the rows the meta-learner is fitted on).
training_data_stack = np.column_stack(
    [train_probs1, train_probs2, train_probs3]
)
train_stack = np.column_stack(
    [val_probs1, val_probs2, val_probs3]
)
test_stack = np.column_stack(
    [test_probs1, test_probs2, test_probs3]
)

In [15]:
# Meta-learner: logistic regression over the three base-model probabilities,
# fitted on the validation-split predictions (train_stack) and their labels.
final_model = LogisticRegression(
    solver='lbfgs',
    max_iter=5000,
    random_state=0,
).fit(train_stack, y_val)

# Stacked probabilities (column 1 of predict_proba) for each data set:
#   training_predictions      -> validation-split rows (the rows the meta-learner saw)
#   training_data_predictions -> training-split rows
#   test_probs                -> competition test rows (used for the submission)
training_predictions = final_model.predict_proba(train_stack)[:, 1]
training_data_predictions = final_model.predict_proba(training_data_stack)[:, 1]
test_probs = final_model.predict_proba(test_stack)[:, 1]

In [16]:
# Stacked-model partial AUC (FPR range [0, 0.01]), displayed as
# (validation split, training split).
# NOTE(review): final_model was fitted on the validation-split stack, so the
# first number is an in-sample score for the meta-learner, not a held-out one.
(
    roc_auc_score(y_val, training_predictions, max_fpr=.01),
    roc_auc_score(y_train, training_data_predictions, max_fpr=.01),
)

(0.9434301073314448, 0.9452016909493719)

In [17]:
# Sanity check: the number of stacked probabilities should equal the number of test rows.
(test_probs.shape, test.shape)

((113918,), (113918, 28))

In [18]:
# Random-forest impurity-based (Gini) importances, paired with the training
# column names and listed from most to least important.
importances = rf.feature_importances_
feature_imp_df = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Gini Importance': importances,
    }
)
feature_imp_df = feature_imp_df.sort_values(by='Gini Importance', ascending=False)
print(feature_imp_df)

   Feature  Gini Importance
14     X17         0.272027
11     X14         0.205812
8      X11         0.141282
13     X16         0.081013
1       X4         0.038553
7      X10         0.031874
18     X21         0.027385
0       X2         0.019367
2       X5         0.019321
23     X26         0.016983
17     X20         0.016316
9      X12         0.012595
5       X8         0.011750
4       X7         0.011490
25     X28         0.010931
24     X27         0.009616
26     X29         0.009263
19     X22         0.007853
12     X15         0.007697
3       X6         0.007529
21     X24         0.006020
15     X18         0.006017
16     X19         0.005899
27    Time         0.005875
6       X9         0.005509
22     X25         0.004655
20     X23         0.003863
10     X13         0.003503


In [19]:
# Submission file: one row per test id with its stacked probability.
# Writing the two columns without the positional index yields the same
# `id,Label` CSV layout as indexing by `id` first.
submission = pd.DataFrame(
    {
        'id': test.index,
        'Label': test_probs,
    }
)
submission.to_csv("submission.csv", index=False)