In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import pandas as pd

# Load training data
X_train = pd.read_csv("../data/train/X_train.csv")
y_train = pd.read_csv("../data/train/y_train.csv")

# Load test data
X_test = pd.read_csv("../data/test/X_test.csv")
y_test = pd.read_csv("../data/test/y_test.csv")

# If y_train/y_test is stored as a DataFrame, convert to Series
y_train = y_train.squeeze()  
y_test = y_test.squeeze()  



# Training 
## Logistic Regression


In [3]:
# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))


Accuracy: 0.8852
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45139
           1       0.61      0.03      0.06      5931

    accuracy                           0.89     51070
   macro avg       0.75      0.52      0.50     51070
weighted avg       0.85      0.89      0.84     51070



In [5]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to training data
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check new class distribution
print("Class distribution after SMOTE:", Counter(y_train_smote))


Class distribution after SMOTE: Counter({0: 180555, 1: 180555})


In [8]:
from imblearn.under_sampling import RandomUnderSampler

# Initialize undersampler
undersampler = RandomUnderSampler(random_state=42)

# Apply undersampling
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)

# Check new class distribution
print("Class distribution after undersampling:", Counter(y_train_under))


Class distribution after undersampling: Counter({0: 23722, 1: 23722})


In [9]:
# Train logistic regression model
model = LogisticRegression()
model.fit(X_train_under, y_train_under)

# Predictions
y_pred_under = model.predict(X_test)

# Evaluate model
accuracy_under = accuracy_score(y_test, y_pred_under)
print(f"Accuracy: {accuracy_under:.4f}")
print(classification_report(y_test, y_pred_under))

Accuracy: 0.6768
              precision    recall  f1-score   support

           0       0.94      0.67      0.79     45139
           1       0.22      0.70      0.33      5931

    accuracy                           0.68     51070
   macro avg       0.58      0.69      0.56     51070
weighted avg       0.86      0.68      0.73     51070



In [7]:
# Train logistic regression model
model = LogisticRegression()
model.fit(X_train_smote, y_train_smote)

# Predictions
y_pred_smote = model.predict(X_test)

# Evaluate model
accuracy_smote = accuracy_score(y_test, y_pred_smote)
print(f"Accuracy: {accuracy_smote:.4f}")
print(classification_report(y_test, y_pred_smote))

Accuracy: 0.6840
              precision    recall  f1-score   support

           0       0.94      0.68      0.79     45139
           1       0.22      0.69      0.34      5931

    accuracy                           0.68     51070
   macro avg       0.58      0.69      0.57     51070
weighted avg       0.86      0.68      0.74     51070



In [7]:
from sklearn.linear_model import LogisticRegression

# Train model with balanced class weights
clf = LogisticRegression(class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.94      0.67      0.79     45139
           1       0.22      0.70      0.33      5931

    accuracy                           0.68     51070
   macro avg       0.58      0.69      0.56     51070
weighted avg       0.86      0.68      0.73     51070



In [8]:
from xgboost import XGBClassifier

xgb = XGBClassifier(scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]), random_state=42)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)
print(classification_report(y_test, y_pred_xgb))


              precision    recall  f1-score   support

           0       0.94      0.73      0.82     45139
           1       0.23      0.63      0.34      5931

    accuracy                           0.72     51070
   macro avg       0.59      0.68      0.58     51070
weighted avg       0.86      0.72      0.76     51070



In [9]:
from xgboost import XGBClassifier

xgb = XGBClassifier(scale_pos_weight=len(y_train_under[y_train_under == 0]) / len(y_train_under[y_train_under == 1]), random_state=42)
xgb.fit(X_train_under, y_train_under)

y_pred_xgb = xgb.predict(X_test)
print(classification_report(y_test, y_pred_xgb))


              precision    recall  f1-score   support

           0       0.94      0.67      0.79     45139
           1       0.22      0.68      0.33      5931

    accuracy                           0.67     51070
   macro avg       0.58      0.68      0.56     51070
weighted avg       0.86      0.67      0.73     51070



In [10]:
from xgboost import XGBClassifier

xgb = XGBClassifier(scale_pos_weight=len(y_train_smote[y_train_smote == 0]) / len(y_train_smote[y_train_smote == 1]), random_state=42)
xgb.fit(X_train_smote, y_train_smote)

y_pred_xgb = xgb.predict(X_test)
print(classification_report(y_test, y_pred_xgb))


              precision    recall  f1-score   support

           0       0.89      0.99      0.94     45139
           1       0.54      0.09      0.15      5931

    accuracy                           0.89     51070
   macro avg       0.72      0.54      0.55     51070
weighted avg       0.85      0.89      0.85     51070



In [18]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Define parameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [ 0.1, 0.2],
    'n_estimators': [100, 200],
    'scale_pos_weight': [ 1,5, 10]  # Adjust for class imbalance
}

# Initialize classifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Perform grid search
grid_search = GridSearchCV(xgb, param_grid, scoring='f1', cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train_smote, y_train_smote)

# Best parameters
print("Best parameters:", grid_search.best_params_)

# Train the best model
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_xgb:.4f}")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 36 candidates, totalling 180 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'scale_pos_weight': 1}
Accuracy: 0.8849
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     45139
           1       0.53      0.09      0.15      5931

    accuracy                           0.88     51070
   macro avg       0.71      0.54      0.54     51070
weighted avg       0.85      0.88      0.85     51070



In [12]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.93      0.83      0.88     45139
           1       0.29      0.53      0.37      5931

    accuracy                           0.79     51070
   macro avg       0.61      0.68      0.63     51070
weighted avg       0.86      0.79      0.82     51070



In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid_rf = {
    'n_estimators': [200],
    'max_depth': [10],
    'min_samples_split': [5],
    'min_samples_leaf': [2, 5],
    'class_weight': ['balanced']
}

# Initialize model
rf = RandomForestClassifier(random_state=42)

# Perform grid search
grid_search_rf = GridSearchCV(rf, param_grid_rf, scoring='f1', cv=5, verbose=2, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Get best model
best_rf = grid_search_rf.best_estimator_

# Evaluate
y_pred_rf = best_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))


NameError: name 'X_train' is not defined

In [16]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV


# Define parameter grid
param_grid_lgbm = {
    'num_leaves': [100],
    'learning_rate': [0.2],
    'n_estimators': [200]
}

# Initialize model
lgbm = lgb.LGBMClassifier(random_state=42)

# Perform grid search
grid_search_lgbm = GridSearchCV(lgbm, param_grid_lgbm, scoring='f1', cv=5, verbose=2, n_jobs=-1)
grid_search_lgbm.fit(X_train, y_train)

# Get best model
best_lgbm = grid_search_lgbm.best_estimator_

# Evaluate
y_pred_lgbm = best_lgbm.predict(X_test)
# Evaluate model
accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
print(f"Accuracy: {accuracy_lgbm:.4f}")
print(classification_report(y_test, y_pred_lgbm))


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[LightGBM] [Info] Number of positive: 23722, number of negative: 180555
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1327
[LightGBM] [Info] Number of data points in the train set: 204277, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116127 -> initscore=-2.029633
[LightGBM] [Info] Start training from score -2.029633
Accuracy: 0.8839
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     45139
           1       0.50      0.09      0.15      5931

    accuracy                           0.88     51070
   macro avg       0.70      0.54      0.54     51070
weighted avg       0.85      0.88      0.85     51070





In [26]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.05, auto_class_weights='Balanced')
model.fit(X_train, y_train)


ModuleNotFoundError: No module named 'catboost'

In [2]:
from sklearn.ensemble import StackingClassifier
final_model = StackingClassifier(
    estimators=[('xgb', xgb), ('rf', rf)],
    final_estimator=LogisticRegression()
)
final_model.fit(X_train, y_train)


NameError: name 'xgb' is not defined