Import Libraries

In [None]:
import pandas as pd
import lightgbm as lgb
import math
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, log_loss, classification_report, confusion_matrix

Load Data

In [None]:
# Read the preprocessed dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="preprocessed_data.csv")

Drop Latitude and Longitude, then Balance Classes by Undersampling

In [None]:
# Coordinates are excluded from the feature set.
df = df.drop(columns=["Latitude", "Longitude"])

# Split the rows by class so that class 0 can be undersampled.
df_0 = df[df["label"] == 0]
df_1 = df[df["label"] == 1]

# Target a 1:1 ratio of class-0 to class-1 rows, but never ask for more
# class-0 rows than exist: DataFrame.sample draws without replacement by
# default and raises ValueError when n exceeds the population size.
n_class_0 = min(len(df_0), math.floor(len(df_1) * 1.0))

# NOTE(review): this resampling runs before the train/test split, so the
# held-out rows are drawn from the balanced data as well and may not reflect
# the class ratio of the raw dataset.
df = pd.concat([df_0.sample(n=n_class_0, random_state=42), df_1], axis=0)

# Shuffle so the two classes are interleaved instead of stacked.
df = shuffle(df, random_state=42)

Separate Features and Labels

In [None]:
# The "label" column is the target; every remaining column is a feature.
y = df["label"]
X = df.drop("label", axis=1)

Train/Test Split

In [None]:
# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# partition is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

Fit the Model on Training Data

In [None]:
# Hyperparameters are gathered in one mapping and unpacked into the estimator.
lgbm_params = {
    # Objective and task
    "objective": "binary",
    "metric": "binary_logloss",  # alternative tried: "auc"
    "boosting_type": "gbdt",
    # Tree structure
    "max_depth": -1,
    # Learning
    "learning_rate": 0.1,
    "n_estimators": 100,
    # Class imbalance handling is currently disabled. Alternatives tried:
    #   scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum()
    #   scale_pos_weight=1.5
    #   is_unbalance=True
    # Other
    "random_state": 42,
    "n_jobs": -1,
    "verbose": 1,
    "importance_type": "gain",
}
model = lgb.LGBMClassifier(**lgbm_params)

# The held-out split is passed as the evaluation set and scored with
# log loss (alternative tried: eval_metric="auc").
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="binary_logloss",
)

Model Evaluation and Scoring

In [None]:
# Predictions on the held-out split: positive-class probabilities (column 1
# of predict_proba) and hard class labels.
y_pred_class = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Threshold-free metrics computed from the probabilities.
auc = roc_auc_score(y_test, y_pred_proba)
logloss = log_loss(y_test, y_pred_proba)

print(f"Test ROC-AUC: {auc:.4f}")
print(f"Test Log Loss: {logloss:.4f}")

# Each remaining section is printed as a blank line, a heading, then the value.
report_sections = (
    ("Class distribution in test set:", y_test.value_counts(normalize=True)),
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred_class)),
    ("Classification Report:", classification_report(y_test, y_pred_class)),
)
for heading, payload in report_sections:
    print("\n" + heading)
    print(payload)

# Feature Importance Analysis
# feature_importance = pd.DataFrame({
#     'feature': X_train.columns,
#     'importance': model.feature_importances_
# }).sort_values('importance', ascending=False)

# print("\nTop 20 Most Important Features:")
# print(feature_importance.head(20).to_string(index=False))

# print(f"\nTotal features: {len(feature_importance)}")
# print(f"Features with zero importance: {(feature_importance['importance'] == 0).sum()}")