In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier

# Load the competition data. The competition must be attached to the notebook
# so that these input paths exist.
train = pd.read_csv('/kaggle/input/mc-datathon-2025-churn-detection/train.csv')
test = pd.read_csv('/kaggle/input/mc-datathon-2025-churn-detection/test.csv')

# Keep an independent copy of the test ids for the submission file.
test_ids = test['id'].copy()

# Encode the target as 1 ('Yes') / 0 ('No').
train['Churn'] = train['Churn'].map({'Yes': 1, 'No': 0})

# Make 'TotalCharges' numeric in both frames: values that cannot be parsed
# are coerced to NaN and then, like missing values, replaced by 0.
for frame in (train, test):
    total_charges = pd.to_numeric(frame['TotalCharges'], errors='coerce')
    frame['TotalCharges'] = total_charges.fillna(0)

# Identify the columns that need encoding: every column that is not already
# numeric. Testing for "not numeric" rather than "object dtype" also catches
# text stored with pandas' dedicated string or categorical dtypes, which would
# otherwise be passed to the model un-encoded.
cat_cols = [
    col for col in train.columns
    if not pd.api.types.is_numeric_dtype(train[col])
]

# Label-encode each categorical column. The encoder is fitted on train and
# test together so that a category present in only one of them still gets a
# code and the codes agree between the two frames.
n_train = len(train)
for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([train[col], test[col]]).astype(str)
    # `combined` is train rows followed by test rows, so the encoded array
    # can be split back by position.
    codes = le.fit_transform(combined)
    train[col] = codes[:n_train]
    test[col] = codes[n_train:]

# Build the training matrix and target; 'id' and the target itself are
# excluded from the features.
X = train.drop(columns=['id', 'Churn'])
y = train['Churn']

# Select the test features through the training column list so the test
# matrix has exactly the training features in the same order. A column that
# is missing from the test data raises a KeyError here instead of surfacing
# later at prediction time.
X_test = test[X.columns]

# Train the LightGBM classifier (fixed seed, balanced class weights,
# 100 boosting rounds).
model = LGBMClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
model.fit(X, y)

# Column 1 of predict_proba is the probability of class 1, i.e. churn "Yes".
probs = model.predict_proba(X_test)[:, 1]

# üü¢üü• Convert probabilities to binary predictions (threshold 0.5)
final_preds = (probs > 0.5).astype(int)

# üîÑ Map predictions back to "Yes"/"No"
churn_labels = pd.Series(final_preds).map({1: "Yes", 0: "No"})

# üíæ Create submission DataFrame
submission = pd.DataFrame({
    "id": test_ids,
    "Churn": churn_labels
})

# ‚úÖ Save submission to Kaggle's output directory
submission.to_csv("/kaggle/working/submission.csv", index=False)

# üîç Optional: Show a preview
print("‚úÖ submission.csv generated!")
print(submission.head())


[LightGBM] [Info] Number of positive: 1495, number of negative: 4139
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003101 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 626
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
‚úÖ submission.csv generated!
   id Churn
0   0    No
1   1   Yes
2   2    No
3   3   Yes
4   4    No
