In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Training and Testing on the Dev_data_to_be_shared(train.csv)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score, f1_score
import pandas as pd
import numpy as np

# Step 1: Load and Split Data
df = pd.read_csv('/content/drive/MyDrive/convolve/train.csv')
df.set_index('account_number',inplace=True)
X = df.drop("bad_flag", axis=1)  # Features
y = df["bad_flag"]  # Target class label

# Step 2: Remove rows with more than 40% missing values (except where class label is 1)
missing_percentage = X.isnull().mean(axis=1)
rows_to_remove = (missing_percentage > 0.4) & (y != 1)  # Remove rows with >40% missing, but keep class label 1
X_cleaned = X[~rows_to_remove]
y_cleaned = y[~rows_to_remove]

# Train-test split (after cleaning)
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.2, stratify=y_cleaned, random_state=42)

# Step 3: Feature Selection using Double Ensemble

# 3.1: Train Random Forest and XGBoost to get feature importances
rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth = 10, criterion="gini", class_weight="balanced")
rf.fit(X_train, y_train)

xgb = XGBClassifier(n_estimators=100, random_state=42, scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]))
xgb.fit(X_train, y_train)

# 3.2: Get Feature Importances from Random Forest and XGBoost
rf_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
xgb_importances = pd.Series(xgb.feature_importances_, index=X_train.columns)

# 3.3: Combine the Importances (average them)
combined_importance = (rf_importances + xgb_importances) / 2
sorted_importance = combined_importance.sort_values(ascending=False)

# 3.4: Select Top Features based on Combined Importance
top_features = sorted_importance.head(600).index  # You can adjust the number of top features here

# Step 4: Train Models on the Selected Features

# 4.1: Select Features from the Training and Test Sets
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

# 4.2: Train Random Forest and LightGBM with the selected features
rf_final = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
rf_final.fit(X_train_selected, y_train)

lgb_final = LGBMClassifier(n_estimators=100, random_state=42, class_weight="balanced")
lgb_final.fit(X_train_selected, y_train)

# Step 5: Evaluate the Model

# 5.1: Get Predictions from Random Forest and LightGBM
rf_probs = rf_final.predict_proba(X_test_selected)[:, 1]
lgb_probs = lgb_final.predict_proba(X_test_selected)[:, 1]

# 5.2: Combine Predictions for Final Decision
combined_probs = (rf_probs + lgb_probs) / 2

# 5.3: Make Final Predictions based on Combined Probabilities
threshold = 0.5  # Default threshold for binary classification
y_pred = (combined_probs >= threshold).astype(int)

# 5.4: Evaluate the Ensemble Model
roc_auc = roc_auc_score(y_test, combined_probs)
f1 = f1_score(y_test, y_pred)

# Output Evaluation Metrics
print(f"AUC Score: {roc_auc:.4f}")
print(f"F1 Score: {f1:.4f}")

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))


[LightGBM] [Info] Number of positive: 1098, number of negative: 55799
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.142906 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 103668
[LightGBM] [Info] Number of data points in the train set: 56897, number of used features: 600
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
AUC Score: 0.8529
F1 Score: 0.3944
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     13951
           1       0.86      0.26      0.39       274

    accuracy                           0.98     14225
   macro avg       0.92      0.63      0.69     14225
weighted avg       0.98      0.98      0.98     14225



Training on the Dev_data_to_be_shared(train.csv)

In [4]:
# Step 1: Load Data
df = pd.read_csv('/content/drive/MyDrive/convolve/train.csv')
df.set_index('account_number',inplace=True)
X = df.drop("bad_flag", axis=1)  # Features
y = df["bad_flag"]  # Target class label

# Step 2: Remove rows with more than 40% missing values (except where class label is 1)
missing_percentage = X.isnull().mean(axis=1)
rows_to_remove = (missing_percentage > 0.4) & (y != 1)  # Remove rows with >40% missing, but keep class label 1
X_cleaned = X[~rows_to_remove]
y_cleaned = y[~rows_to_remove]

# Step 3: Feature Selection using Double Ensemble

# 3.1: Train Random Forest and XGBoost to get feature importances
rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth = 10, criterion="gini", class_weight="balanced")
rf.fit(X_cleaned, y_cleaned)

xgb = XGBClassifier(n_estimators=100, random_state=42, scale_pos_weight=(y_cleaned.value_counts()[0] / y_cleaned.value_counts()[1]))
xgb.fit(X_cleaned, y_cleaned)

# 3.2: Get Feature Importances from Random Forest and XGBoost
rf_importances = pd.Series(rf.feature_importances_, index=X_cleaned.columns)
xgb_importances = pd.Series(xgb.feature_importances_, index=X_cleaned.columns)

# 3.3: Combine the Importances (average them)
combined_importance = (rf_importances + xgb_importances) / 2
sorted_importance = combined_importance.sort_values(ascending=False)

# 3.4: Select Top Features based on Combined Importance
top_features = sorted_importance.head(600).index  # You can adjust the number of top features here

# Step 4: Train Models on the Selected Features

# 4.1: Select Features from the Entire Dataset (since we're training on the whole dataset)
X_selected = X_cleaned[top_features]

# 4.2: Train Random Forest and LightGBM with the selected features
rf_final = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
rf_final.fit(X_selected, y_cleaned)

lgb_final = LGBMClassifier(n_estimators=100, random_state=42, class_weight="balanced")
lgb_final.fit(X_selected, y_cleaned)

# Step 5: Evaluate the Model

# 5.1: Get Predictions from Random Forest and LightGBM
rf_probs = rf_final.predict_proba(X_selected)[:, 1]
lgb_probs = lgb_final.predict_proba(X_selected)[:, 1]

# 5.2: Combine Predictions for Final Decision
combined_probs = (rf_probs + lgb_probs) / 2

# 5.3: Make Final Predictions based on Combined Probabilities
threshold = 0.5  # Default threshold for binary classification
y_pred = (combined_probs >= threshold).astype(int)

# 5.4: Evaluate the Ensemble Model
roc_auc = roc_auc_score(y_cleaned, combined_probs)
f1 = f1_score(y_cleaned, y_pred)

# Output Evaluation Metrics
print(f"AUC Score: {roc_auc:.4f}")
print(f"F1 Score: {f1:.4f}")

# Classification Report
print("Classification Report:")
print(classification_report(y_cleaned, y_pred))


[LightGBM] [Info] Number of positive: 1372, number of negative: 69750
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.615231 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106164
[LightGBM] [Info] Number of data points in the train set: 71122, number of used features: 600
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
AUC Score: 1.0000
F1 Score: 0.9993
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     69750
           1       1.00      1.00      1.00      1372

    accuracy                           1.00     71122
   macro avg       1.00      1.00      1.00     71122
weighted avg       1.00      1.00      1.00     71122



Testing on the Validation_data_to_be_shared(test.csv)

In [12]:
# Step 1: Load Data (assuming testing data is stored in another CSV file)
test_df = pd.read_csv('/content/drive/MyDrive/convolve/test.csv')

# Step 2: Set 'account_number' as the index
test_df.set_index('account_number', inplace=True)

# Step 3: Apply the same feature selection to the testing data
# Ensure we select only the features that were used for training, even if there are missing values
X_test_selected = test_df[top_features]

# Step 4: Get Predictions from Random Forest and LightGBM (on the test data)
rf_probs_test = rf_final.predict_proba(X_test_selected)[:, 1]  # Probabilities from Random Forest
lgb_probs_test = lgb_final.predict_proba(X_test_selected)[:, 1]  # Probabilities from LightGBM

# Step 5: Combine the Probabilities from both models for the final prediction
combined_probs_test = (rf_probs_test + lgb_probs_test) / 2

# Step 6: Get the Account Number and Prediction Probabilities
# The 'account_number' is now the index, so no need to extract it separately
test_output = pd.DataFrame({
    'prediction_probability': combined_probs_test
}, index=test_df.index)  # Use the index (account_number) as the row identifier

# Step 7: Save the Result to a CSV file
test_output.to_csv('/content/drive/MyDrive/convolve/test_predictions.csv')

# Confirming the output
print("Predictions saved to '/content/drive/MyDrive/convolve/test_predictions.csv'.")


Predictions saved to '/content/drive/MyDrive/convolve/test_predictions.csv'.


In [14]:
test_output.head(20)

Unnamed: 0_level_0,prediction_probability
account_number,Unnamed: 1_level_1
100001,0.117141
100002,0.999873
100003,0.01197
100004,0.994862
100005,0.044581
100006,0.52954
100007,0.999859
100008,0.004189
100009,0.25047
100010,0.040157
