--------
# Try - Model Training


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the labelled train and test splits from the artifacts directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../artifacts/train_set.csv", "../artifacts/test_set.csv")
)

In [None]:
# Remove the contiguous run Feature_1712 ... Feature_1734 from both splits.
features_to_drop = [f'Feature_{i}' for i in range(1712, 1735)]
train_df, test_df = (
    frame.drop(columns=features_to_drop) for frame in (train_df, test_df)
)

In [None]:
# Reduce multicollinearity: remove a fixed list of features from both the
# train and test frames.
cols_to_drop = ['Feature_2', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_2032']
train_df = train_df.drop(cols_to_drop, axis=1)
test_df = test_df.drop(cols_to_drop, axis=1)


In [None]:
# Clip every feature to the 5th-95th percentile range of the training data.
numeric_cols = train_df.drop(columns=['ID', 'CLASS']).columns
for col in numeric_cols:
    # Bounds are taken from the (not yet clipped) training column only.
    train_quantiles = train_df[col].quantile([0.05, 0.95])
    lower_bound = train_quantiles.iloc[0]
    upper_bound = train_quantiles.iloc[1]
    # The same training-derived bounds are applied to train and test.
    clip_bounds = dict(lower=lower_bound, upper=upper_bound)
    train_df[col] = train_df[col].clip(**clip_bounds)
    test_df[col] = test_df[col].clip(**clip_bounds)

In [None]:
# Split each frame into a feature matrix (everything except ID and CLASS)
# and the CLASS target column.
non_feature_cols = ['ID', 'CLASS']
X_train, y_train = train_df.drop(columns=non_feature_cols), train_df['CLASS']
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df['CLASS']

In [None]:
# Drop features whose training-set standard deviation is below 1e-6;
# the same columns are removed from the test matrix.
stds = X_train.std()
low_variance_cols = stds.loc[stds < 1e-6].index
print(f"Dropping {len(low_variance_cols)} near-constant features: {low_variance_cols}")
X_train, X_test = (
    matrix.drop(columns=low_variance_cols) for matrix in (X_train, X_test)
)

In [None]:
# Standardise features: the scaler is fitted on the training matrix only and
# then applied to the test matrix. Results are wrapped back into DataFrames
# so the column names are kept.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)


In [None]:
# Check for infinite/NaN values after scaling and replace them with 0.
# The cleaned frame is written back to X_train_scaled / X_test_scaled:
# rebinding only the loop variable would discard the replacement and leave
# the frames used by the models untouched.
scaled_sets = {'train': X_train_scaled, 'test': X_test_scaled}
for name, X in scaled_sets.items():
    if not np.isfinite(X.to_numpy()).all():
        print(f"Infinite or NaN values found in {name} set after scaling. Replacing with 0...")
        scaled_sets[name] = X.replace([np.inf, -np.inf], 0).fillna(0)
X_train_scaled, X_test_scaled = scaled_sets['train'], scaled_sets['test']

# Step 9: Feature Selection using Random Forest Importance (based on training data)
# A class-weighted forest is fitted on all scaled training features and its
# impurity-based importances are used to rank them.
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced').fit(
    X_train_scaled, y_train
)
feature_importance = pd.Series(rf.feature_importances_, index=X_train_scaled.columns)

# Keep the 150 highest-ranked features in both matrices.
top_features = feature_importance.nlargest(150).index
X_train_selected = X_train_scaled.loc[:, top_features]
X_test_selected = X_test_scaled.loc[:, top_features]

print("Top 10 Feature Importances:")
print(feature_importance.nlargest(10))

# Step 10: Random Forest (Enhanced Tuning)
# Each model below follows the same recipe: 5-fold grid search scored on F1
# over the selected training features, then the best estimator is evaluated
# on the test set with a classification report.
rf_param_grid = dict(
    n_estimators=[200, 300],
    max_depth=[None, 20],
    min_samples_split=[2, 5],
)
rf_base = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_grid = GridSearchCV(rf_base, rf_param_grid, cv=5, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train_selected, y_train)
rf_best = rf_grid.best_estimator_
y_pred_rf = rf_best.predict(X_test_selected)
print(f"Best Random Forest Parameters: {rf_grid.best_params_}")
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Step 11: Tune XGBoost
xgb_param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[5, 7, 10],
    n_estimators=[200, 300],
)
xgb_base = XGBClassifier(random_state=42, eval_metric='logloss')
xgb_grid = GridSearchCV(xgb_base, xgb_param_grid, cv=5, scoring='f1', n_jobs=-1)
xgb_grid.fit(X_train_selected, y_train)
xgb_best = xgb_grid.best_estimator_
y_pred_xgb = xgb_best.predict(X_test_selected)
print(f"Best XGBoost Parameters: {xgb_grid.best_params_}")
print("Tuned XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

# Step 12: Tune Logistic Regression
lr_param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    solver=['liblinear', 'lbfgs'],
)
lr_base = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
lr_grid = GridSearchCV(lr_base, lr_param_grid, cv=5, scoring='f1', n_jobs=-1)
lr_grid.fit(X_train_selected, y_train)
lr_best = lr_grid.best_estimator_
y_pred_lr = lr_best.predict(X_test_selected)
print(f"Best Logistic Regression Parameters: {lr_grid.best_params_}")
print("Tuned Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))

# Step 13: Cross-Validation F1-Score for All Models (on training data)
# NOTE(review): X_train_selected was chosen using all training rows, so these
# fold scores are computed on features that already saw the held-out folds.
models = dict(zip(
    ('Random Forest', 'XGBoost', 'Logistic Regression'),
    (rf_best, xgb_best, lr_best),
))
for name in models:
    model = models[name]
    cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5, scoring='f1')
    print(f"{name} Cross-Validation F1-Scores (on training data): {cv_scores}")
    print(f"Average CV F1-Score for {name}: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

Dropping 159 near-constant features: Index(['Feature_1910', 'Feature_1911', 'Feature_1912', 'Feature_1913',
       'Feature_1914', 'Feature_1915', 'Feature_1916', 'Feature_1917',
       'Feature_1918', 'Feature_1919',
       ...
       'Feature_2981', 'Feature_2982', 'Feature_3097', 'Feature_3098',
       'Feature_3104', 'Feature_3107', 'Feature_3225', 'Feature_3226',
       'Feature_3232', 'Feature_3235'],
      dtype='object', length=159)
Top 10 Feature Importances:
Feature_3038    0.005962
Feature_1982    0.004320
Feature_1905    0.003341
Feature_3085    0.002977
Feature_3166    0.002975
Feature_2965    0.002781
Feature_3150    0.002723
Feature_1701    0.002636
Feature_2909    0.002607
Feature_2912    0.002572
dtype: float64
Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.78      0.70        58
           1       0

In [None]:
# Read the labelled train/test splits plus the unseen dataset
# (without CLASS column). Adjust file paths as needed.
train_df, test_df, unseen_df = map(
    pd.read_csv,
    (
        "../artifacts/train_set.csv",
        "../artifacts/test_set.csv",
        "../artifacts/blinded_test_set.csv",
    ),
)

----

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score, accuracy_score, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
import joblib
import warnings
warnings.filterwarnings('ignore')

# Load the labelled train/test splits and the unseen dataset
# (without CLASS column).
train_df, test_df, unseen_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../artifacts/train_set.csv",
        "../artifacts/test_set.csv",
        "../artifacts/blinded_test_set.csv",
    )
)

# Step 2: Handle Missing Values
# Drop features 1712 to 1734 from the train and test frames.
features_to_drop = [f'Feature_{i}' for i in range(1712, 1735)]
train_df, test_df = (
    frame.drop(columns=features_to_drop) for frame in (train_df, test_df)
)

# Step 3: Reduce Multicollinearity
# Drop Feature_2, Feature_6, Feature_7, Feature_8, Feature_9, Feature_2032
cols_to_drop = ['Feature_2', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_2032']
train_df = train_df.drop(cols_to_drop, axis=1)
test_df = test_df.drop(cols_to_drop, axis=1)

# Step 4: Clip Extreme Values Across All Features (5th to 95th percentiles)
numeric_cols = train_df.drop(columns=['ID', 'CLASS']).columns
for col in numeric_cols:
    # Bounds come from the (not yet clipped) training column only.
    train_quantiles = train_df[col].quantile([0.05, 0.95])
    lower_bound = train_quantiles.iloc[0]
    upper_bound = train_quantiles.iloc[1]
    clip_bounds = dict(lower=lower_bound, upper=upper_bound)
    train_df[col] = train_df[col].clip(**clip_bounds)
    test_df[col] = test_df[col].clip(**clip_bounds)

# Step 6: Prepare Features and Target (Skipping Log Transformation)
non_feature_cols = ['ID', 'CLASS']
X_train, y_train = train_df.drop(columns=non_feature_cols), train_df['CLASS']
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df['CLASS']

# Step 7: Drop Near-Constant Features (based on training data)
stds = X_train.std()
low_variance_cols = stds.loc[stds < 1e-6].index
print(f"Dropping {len(low_variance_cols)} near-constant features: {low_variance_cols}")
X_train, X_test = (
    matrix.drop(columns=low_variance_cols) for matrix in (X_train, X_test)
)

# Step 8: Feature Scaling (fit on training data, transform test)
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Check for infinite/NaN values after scaling and replace them with 0.
# The cleaned frame is written back to X_train_scaled / X_test_scaled:
# rebinding only the loop variable would discard the replacement and leave
# the frames used by the models untouched.
scaled_sets = {'train': X_train_scaled, 'test': X_test_scaled}
for name, X in scaled_sets.items():
    if not np.isfinite(X.to_numpy()).all():
        print(f"Infinite or NaN values found in {name} set after scaling. Replacing with 0...")
        scaled_sets[name] = X.replace([np.inf, -np.inf], 0).fillna(0)
X_train_scaled, X_test_scaled = scaled_sets['train'], scaled_sets['test']

# Step 9: Feature Selection using Random Forest Importance (based on training data)
# A class-weighted forest is fitted on all scaled training features and its
# impurity-based importances are used to rank them.
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced').fit(
    X_train_scaled, y_train
)
feature_importance = pd.Series(rf.feature_importances_, index=X_train_scaled.columns)

# Keep the 150 highest-ranked features in both matrices.
top_features = feature_importance.nlargest(150).index
X_train_selected = X_train_scaled.loc[:, top_features]
X_test_selected = X_test_scaled.loc[:, top_features]

print("Top 10 Feature Importances:")
print(feature_importance.nlargest(10))

# Function to compute all metrics
def compute_metrics(y_true, y_pred, y_pred_proba, model_name):
    """Print evaluation metrics for a binary (0/1) classifier and return macro F1.

    Parameters
    ----------
    y_true : array-like
        True labels; class 1 is treated as the positive class.
    y_pred : array-like
        Predicted labels.
    y_pred_proba : array-like
        Scores for class 1, passed to ``roc_auc_score``.
    model_name : str
        Label used in the printed header.

    Returns
    -------
    float
        Macro-averaged F1-score of ``y_pred`` against ``y_true``.
    """
    print(f"\n{model_name} Metrics (on Test Set):")
    # Accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.3f}")

    # AUROC is undefined when y_true holds a single class; report NaN there
    # instead of letting roc_auc_score abort the whole report.
    if len(np.unique(y_true)) < 2:
        auroc = float('nan')
    else:
        auroc = roc_auc_score(y_true, y_pred_proba)
    print(f"AUROC: {auroc:.3f}")

    # Confusion Matrix for Sensitivity and Specificity.
    # labels=[0, 1] pins the matrix to 2x2 so the 4-way unpack is always valid,
    # even if one class is absent from both y_true and y_pred.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # Guard the denominators: with no positives (or no negatives) the rate is
    # undefined and is reported as NaN.
    sensitivity = tp / (tp + fn) if (tp + fn) else float('nan')  # Recall/TPR for class 1
    specificity = tn / (tn + fp) if (tn + fp) else float('nan')  # TNR for class 0
    print(f"Sensitivity (Recall/TPR): {sensitivity:.3f}")
    print(f"Specificity (TNR): {specificity:.3f}")

    # F1-score for class 1
    f1_class1 = f1_score(y_true, y_pred, pos_label=1)
    print(f"F1-score (class 1): {f1_class1:.3f}")

    # Macro average F1-score
    f1_macro = f1_score(y_true, y_pred, average='macro')
    print(f"F1-score (macro avg): {f1_macro:.3f}")

    # Classification Report
    print("Classification Report:")
    print(classification_report(y_true, y_pred))

    return f1_macro

# Step 10: Random Forest (Enhanced Tuning)
# Each model below follows the same recipe: 5-fold grid search scored on F1
# over the selected training features, then the best estimator is scored on
# the test set via compute_metrics (which returns the macro-average F1).
rf_param_grid = dict(
    n_estimators=[200, 300],
    max_depth=[None, 20],
    min_samples_split=[2, 5],
)
rf_base = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_grid = GridSearchCV(rf_base, rf_param_grid, cv=5, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train_selected, y_train)
rf_best = rf_grid.best_estimator_
y_pred_rf = rf_best.predict(X_test_selected)
# Column 1 of predict_proba holds the probabilities for class 1.
y_pred_rf_proba = rf_best.predict_proba(X_test_selected)[:, 1]
f1_rf = compute_metrics(y_test, y_pred_rf, y_pred_rf_proba, "Random Forest")
print(f"Best Random Forest Parameters: {rf_grid.best_params_}")

# Step 11: Tune XGBoost
xgb_param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[5, 7, 10],
    n_estimators=[200, 300],
)
xgb_base = XGBClassifier(random_state=42, eval_metric='logloss')
xgb_grid = GridSearchCV(xgb_base, xgb_param_grid, cv=5, scoring='f1', n_jobs=-1)
xgb_grid.fit(X_train_selected, y_train)
xgb_best = xgb_grid.best_estimator_
y_pred_xgb = xgb_best.predict(X_test_selected)
y_pred_xgb_proba = xgb_best.predict_proba(X_test_selected)[:, 1]
f1_xgb = compute_metrics(y_test, y_pred_xgb, y_pred_xgb_proba, "XGBoost")
print(f"Best XGBoost Parameters: {xgb_grid.best_params_}")

# Step 12: Tune Logistic Regression
lr_param_grid = dict(
    C=[0.5, 1, 2, 5, 10],
    solver=['liblinear', 'lbfgs'],
)
# Class 1 is weighted 1.5x relative to class 0.
lr_base = LogisticRegression(class_weight={0: 1, 1: 1.5}, random_state=42, max_iter=1000)
lr_grid = GridSearchCV(lr_base, lr_param_grid, cv=5, scoring='f1', n_jobs=-1)
lr_grid.fit(X_train_selected, y_train)
lr_best = lr_grid.best_estimator_
y_pred_lr = lr_best.predict(X_test_selected)
y_pred_lr_proba = lr_best.predict_proba(X_test_selected)[:, 1]
f1_lr = compute_metrics(y_test, y_pred_lr, y_pred_lr_proba, "Logistic Regression")
print(f"Best Logistic Regression Parameters: {lr_grid.best_params_}")

# Step 13: Select and Save the Best Model
# The winner is the model with the highest macro-average F1.
# NOTE(review): f1_rf / f1_xgb / f1_lr are computed on the test set, so the
# test score of the selected model is an optimistic estimate.
f1_scores = {
    'Random Forest': f1_rf,
    'XGBoost': f1_xgb,
    'Logistic Regression': f1_lr
}
best_model_name = max(f1_scores, key=f1_scores.get)
# Logistic Regression is the fallback for any name other than the two below.
tuned_models = {'Random Forest': rf_best, 'XGBoost': xgb_best}
best_model = tuned_models.get(best_model_name, lr_best)
print(f"\nBest Model: {best_model_name} with Macro Avg F1-score: {f1_scores[best_model_name]:.3f}")
# File name is the lower-cased model name with spaces replaced by underscores.
joblib.dump(best_model, f'{best_model_name.lower().replace(" ", "_")}_model.pkl')
print(f"Best model saved as '{best_model_name.lower().replace(' ', '_')}_model.pkl'")

# Step 14: Cross-Validation F1-Score for All Models (on training data)
# NOTE(review): X_train_selected was chosen using all training rows, so these
# fold scores are computed on features that already saw the held-out folds.
models = dict(zip(
    ('Random Forest', 'XGBoost', 'Logistic Regression'),
    (rf_best, xgb_best, lr_best),
))
for name in models:
    model = models[name]
    cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5, scoring='f1')
    print(f"\n{name} Cross-Validation F1-Scores (on training data): {cv_scores}")
    print(f"Average CV F1-Score for {name}: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

Dropping 159 near-constant features: Index(['Feature_1910', 'Feature_1911', 'Feature_1912', 'Feature_1913',
       'Feature_1914', 'Feature_1915', 'Feature_1916', 'Feature_1917',
       'Feature_1918', 'Feature_1919',
       ...
       'Feature_2981', 'Feature_2982', 'Feature_3097', 'Feature_3098',
       'Feature_3104', 'Feature_3107', 'Feature_3225', 'Feature_3226',
       'Feature_3232', 'Feature_3235'],
      dtype='object', length=159)
Top 10 Feature Importances:
Feature_3038    0.005962
Feature_1982    0.004320
Feature_1905    0.003341
Feature_3085    0.002977
Feature_3166    0.002975
Feature_2965    0.002781
Feature_3150    0.002723
Feature_1701    0.002636
Feature_2909    0.002607
Feature_2912    0.002572
dtype: float64

Random Forest Metrics (on Test Set):
Accuracy: 0.620
AUROC: 0.659
Sensitivity (Recall/TPR): 0.405
Specificity (TNR): 0.776
F1-score (class 1): 0.472
F1-score (macro avg): 0.588
Classification Report:
              precision    recall  f1-score   support

      

In [None]:


# Load the training and test datasets (training data drives the preprocessing)
# together with the unseen dataset (without CLASS column).
train_df, test_df, unseen_df = map(
    pd.read_csv,
    (
        "../artifacts/train_set.csv",
        "../artifacts/test_set.csv",
        "../artifacts/blinded_test_set.csv",
    ),
)


# Step 2: Handle Missing Values (for all datasets)
# Drop features 1712 to 1734; labels missing from the unseen frame are skipped.
features_to_drop = [f'Feature_{i}' for i in range(1712, 1735)]
train_df = train_df.drop(columns=features_to_drop)
unseen_df = unseen_df.drop(columns=features_to_drop, errors='ignore')

# Step 3: Reduce Multicollinearity (for all datasets)
# Drop Feature_2, Feature_6, Feature_7, Feature_8, Feature_9, Feature_2032
cols_to_drop = ['Feature_2', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_2032']
train_df = train_df.drop(columns=cols_to_drop)
unseen_df = unseen_df.drop(columns=cols_to_drop, errors='ignore')

# Step 4: Clip Extreme Values Across All Features (5th to 95th percentiles)
numeric_cols = train_df.drop(columns=['ID', 'CLASS']).columns
for col in numeric_cols:
    # Bounds come from the (not yet clipped) training column only.
    train_quantiles = train_df[col].quantile([0.05, 0.95])
    lower_bound = train_quantiles.iloc[0]
    upper_bound = train_quantiles.iloc[1]
    clip_bounds = dict(lower=lower_bound, upper=upper_bound)
    train_df[col] = train_df[col].clip(**clip_bounds)
    # Columns the unseen frame does not have are left alone.
    if col in unseen_df.columns:
        unseen_df[col] = unseen_df[col].clip(**clip_bounds)

# Step 6: Prepare Features (Skipping Log Transformation)
X_train = train_df.drop(columns=['ID', 'CLASS'])
# Unseen data has no CLASS column; ID is dropped only if present.
X_unseen = unseen_df.drop(columns='ID', errors='ignore')

# Step 7: Drop Near-Constant Features (based on training data)
stds = X_train.std()
low_variance_cols = stds.loc[stds < 1e-6].index
print(f"Dropping {len(low_variance_cols)} near-constant features: {low_variance_cols}")
X_train = X_train.drop(columns=low_variance_cols)
X_unseen = X_unseen.drop(columns=low_variance_cols, errors='ignore')

# Step 8: Feature Scaling (fit on training data, transform unseen)
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_unseen_scaled = pd.DataFrame(scaler.transform(X_unseen), columns=X_unseen.columns)

# Check for infinite/NaN values after scaling and replace them with 0.
# The cleaned frame is written back to X_train_scaled / X_unseen_scaled:
# rebinding only the loop variable would discard the replacement and leave
# the frames used for feature selection and prediction untouched.
scaled_sets = {'train': X_train_scaled, 'unseen': X_unseen_scaled}
for name, X in scaled_sets.items():
    if not np.isfinite(X.to_numpy()).all():
        print(f"Infinite or NaN values found in {name} set after scaling. Replacing with 0...")
        scaled_sets[name] = X.replace([np.inf, -np.inf], 0).fillna(0)
X_train_scaled, X_unseen_scaled = scaled_sets['train'], scaled_sets['unseen']

# Step 9: Feature Selection using Random Forest Importance (based on training data)
# The target is taken from the train_df loaded in this cell, so the fit does
# not depend on a y_train left over from an earlier cell. train_df still holds
# CLASS here; only X_train had it dropped.
y_train = train_df['CLASS']
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf.fit(X_train_scaled, y_train)

# Select top 150 features
feature_importance = pd.Series(rf.feature_importances_, index=X_train_scaled.columns)
top_features = feature_importance.nlargest(150).index
X_train_selected = X_train_scaled[top_features]
X_unseen_selected = X_unseen_scaled[top_features]
print("Top 10 Feature Importances:")
print(feature_importance.nlargest(10))

# Step 10: Load the Saved Best Model
# The file name is hard-coded; adjust it to whichever model was actually saved
# (e.g., 'logistic_regression_model.pkl').
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
model_path = 'logistic_regression_model.pkl'  # Update with actual file name
best_model = joblib.load(model_path)

# Step 11: Predict on Unseen Dataset
y_pred_unseen = best_model.predict(X_unseen_selected)

# Build the predictions table. Rows are identified by the unseen data's ID
# column when it exists, otherwise by a positional 'Index' column.
has_id = 'ID' in unseen_df.columns
id_label = 'ID' if has_id else 'Index'
id_values = unseen_df['ID'] if has_id else range(len(y_pred_unseen))
predictions_df = pd.DataFrame({
    id_label: id_values,
    'Predicted_CLASS': y_pred_unseen
})

# Output the predictions
print("Predictions for Unseen Dataset:")
print(predictions_df)

# Save predictions to CSV (without the DataFrame index)
predictions_df.to_csv('unseen_predictions.csv', index=False)
print("Predictions saved to 'unseen_predictions.csv'")

Dropping 159 near-constant features: Index(['Feature_1910', 'Feature_1911', 'Feature_1912', 'Feature_1913',
       'Feature_1914', 'Feature_1915', 'Feature_1916', 'Feature_1917',
       'Feature_1918', 'Feature_1919',
       ...
       'Feature_2981', 'Feature_2982', 'Feature_3097', 'Feature_3098',
       'Feature_3104', 'Feature_3107', 'Feature_3225', 'Feature_3226',
       'Feature_3232', 'Feature_3235'],
      dtype='object', length=159)
Top 10 Feature Importances:
Feature_3038    0.005962
Feature_1982    0.004320
Feature_1905    0.003341
Feature_3085    0.002977
Feature_3166    0.002975
Feature_2965    0.002781
Feature_3150    0.002723
Feature_1701    0.002636
Feature_2909    0.002607
Feature_2912    0.002572
dtype: float64
Predictions for Unseen Dataset:
        ID  Predicted_CLASS
0   ID_101                0
1   ID_102                1
2   ID_103                0
3   ID_104                0
4   ID_105                0
5   ID_106                1
6   ID_107                0
7   ID