In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import SMOTENC, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd

# Pull the label column out of the frame; every remaining column is a feature.
y = df_final['target']
X = df_final.drop(columns=['target'])

# Positional indices of the object/category-typed feature columns
# (SMOTENC identifies categorical features by position).
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
categorical_feature_indices = list(map(X.columns.get_loc, categorical_columns))

# Hold out 20% of the rows for testing; stratify so both splits keep the
# class proportions of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Per-class size limits used by the resampling steps below:
# classes larger than `undersampling_target` are cut down to it, and classes
# smaller than `oversampling_target` are synthetically grown up to it.
undersampling_target = 5000
oversampling_target = 6000

# Step 1: Undersample the majority classes.
# The strategy maps each class to its desired sample count; classes already
# at or below the cap keep their original size.
undersampling_strategy = {cls: min(count, undersampling_target)
                          for cls, count in Counter(y_train).items()}
undersampler = RandomUnderSampler(sampling_strategy=undersampling_strategy, random_state=42)
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)

# Step 2: SMOTE-style oversampling of every class below `oversampling_target`.
oversampling_strategy = {cls: max(count, oversampling_target)
                         for cls, count in Counter(y_train_under).items()}
if categorical_feature_indices:
    # Mixed numerical/categorical features: SMOTENC handles the categorical
    # columns identified by position.
    smote = SMOTENC(categorical_features=categorical_feature_indices,
                    sampling_strategy=oversampling_strategy,
                    random_state=42)
else:
    # SMOTENC rejects data without any categorical feature, so fall back to
    # plain SMOTE when the automatic detection above found none.
    from imblearn.over_sampling import SMOTE
    smote = SMOTE(sampling_strategy=oversampling_strategy, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_under, y_train_under)

# Step 3: ADASYN oversampling (default strategy) of the undersampled data.
# NOTE(review): ADASYN has no categorical-feature handling; if
# `categorical_feature_indices` is non-empty, confirm those columns are
# numerically encoded before relying on this output. The comparison loop in
# this cell trains on the undersampled and SMOTE outputs only.
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_under, y_train_under)

# Step 4: Define models for comparison
# scale_pos_weight = (#negatives / #positives), treating class 1 as positive.
# Count once with a vectorised comparison instead of iterating the Series
# twice with the builtin sum().
n_positive = int((y_train == 1).sum())
n_negative = len(y_train) - n_positive
# Without any class-1 sample the ratio is undefined; use XGBoost's neutral
# default of 1.0 rather than dividing by zero.
xgb_scale_pos_weight = n_negative / n_positive if n_positive else 1.0
# NOTE(review): scale_pos_weight is a binary-classification setting; if
# `target` has more than two classes, per-sample weights are the usual
# alternative — confirm. The ratio is also taken from the original `y_train`,
# while the loop below fits this model on resampled data — confirm that
# combination is intended.

models = {
    "BalancedRandomForest": BalancedRandomForestClassifier(n_estimators=100, random_state=42),
    "EasyEnsemble": EasyEnsembleClassifier(n_estimators=100, random_state=42),
    "RandomForest_Weighted": RandomForestClassifier(class_weight='balanced', random_state=42),
    "XGBoost_ScalePosWeight": XGBClassifier(
        scale_pos_weight=xgb_scale_pos_weight,
        random_state=42
    )
}

# Fit every candidate on its resampled training set and score it on the
# untouched test split.
# The imbalanced-learn ensembles are fed the undersampled data; every other
# model is fed the SMOTE-oversampled data.
undersampled_only = ("BalancedRandomForest", "EasyEnsemble")

results = {}
for model_name, model in models.items():
    if model_name in undersampled_only:
        X_train_balanced, y_train_balanced = X_train_under, y_train_under
    else:
        X_train_balanced, y_train_balanced = X_train_smote, y_train_smote

    # Fit, then predict on the held-out test features.
    y_pred = model.fit(X_train_balanced, y_train_balanced).predict(X_test)

    # Accuracy and weighted F1 are computed directly; precision and recall
    # are read from the macro-average row of the classification report.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    macro_avg = classification_rep['macro avg']

    results[model_name] = dict(
        accuracy=accuracy,
        f1_score=f1,
        precision=macro_avg['precision'],
        recall=macro_avg['recall'],
    )

# One row per model, best weighted F1 first.
results_df = pd.DataFrame(results).transpose()
print(results_df.sort_values(by='f1_score', ascending=False))

# 1. Use Ensemble Methods like BalancedRandomForest or EasyEnsemble:
- BalancedRandomForest: An ensemble method that combines random under-sampling with bagging. It trains multiple decision trees on different balanced subsets of the data.
- EasyEnsemble: Creates multiple balanced subsets by randomly under-sampling the majority class, trains a boosted (AdaBoost) classifier on each subset, then aggregates their predictions.

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from sklearn.metrics import classification_report, accuracy_score

# Balanced Random Forest: fit on the training split, then print the per-class
# report for the test split.
brf = BalancedRandomForestClassifier(random_state=42, n_estimators=100)
y_pred_brf = brf.fit(X_train, y_train).predict(X_test)
print("Balanced Random Forest Performance:",
      classification_report(y_test, y_pred_brf),
      sep="\n")

# Easy Ensemble: same procedure with the EasyEnsemble classifier.
eec = EasyEnsembleClassifier(random_state=42, n_estimators=100)
y_pred_eec = eec.fit(X_train, y_train).predict(X_test)
print("Easy Ensemble Classifier Performance:",
      classification_report(y_test, y_pred_eec),
      sep="\n")

# 2. Try Class Weights in Models:
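- Many scikit-learn classifiers, such as RandomForest and LogisticRegression, allow you to set class_weight='balanced', which automatically adjusts the weights inversely proportional to class frequencies.
- For models like XGBoost or LightGBM, you can manually specify scale_pos_weight (for binary problems, typically the number of negative samples divided by the number of positive samples) to handle imbalance. Note that XGBoost's scikit-learn wrapper has no class_weight parameter.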

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest whose class weights are set with class_weight='balanced'.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
y_pred_rf = rf.fit(X_train, y_train).predict(X_test)
print("Random Forest with Balanced Class Weights Performance:",
      classification_report(y_test, y_pred_rf),
      sep="\n")

In [None]:
from xgboost import XGBClassifier

# Example with XGBoost
# scale_pos_weight = (#negatives / #positives), treating class 1 as positive.
# Count once with a vectorised comparison instead of iterating the Series
# twice with the builtin sum().
positive_count = int((y_train == 1).sum())
negative_count = len(y_train) - positive_count
# Without any class-1 sample the ratio is undefined; use XGBoost's neutral
# default of 1.0 rather than dividing by zero.
scale_pos_weight = negative_count / positive_count if positive_count else 1.0
# NOTE(review): scale_pos_weight is a binary-classification setting — confirm
# `target` is binary before relying on it.

# random_state is fixed like every other estimator in this notebook.
xgb = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("XGBoost with Scale_Pos_Weight Performance:")
print(classification_report(y_test, y_pred_xgb))

# 4. Advanced Sampling Techniques like ADASYN:
- ADASYN (Adaptive Synthetic Sampling) is similar to SMOTE but focuses more on generating samples for harder-to-learn minority class samples.
- It can be used when you need a more adaptive approach to handling class imbalance.

In [None]:
from imblearn.over_sampling import ADASYN

# Oversample the training split with ADASYN (default sampling strategy);
# the test split is left untouched.
adasyn = ADASYN(random_state=42)
X_train_balanced, y_train_balanced = adasyn.fit_resample(X=X_train, y=y_train)

ADASYN might be more effective than SMOTE when some minority samples are harder to classify.

# 5. Feature Engineering and Data Augmentation:
Focus on domain-specific feature engineering to create new features that might better differentiate between classes.
Use data augmentation techniques (e.g., synthetic data generation, transformations) if applicable to your domain.

# 6. Deep Learning Models with Custom Loss Functions:
If you have a large dataset, using deep learning models with custom loss functions like Focal Loss can be effective.
Focal Loss focuses more on harder-to-classify samples, making it suitable for imbalanced data.

In [None]:
import tensorflow as tf

def focal_loss(gamma=2., alpha=0.25):
    """Build a focal-loss function with the given focusing parameters.

    The returned callable scales the per-element cross-entropy term
    ``-y_true * log(y_pred)`` by ``alpha * (1 - y_pred) ** gamma`` and
    averages the result over all elements.

    Args:
        gamma: Exponent applied to ``1 - y_pred``; larger values shrink the
            contribution of elements predicted with high probability.
        alpha: Constant factor applied to every element of the loss.

    Returns:
        A function ``focal_loss_fixed(y_true, y_pred)`` returning a scalar
        tensor.
    """
    def focal_loss_fixed(y_true, y_pred):
        # presumably y_true is one-hot and y_pred holds probabilities of the
        # same shape — confirm against the model's output layer.
        y_pred = tf.convert_to_tensor(y_pred)
        # Integer (e.g. one-hot int) labels cannot be multiplied with float
        # predictions, so align y_true with y_pred's dtype first.
        y_true = tf.cast(y_true, y_pred.dtype)
        # Clip away exact 0/1 so the logarithm below stays finite.
        epsilon = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
        # Only positions where y_true is non-zero contribute.
        cross_entropy = -y_true * tf.math.log(y_pred)
        # Down-weight confidently predicted elements.
        weight = alpha * tf.pow(1 - y_pred, gamma)
        loss = weight * cross_entropy
        return tf.reduce_mean(loss)
    return focal_loss_fixed

Focal loss can be used with deep learning frameworks such as TensorFlow (Keras) or PyTorch to better handle class imbalance.

# 7. Stacking or Blending Models:
Combining multiple models using stacking or blending can help capture different aspects of the data distribution.
This might be effective if different models handle different classes better.

# 8. Adjust Evaluation Metrics:
- If the issue is related to the perceived performance based on evaluation metrics, focus on metrics like AUC-ROC, F1-score, Precision-Recall curve, G-mean, or Cohen's Kappa rather than just accuracy.
- These metrics provide better insights into model performance on imbalanced data.


# Conclusion:
- Using ensemble methods like BalancedRandomForest and EasyEnsemble or gradient boosting techniques with class weights often yields significant improvements.
- Techniques like Focal Loss in deep learning, ADASYN, and advanced feature engineering can also offer gains.
- Adjusting the evaluation metric might also reveal improvements that are not reflected in accuracy.
- Consider testing these approaches on your dataset and evaluating the impact using metrics like precision, recall, and F1-score to determine which method provides the best balance between class performance.