# In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.metrics import f1_score, matthews_corrcoef, roc_auc_score
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from scipy.stats import entropy
import warnings
warnings.filterwarnings('ignore')

class ImbalancedDataAnalyzer:
    """Benchmark feature-selection methods and classifiers on an imbalanced dataset.

    Every feature selector is combined with every classifier for each
    requested fraction of features, and F1, MCC and ROC AUC are measured on a
    held-out test split. The metrics assume a binary target: F1 uses
    scikit-learn's default binary averaging and AUC uses the second column of
    ``predict_proba``.

    All feature selectors share one contract: ``selector(X, y, k)`` returns
    ``(X_selected, mask)`` where ``mask`` is a boolean array over the columns
    of ``X`` and ``X_selected`` holds the chosen columns *in their original
    order*, so the same mask can be applied to the test set.
    """

    # Seed shared by every stochastic component so that runs are reproducible.
    RANDOM_STATE = 42

    # Column order of the results table (also used when no run succeeded).
    RESULT_COLUMNS = [
        'Feature_Selection', 'Classifier', 'Feature_Percentage',
        'Imbalance_Ratio', 'F1_Score', 'MCC', 'AUC',
    ]

    def __init__(self):
        # Name -> callable(X, y, k) returning (X_selected, boolean mask).
        self.feature_selectors = {
            'Anova': self._anova_selection,
            'Chi2': self._chi2_selection,
            'InfoGain': self._infogain_selection,
            'InfoGainRatio': self._infogain_ratio_selection,
            'Relief': self._relief_selection,
            'GiniDecrease': self._gini_decrease_selection,
            'FCBF': self._fcbf_selection
        }

        # Name -> unfitted estimator template; analyze_dataset fits clones.
        self.classifiers = {
            'RandomForest': RandomForestClassifier(n_estimators=100, random_state=self.RANDOM_STATE),
            'DecisionTree': DecisionTreeClassifier(random_state=self.RANDOM_STATE),
            'LogisticRegression': LogisticRegression(max_iter=1000, random_state=self.RANDOM_STATE),
            'NeuralNetwork': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000,
                                           random_state=self.RANDOM_STATE),
            'NaiveBayes': GaussianNB(),
            'SVM': SVC(probability=True, random_state=self.RANDOM_STATE)
        }

        self.results_df = None      # results table of the latest analyze_dataset call
        self.best_model = None      # fitted estimator with the highest F1 in the latest run
        self.best_selector = None   # (selector callable, boolean mask) paired with best_model

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _apply_mask(X, mask):
        """Return the columns of ``X`` flagged by ``mask``, keeping their order."""
        if hasattr(X, 'iloc'):
            return X.iloc[:, mask]
        return np.asarray(X)[:, mask]

    @classmethod
    def _select_top_k(cls, X, scores, k):
        """Keep the ``k`` highest-scoring columns of ``X``.

        NaN scores are ranked last. Returns ``(X_selected, mask)``.
        """
        scores = np.asarray(scores, dtype=float)
        scores = np.where(np.isnan(scores), -np.inf, scores)
        n_features = scores.shape[0]
        k = int(min(max(k, 0), n_features))
        mask = np.zeros(n_features, dtype=bool)
        # Slice from an explicit start index: ``[-k:]`` would select
        # everything when k == 0.
        mask[np.argsort(scores)[n_features - k:]] = True
        return cls._apply_mask(X, mask), mask

    @staticmethod
    def _label_entropy(labels):
        """Shannon entropy (in nats) of the empirical label distribution."""
        _, counts = np.unique(np.asarray(labels), return_counts=True)
        return float(entropy(counts))

    @staticmethod
    def _binned_entropy(values, bins=10):
        """Shannon entropy (in nats) of ``values`` after quantile binning.

        Values are assigned to at most ``bins`` right-closed quantile bins;
        duplicate bin edges are merged. A constant or empty input has zero
        entropy.
        """
        values = np.asarray(values, dtype=float)
        values = values[~np.isnan(values)]
        if values.size == 0:
            return 0.0
        edges = np.unique(np.quantile(values, np.linspace(0.0, 1.0, bins + 1)))
        if edges.size < 2:
            return 0.0
        # Bin index = number of interior edges strictly below the value.
        codes = np.searchsorted(edges[1:-1], values, side='left')
        counts = np.bincount(codes)
        # entropy() expects a distribution (here: bin counts), not the
        # per-sample bin codes.
        return float(entropy(counts[counts > 0]))

    def _mutual_info(self, X, y):
        """Seeded mutual information between each feature and the target."""
        return mutual_info_classif(X, y, random_state=self.RANDOM_STATE)

    # ------------------------------------------------------------------
    # Feature selectors
    # ------------------------------------------------------------------
    def _anova_selection(self, X, y, k):
        """Select ``k`` features by ANOVA F-value."""
        selector = SelectKBest(score_func=f_classif, k=k)
        selector.fit(X, y)
        mask = selector.get_support()
        return self._apply_mask(X, mask), mask

    def _chi2_selection(self, X, y, k):
        """Select ``k`` features by chi-squared statistic.

        The statistic needs non-negative inputs, so it is computed on a
        shifted copy; the returned columns are the *unshifted* ones so that
        training and test data stay on the same scale.
        """
        shifted = X - X.min(axis=0) + 1e-6
        selector = SelectKBest(score_func=chi2, k=k)
        selector.fit(shifted, y)
        mask = selector.get_support()
        return self._apply_mask(X, mask), mask

    def _infogain_selection(self, X, y, k):
        """Select ``k`` features by mutual information with the target."""
        selector = SelectKBest(score_func=self._mutual_info, k=k)
        selector.fit(X, y)
        mask = selector.get_support()
        return self._apply_mask(X, mask), mask

    def _infogain_ratio_selection(self, X, y, k):
        """Select ``k`` features by mutual information / intrinsic value.

        The intrinsic value of a feature is the entropy of its quantile-binned
        distribution. Features with zero intrinsic value (constant columns)
        score zero instead of being divided by a tiny epsilon.
        """
        mi = self._mutual_info(X, y)
        columns = np.asarray(X, dtype=float).T
        intrinsic = np.array([self._binned_entropy(col) for col in columns])
        scores = np.divide(mi, intrinsic, out=np.zeros_like(mi), where=intrinsic > 0)
        return self._select_top_k(X, scores, k)

    def _relief_selection(self, X, y, k):
        """Select ``k`` features by Relief weights.

        For every sample the weight vector is increased by the per-feature
        distance to its nearest miss (closest sample of another class) and
        decreased by the distance to its nearest hit (closest *other* sample
        of the same class). Quadratic in the number of samples.
        """
        values = np.asarray(X, dtype=float)
        labels = np.asarray(y)  # positional access, regardless of a Series index
        n_samples, n_features = values.shape
        weights = np.zeros(n_features)

        for i in range(n_samples):
            sq_dist = np.sum((values - values[i]) ** 2, axis=1)
            same_class = labels == labels[i]

            hit_dist = np.where(same_class, sq_dist, np.inf)
            hit_dist[i] = np.inf  # a sample must not be its own nearest hit
            miss_dist = np.where(same_class, np.inf, sq_dist)

            # Skip a term when no candidate exists (single-member class or
            # single-class data) instead of failing on an empty argmin.
            if np.isfinite(miss_dist).any():
                weights += np.abs(values[i] - values[np.argmin(miss_dist)])
            if np.isfinite(hit_dist).any():
                weights -= np.abs(values[i] - values[np.argmin(hit_dist)])

        scores = weights / n_samples if n_samples else weights
        return self._select_top_k(X, scores, k)

    def _gini_decrease_selection(self, X, y, k):
        """Select ``k`` features by ExtraTrees impurity-based importance."""
        forest = ExtraTreesClassifier(n_estimators=100, random_state=self.RANDOM_STATE)
        forest.fit(X, y)
        return self._select_top_k(X, forest.feature_importances_, k)

    def _fcbf_selection(self, X, y, k):
        """Select ``k`` features by symmetrical uncertainty (SU) with the class.

        SU = 2 * MI(feature, class) / (H(feature) + H(class)). Only the SU
        ranking is performed; no redundancy elimination between features is
        done.
        """
        labels = np.asarray(y)
        mi = self._mutual_info(X, labels)
        class_entropy = self._label_entropy(labels)
        columns = np.asarray(X, dtype=float).T
        feature_entropy = np.array([self._binned_entropy(col) for col in columns])

        valid = (feature_entropy > 0) & (class_entropy > 0)
        su_scores = np.divide(2.0 * mi, feature_entropy + class_entropy,
                              out=np.zeros_like(mi), where=valid)
        return self._select_top_k(X, su_scores, k)

    # ------------------------------------------------------------------
    # Evaluation
    # ------------------------------------------------------------------
    def evaluate_model(self, y_true, y_pred, y_prob):
        """Compute F1, MCC and ROC AUC for one set of predictions.

        Args:
            y_true: True labels.
            y_pred: Predicted labels.
            y_prob: Class-probability matrix; column 1 is used as the score
                for AUC.

        Returns:
            dict with keys ``'F1_Score'``, ``'MCC'`` and ``'AUC'``.
        """
        return {
            'F1_Score': f1_score(y_true, y_pred),
            'MCC': matthews_corrcoef(y_true, y_pred),
            'AUC': roc_auc_score(y_true, y_prob[:, 1])
        }

    def analyze_dataset(self, X, y, feature_percentages=(0.75, 0.5)):
        """Evaluate every selector/classifier pair on a train/test split of ``(X, y)``.

        Args:
            X: pandas DataFrame of features.
            y: Target labels aligned with the rows of ``X``.
            feature_percentages: Fractions of the features to keep; each is
                converted to a feature count between 1 and ``X.shape[1]``.

        Returns:
            DataFrame with one row per successful configuration and the
            columns listed in ``RESULT_COLUMNS``. It is also stored in
            ``self.results_df``; ``self.best_model`` / ``self.best_selector``
            hold the configuration with the highest F1 of this call.
        """
        # Local import keeps this method self-contained with respect to the
        # file's import block.
        from sklearn.base import clone

        results = []
        n_features = X.shape[1]

        # Do not carry a "best" configuration over from a previous dataset.
        self.best_model = None
        self.best_selector = None

        # Calculate imbalance ratio
        class_labels, class_counts = np.unique(np.asarray(y), return_counts=True)
        imbalance_ratio = class_counts.max() / class_counts.min()

        print(f"Dataset Information:")
        print(f"Number of features: {n_features}")
        print(f"Number of samples: {len(y)}")
        print(f"Class distribution: {dict(zip(class_labels.tolist(), class_counts.tolist()))}")
        print(f"Imbalance ratio: {imbalance_ratio:.2f}")

        # Split and scale data. Stratify so that the minority class is present
        # in both splits; stratification needs at least two samples per class.
        stratify = y if class_counts.min() >= 2 else None
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=self.RANDOM_STATE, stratify=stratify)
        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
        X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        best_f1 = float('-inf')

        for percentage in feature_percentages:
            k = min(n_features, max(1, int(n_features * percentage)))
            print(f"\nAnalyzing with {percentage*100}% of features ({k} features)")

            for fs_name, fs_func in self.feature_selectors.items():
                print(f"\nApplying {fs_name} feature selection...")
                # Best-effort benchmark: a failing selector is reported and
                # skipped rather than aborting the whole run.
                try:
                    X_train_selected, feature_mask = fs_func(X_train_scaled, y_train, k)
                except Exception as e:
                    print(f"Error with {fs_name} feature selection: {e}")
                    continue
                X_test_selected = X_test_scaled.iloc[:, feature_mask]

                for clf_name, clf in self.classifiers.items():
                    print(f"Training {clf_name}...", end=' ')
                    # A failing classifier only skips its own configuration.
                    try:
                        # Fit a fresh clone so that a stored best model is not
                        # overwritten when the same template is fitted again.
                        model = clone(clf)
                        model.fit(X_train_selected, y_train)
                        y_pred = model.predict(X_test_selected)
                        y_prob = model.predict_proba(X_test_selected)
                        metrics = self.evaluate_model(y_test, y_pred, y_prob)
                    except Exception as e:
                        print(f"Error with {fs_name} and {clf_name}: {e}")
                        continue

                    # Track best model
                    if metrics['F1_Score'] > best_f1:
                        best_f1 = metrics['F1_Score']
                        self.best_model = model
                        self.best_selector = (fs_func, feature_mask)

                    results.append({
                        'Feature_Selection': fs_name,
                        'Classifier': clf_name,
                        'Feature_Percentage': percentage,
                        'Imbalance_Ratio': imbalance_ratio,
                        **metrics
                    })
                    print(f"F1: {metrics['F1_Score']:.3f}, MCC: {metrics['MCC']:.3f}, AUC: {metrics['AUC']:.3f}")

        # Explicit columns keep the table well-formed even with zero rows.
        self.results_df = pd.DataFrame(results, columns=self.RESULT_COLUMNS)
        return self.results_df

    def plot_results(self):
        """Plot a 2x2 summary of the latest results.

        Returns:
            The matplotlib figure, or ``None`` (after printing a notice) when
            there are no results to plot.
        """
        if self.results_df is None or self.results_df.empty:
            print("No results available. Please run analyze_dataset first.")
            return None

        # Create figure with subplots
        fig, axes = plt.subplots(2, 2, figsize=(20, 16))

        # Plots 1-3: F1 distribution per selector, classifier and percentage.
        boxplots = [
            (axes[0, 0], 'Feature_Selection', 'Feature Selection Methods Comparison', True),
            (axes[0, 1], 'Classifier', 'Classifier Comparison', True),
            (axes[1, 0], 'Feature_Percentage', 'Feature Percentage Comparison', False),
        ]
        for ax, column, title, rotate_labels in boxplots:
            sns.boxplot(data=self.results_df, x=column, y='F1_Score', ax=ax)
            ax.set_title(title)
            if rotate_labels:
                ax.tick_params(axis='x', labelrotation=45)

        # Plot 4: Metrics Comparison
        metrics_df = self.results_df.groupby('Classifier')[['F1_Score', 'MCC', 'AUC']].mean()
        metrics_df.plot(kind='bar', ax=axes[1, 1], rot=45)
        axes[1, 1].set_title('Average Metrics by Classifier')

        fig.tight_layout()
        return fig

def main():
    """Benchmark three synthetic imbalanced datasets and save the outputs.

    For each dataset the analysis is run, the summary figure is saved as a
    PNG, the results table as a CSV, and the five configurations with the
    highest F1 score are printed. Datasets for which no configuration
    completed are reported and skipped.
    """
    # Imported locally so that main() also works when this module is imported
    # rather than executed as a script.
    from sklearn.datasets import make_classification

    # (name, make_classification keyword arguments) for each synthetic dataset.
    dataset_specs = [
        # Dataset 1: Moderate imbalance (1:3)
        ('Dataset 1 (1:3)', dict(n_samples=846, n_features=18, weights=[0.75, 0.25])),
        # Dataset 2: High imbalance (1:8)
        ('Dataset 2 (1:8)', dict(n_samples=1484, n_features=8, weights=[0.89, 0.11])),
        # Dataset 3: Very high imbalance (1:15)
        ('Dataset 3 (1:15)', dict(n_samples=214, n_features=9, weights=[0.94, 0.06])),
    ]

    datasets = []
    for name, params in dataset_specs:
        features, target = make_classification(n_classes=2, random_state=42, **params)
        datasets.append((name, pd.DataFrame(features), target))

    analyzer = ImbalancedDataAnalyzer()
    summary_columns = ['Feature_Selection', 'Classifier',
                       'Feature_Percentage', 'F1_Score', 'MCC', 'AUC']

    # Analyze each dataset
    for dataset_name, X, y in datasets:
        print(f"\n{'='*50}")
        print(f"Analyzing {dataset_name}")
        print('='*50)

        # Run analysis
        results = analyzer.analyze_dataset(X, y)
        if results.empty:
            # Nothing to plot, save or rank.
            print("No configurations completed successfully; skipping outputs.")
            continue

        # ':' is not allowed in file names on Windows, so it is replaced too.
        file_stem = dataset_name.replace(" ", "_").replace(":", "-")

        # Plot results; save through the figure itself rather than relying on
        # pyplot's implicit "current figure".
        fig = analyzer.plot_results()
        if fig is not None:
            fig.savefig(f'{file_stem}_results.png')
            plt.close(fig)

        # Save results to CSV
        results.to_csv(f'{file_stem}_results.csv', index=False)

        # Print summary
        print("\nTop 5 Configurations:")
        print(results.nlargest(5, 'F1_Score')[summary_columns])

def main():
    """Benchmark three CSV datasets and save the outputs.

    Each CSV is read, split into features and target, analysed, and its
    summary figure (PNG), results table (CSV) and top five configurations by
    F1 score are produced. The paths and the target column name below are
    placeholders that must be replaced before running.

    NOTE(review): this definition comes after the synthetic-data ``main``
    defined earlier in the file and therefore replaces it; confirm which
    entry point is intended.
    """
    target_column = 'target_column'  # Replace 'target_column' with your target column name

    # (name, CSV path) for each dataset to load.
    dataset_paths = [
        ('Dataset 1', '../input/your-dataset-folder/dataset1.csv'),  # Replace with your path
        ('Dataset 2', '../input/your-dataset-folder/dataset2.csv'),  # Replace with your path
        ('Dataset 3', '../input/your-dataset-folder/dataset3.csv'),  # Replace with your path
    ]

    # Load datasets
    datasets = []
    for name, path in dataset_paths:
        frame = pd.read_csv(path)
        datasets.append((name, frame.drop(target_column, axis=1), frame[target_column]))

    analyzer = ImbalancedDataAnalyzer()
    summary_columns = ['Feature_Selection', 'Classifier',
                       'Feature_Percentage', 'F1_Score', 'MCC', 'AUC']

    # Analyze each dataset
    for dataset_name, X, y in datasets:
        print(f"\n{'='*50}")
        print(f"Analyzing {dataset_name}")
        print('='*50)

        # Run analysis
        results = analyzer.analyze_dataset(X, y)
        if results.empty:
            # Nothing to plot, save or rank.
            print("No configurations completed successfully; skipping outputs.")
            continue

        file_stem = dataset_name.replace(" ", "_")

        # Plot results; save through the figure itself rather than relying on
        # pyplot's implicit "current figure".
        fig = analyzer.plot_results()
        if fig is not None:
            fig.savefig(f'{file_stem}_results.png')
            plt.close(fig)

        # Save results to CSV
        results.to_csv(f'{file_stem}_results.csv', index=False)

        # Print summary
        print("\nTop 5 Configurations:")
        print(results.nlargest(5, 'F1_Score')[summary_columns])


if __name__ == "__main__":
    # Kept from the original guard: the CSV workflow does not use it, but the
    # synthetic-data main defined earlier in the file expects this name.
    from sklearn.datasets import make_classification
    main()