In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import matplotlib
%matplotlib inline
from scipy import stats
from datetime import datetime 
from tqdm import tqdm

import openpyxl
import os
import shap

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from imblearn.over_sampling import ADASYN, SMOTE, BorderlineSMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, multilabel_confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer
from sklearn.metrics import ConfusionMatrixDisplay, matthews_corrcoef

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

from sklearn.feature_selection import chi2, SelectKBest, f_classif, RFE

from sklearn.semi_supervised import SelfTrainingClassifier, LabelSpreading, LabelPropagation

pd.set_option('display.max_columns', None)

In [54]:
# import pkg_resources

# def create_requirements_file(output_file='requirements.txt'):
#     # Get the list of all installed packages
#     installed_packages = pkg_resources.working_set

#     # Write the package names and versions to the output file
#     with open(output_file, 'w') as f:
#         for package in installed_packages:
#             f.write(f"{package.project_name}=={package.version}\n")

#     print(f"All dependencies have been written to {output_file}")

# # Call the function to create the requirements file
# create_requirements_file()

# Feature engineering 

In [None]:
def separate_columns(df):
    """Partition the column names of ``df`` into categorical and numerical.

    A column is reported as categorical when its dtype compares equal to
    ``'object'`` or ``'category'``, or when ``nunique()`` is at most 10.
    Every other column is reported as numerical.

    Returns:
        tuple[list, list]: ``(categorical_cols, numerical_cols)``, each in the
        original column order.
    """
    def _looks_categorical(series):
        return series.dtype in ('object', 'category') or series.nunique() <= 10

    # Evaluate the rule once per column, then split the names on the verdict.
    verdicts = [(name, _looks_categorical(df[name])) for name in df.columns]
    categorical_cols = [name for name, is_categorical in verdicts if is_categorical]
    numerical_cols = [name for name, is_categorical in verdicts if not is_categorical]

    return categorical_cols, numerical_cols

# Split the working frame's columns into categorical and numerical name lists.
# NOTE(review): `df` is not created in any cell shown above - load it before running this cell.
categorical_cols, numerical_cols = separate_columns(df)  # replace `df` with the DataFrame of choice
print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)


# Short aliases bound to the same list objects as the names above.
num_col = numerical_cols

cat_col = categorical_cols

In [None]:
def display_value_counts(df, columns=None):
    """Print the value counts of each categorical column of ``df``.

    Args:
        df (pd.DataFrame): Frame whose columns are summarised.
        columns (iterable, optional): Column names to report. When omitted,
            the notebook-level ``categorical_cols`` list is used, which keeps
            the original one-argument call working.

    Returns:
        None. The counts are written to stdout.
    """
    # Only fall back to the notebook global when no columns are given, so the
    # function can be used on a frame other than the one `categorical_cols`
    # was computed from.
    categorical_feat = categorical_cols if columns is None else list(columns)

    for col in categorical_feat:
        # A stale column list must not abort the whole report with a KeyError.
        if col not in df.columns:
            print(f"Column '{col}' not found in the DataFrame, skipping.")
            continue
        print(f"Value counts for column: {col}")
        print(df[col].value_counts())
        print("\n")


# Print the per-column value counts; replace `df` with the DataFrame of choice.
display_value_counts(df)

####  Imputation strategies 

In [65]:
# Maps an imputation strategy name to the list of column names that should be
# imputed with it. The single string in each list is a placeholder: replace it
# with the real column names before running the imputation cells.
Imputation_data_dictionary = {
    "Median": ["List the features you want to impute using median strategy"
    ],
    "Frequent": ['List the features you want to impute using mode strategy'
    ],
    "MICE": ["List the features you want to impute using MICE strategy"
    ],
    "KNN": ['List the features you want to impute using KNN strategy'
    ]
}

In [66]:
# Settings for KNN imputation. Both lists hold placeholder strings that must be
# replaced with real column names:
#   - "features_for_imputation": columns used as predictors for the KNN imputer
#   - "columns_to_impute": columns whose missing values KNN may fill
Knn_Imputation_dictionary = {
    "features_for_imputation": ["independent features for KNN"
    ],
    "columns_to_impute": [ 'the column that need imputaion using knn'
    ]
}

In [67]:
def null_values_summary(df):
    """Summarise missing values per column and plot the null percentages.

    For every column of ``df`` the summary holds the null count, the null
    percentage, the non-null ("fill") count, the number of distinct values
    and, when there are at most 10 distinct values, the array returned by
    ``unique()`` (otherwise the string ``"N/A"``). The ``Type`` column is
    created empty (``None``) for the caller to fill in.

    Args:
        df (pd.DataFrame): Frame to profile.

    Returns:
        pd.DataFrame: One row per column of ``df``, indexed by ``'Column'``.
        A horizontal bar chart of ``'Null Percentage'`` is shown as a side
        effect whenever the summary has at least one row.
    """
    n_rows = len(df)

    # Collect plain records and build the frame once: concatenating inside the
    # loop is quadratic and leaves every column with object dtype.
    records = []
    for col in df.columns:
        null_count = df[col].isnull().sum()
        distinct_count = df[col].nunique()
        records.append({
            'Column': col,
            'Null Count': null_count,
            # A zero-row frame has no nulls; avoid dividing by zero.
            'Null Percentage': (null_count / n_rows) * 100 if n_rows else 0.0,
            'Fill count': df[col].count(),
            'Distinct value count': distinct_count,
            'Unique values': df[col].unique() if distinct_count <= 10 else "N/A",
            'Type': None,
        })

    summary = pd.DataFrame(
        records,
        columns=['Column', 'Null Count', 'Null Percentage', 'Fill count',
                 'Distinct value count', 'Unique values', 'Type'],
    ).set_index('Column')

    # Plotting an empty Series raises, so only draw when there is data.
    if not summary.empty:
        summary['Null Percentage'].plot(kind='barh', figsize=(10, 8), color='skyblue')
        plt.xlabel('Percentage of Null Values')
        plt.title('Percentage of Null Values by Column')
        plt.show()

    return summary

In [None]:
# Profile missing values of the working frame (also shows the bar chart).
null_eda_df = null_values_summary(df)

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor

def impute_df(df, Imputation_data_dictionary, Knn_Imputation_dictionary):
    """
    Impute missing values in the DataFrame based on the specified rules.

    Each column that contains missing values is filled with the first strategy
    whose list names it, checked in this order: Median, KNN, Frequent, MICE.
    Columns without missing values, or without a configured strategy, are left
    untouched (a message is printed either way).

    Args:
        df (pd.DataFrame): Input DataFrame. It is modified in place, so pass a
            copy if the original must be preserved.
        Imputation_data_dictionary (dict): Maps a strategy name ('Median',
            'Frequent', 'MICE', 'KNN' or the legacy 'KNN_Imputer') to the list
            of columns imputed with that strategy. Missing keys are treated
            as empty lists.
        Knn_Imputation_dictionary (dict): KNN settings with the keys
            'features_for_imputation' (predictor columns) and
            'columns_to_impute' (columns KNN is allowed to fill).

    Returns:
        pd.DataFrame: The same DataFrame object, with imputed values.
    """
    # Extract features for imputation and columns to impute from the dictionary
    features_for_imputation = list(Knn_Imputation_dictionary.get("features_for_imputation", []))
    columns_to_impute = Knn_Imputation_dictionary.get("columns_to_impute", [])

    median_cols = Imputation_data_dictionary.get('Median', [])
    frequent_cols = Imputation_data_dictionary.get('Frequent', [])
    mice_cols = Imputation_data_dictionary.get('MICE', [])
    # The notebook's dictionary stores the KNN list under 'KNN', while this
    # function used to look up 'KNN_Imputer' and failed with a KeyError.
    # Accept both spellings.
    knn_cols = (list(Imputation_data_dictionary.get('KNN', []))
                + list(Imputation_data_dictionary.get('KNN_Imputer', [])))

    knn_imputer = KNNImputer()
    median_imputer = SimpleImputer(strategy='median')
    mode_imputer = SimpleImputer(strategy='most_frequent')
    # Iterative (MICE-style) imputer driven by a random-forest regressor.
    iterative_imputer = IterativeImputer(estimator=RandomForestRegressor())

    # MICE is fitted once, lazily, on all usable numeric columns and the result
    # is reused for every MICE column.
    mice_columns = None
    mice_values = None

    for col in df.columns:
        if not df[col].isna().any():
            print(f"Column '{col}' has no missing values, no imputation needed.")
            continue

        print(f"Column '{col}' has missing values.")
        if col in median_cols:
            print(f"Imputing {col} using Median strategy.")
            # fit_transform returns an (n, 1) array; flatten it for the column.
            df[col] = median_imputer.fit_transform(df[[col]]).ravel()
        elif col in knn_cols:
            if col in columns_to_impute:
                print(f"Imputing {col} using KNN strategy with features: {features_for_imputation}")
                # Keep the target out of its own predictor list so the frame
                # handed to the imputer has no duplicated column.
                predictors = [name for name in features_for_imputation if name != col]
                imputed_data = knn_imputer.fit_transform(df[predictors + [col]])
                # The target was appended last, so it is the last output column.
                df[col] = imputed_data[:, -1]
            else:
                print(f"Skipping {col}: it is not listed in 'columns_to_impute' for KNN.")
        elif col in frequent_cols:
            print(f"Imputing {col} using Mode Imputer.")
            df[col] = mode_imputer.fit_transform(df[[col]]).ravel()
        elif col in mice_cols:
            print(f"Imputing {col} using MICE Imputer.")
            if mice_values is None:
                # With a single feature IterativeImputer has nothing to regress
                # on and only returns its initial fill, so fit it on every
                # numeric column. All-NaN columns are left out because the
                # imputer would drop them and shift the column positions.
                mice_columns = [
                    name for name in df.select_dtypes(include='number').columns
                    if df[name].notna().any()
                ]
                if mice_columns:
                    mice_values = iterative_imputer.fit_transform(df[mice_columns])
            if mice_values is None or col not in mice_columns:
                print(f"Skipping {col}: MICE needs a numeric column with at least one observed value.")
            else:
                df[col] = mice_values[:, mice_columns.index(col)]
        else:
            print(f"No imputation strategy configured for column '{col}'; left unchanged.")

    return df

# Example usage: impute a copy so the source frame `df` keeps its missing values.
df_pre_imp = df.copy()  # replace `df` with the DataFrame of choice
imputed_df = impute_df(df_pre_imp, Imputation_data_dictionary, Knn_Imputation_dictionary) 

In [None]:
# Re-run the null profile on the imputed frame to compare with the earlier summary.
null_eda_df_1 = null_values_summary(imputed_df)

In [None]:
# Descriptive statistics after imputation.
imputed_df.describe()

In [None]:
# Descriptive statistics of the source frame `df`, for comparison with the imputed frame.
df.describe()

In [77]:
# Working copy used by the model-search cells below.
# NOTE(review): this copies `df`, not `imputed_df`, so the imputation performed
# above is not carried forward - confirm that is intended.
df_test = df.copy()

### Exhaustive pipeline to select the best model, transformation, and outlier-handling technique

In [81]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler, PowerTransformer, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from scipy.stats import zscore, iqr, skew
from tqdm import tqdm

# Function to handle outliers on a per-column basis
def handle_outliers_columnwise(X, method='zscore', threshold=3):
    """Drop rows flagged as outliers, one numeric column at a time.

    Rows are filtered cumulatively: each column is examined on the rows that
    survived the previous columns. Non-numeric columns are ignored, and a
    missing value never marks its row as an outlier, so missing values can be
    left for a later imputation step.

    Args:
        X (pd.DataFrame): Feature frame. It is not modified.
        method (str): One of
            'zscore' - drop rows with ``|z| >= threshold``, where z uses the
                population standard deviation of the surviving rows;
            'iqr' - drop rows outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``, with
                the quartiles taken from the unfiltered ``X``;
            'isolation_forest' - drop rows that a single-column
                IsolationForest (contamination 0.1) labels -1.
        threshold (float): z-score cut-off; only used by 'zscore'.

    Returns:
        pd.DataFrame: Filtered copy of ``X`` that keeps the original index
        labels of the surviving rows.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ('zscore', 'iqr', 'isolation_forest'):
        raise ValueError(
            f"Unknown outlier method {method!r}; expected 'zscore', 'iqr' or 'isolation_forest'."
        )

    X_outliers_handled = X.copy()
    for column in X.select_dtypes(include='number').columns:
        values = X_outliers_handled[column]
        missing = values.isna().to_numpy()

        if method == 'zscore':
            # pandas skips NaN here; a single NaN must not turn every z-score
            # into NaN and thereby drop all rows.
            spread = values.std(ddof=0)
            if not spread > 0:
                # Constant, empty or all-NaN column: nothing can be an outlier.
                continue
            inlier = (((values - values.mean()) / spread).abs() < threshold).to_numpy()
        elif method == 'iqr':
            q1 = X[column].quantile(0.25)
            q3 = X[column].quantile(0.75)
            fence = 1.5 * (q3 - q1)
            inlier = values.between(q1 - fence, q3 + fence).to_numpy()
        else:
            if missing.all():
                continue
            # IsolationForest rejects NaN, so fit it on the observed rows only.
            iso = IsolationForest(contamination=0.1, random_state=42)
            yhat = iso.fit_predict(values[~missing].to_frame())
            inlier = np.zeros(len(values), dtype=bool)
            inlier[~missing] = yhat != -1

        X_outliers_handled = X_outliers_handled[inlier | missing]
    return X_outliers_handled

# Function to determine the best transformation for each column based on skewness
def get_best_transformations(X):
    """Choose a skew-reducing transformer for every column of ``X``.

    The skewness of each column is measured on its non-missing values:
      * skew > 1: Box-Cox when every observed value is strictly positive,
        otherwise Yeo-Johnson;
      * 0.5 < skew <= 1: square root when every observed value is
        non-negative, otherwise Yeo-Johnson (the square root of a negative
        number is NaN);
      * anything else, including undefined skewness: identity.

    Args:
        X (pd.DataFrame): Numeric feature frame the decisions are based on.

    Returns:
        tuple[list, dict]:
            transformations - ``(name, transformer, [column_position])``
            triples, one per column, in the form ColumnTransformer accepts;
            best_transformations - column name -> human-readable label.
    """
    transformations = []
    best_transformations = {}
    for i, column in enumerate(X.columns):
        # scipy's skew() propagates NaN, which would silently send every
        # column with a missing value to the identity branch.
        observed = X[column].dropna()
        col_skew = skew(observed)

        if col_skew > 1:
            # NOTE(review): positivity is only checked on the data given here;
            # Box-Cox raises later if unseen data holds a non-positive value.
            if (observed > 0).all():
                transformations.append((f'{column}_boxcox', PowerTransformer(method='box-cox'), [i]))
                best_transformations[column] = 'Box-Cox'
            else:
                transformations.append((f'{column}_yeojohnson', PowerTransformer(method='yeo-johnson'), [i]))
                best_transformations[column] = 'Yeo-Johnson'
        elif 0.5 < col_skew <= 1:
            if (observed >= 0).all():
                transformations.append((f'{column}_sqrt', FunctionTransformer(np.sqrt), [i]))
                best_transformations[column] = 'Square Root'
            else:
                transformations.append((f'{column}_yeojohnson', PowerTransformer(method='yeo-johnson'), [i]))
                best_transformations[column] = 'Yeo-Johnson'
        else:
            transformations.append((f'{column}_none', FunctionTransformer(None), [i]))
            best_transformations[column] = 'None'
    return transformations, best_transformations

# Define models to be used
# Keys are display names; the search loop stores the winning key in best_results['model'].
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='mlogloss')
}

# Define outlier handling techniques
# Each name is passed as `method` to handle_outliers_columnwise.
outlier_methods = ['zscore', 'iqr', 'isolation_forest']

# Define feature selection methods
# Both selectors keep 10 features.
feature_selectors = {
    'RFE': RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=10),
    'SelectKBest': SelectKBest(score_func=f_classif, k=10),
}

# Store the best results
# 'f1_score' starts at 0, so only a combination scoring above 0 is recorded.
best_results = {
    'model': None,
    'f1_score': 0,
    'outlier_method': None,
    'transformation': None,
    'features': None,
    'best_transformations': None,  # To store the best transformations per column
}

In [82]:
# Data splitting
X = df_test.drop(columns=['Target', 'party_id','rank_1'])
y = df_test['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Get column-specific transformations based on the training data
transformations, best_transformations = get_best_transformations(X_train)

# Create the column transformer using column indices
column_transformer = ColumnTransformer(
    transformers=transformations,
    remainder='passthrough'
)


def _build_search_pipeline(feature_selector, model):
    """Assemble the impute -> transform -> scale -> select -> model pipeline."""
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
        ('transformer', column_transformer),  # Apply column-specific transformations
        ('scaler', StandardScaler()),  # Standardize features
        ('feature_selection', feature_selector),  # Feature selection
        ('model', model)  # Model training
    ])


# Total number of combinations
total_combinations = len(outlier_methods) * len(feature_selectors) * len(models)

# Iterate through all combinations of outlier methods, feature selection, and models.
# The context manager closes the progress bar even if a fit raises.
with tqdm(total=total_combinations, desc="Processing Combinations") as progress_bar:
    for outlier_method in outlier_methods:
        # Handle outliers (only the training rows are ever filtered)
        X_train_outliers_handled = handle_outliers_columnwise(X_train.copy(), method=outlier_method)
        # Take the labels from the training split, matching the surviving rows.
        y_train_outliers_handled = y_train.loc[X_train_outliers_handled.index]

        for feature_selector_name, feature_selector in feature_selectors.items():
            for model_name, model in models.items():
                pipeline = _build_search_pipeline(feature_selector, model)

                # Train and evaluate using cross-validation
                f1_scores = cross_val_score(pipeline, X_train_outliers_handled, y_train_outliers_handled, cv=3, scoring='f1_macro')
                mean_f1_score = np.mean(f1_scores)

                # Check if this is the best model so far. A NaN mean (at least
                # one fold failed to fit) never compares greater, so such
                # combinations are skipped.
                if mean_f1_score > best_results['f1_score']:
                    best_results['model'] = model_name
                    best_results['f1_score'] = mean_f1_score
                    best_results['outlier_method'] = outlier_method
                    best_results['transformation'] = 'Column-Specific'
                    best_results['features'] = feature_selector_name
                    best_results['best_transformations'] = best_transformations

                # Update the progress bar
                progress_bar.update(1)

# Fail with a clear message instead of an AttributeError on None further down.
if best_results['model'] is None:
    raise RuntimeError(
        "No combination produced a macro-F1 above 0; every cross-validation run "
        "either failed or scored 0. Check the input data and the pipeline steps."
    )

# Print the best results
print(f"Best Model: {best_results['model']}")
print(f"Best F1 Score: {best_results['f1_score']}")
print(f"Best Outlier Handling Method: {best_results['outlier_method']}")
print(f"Best Transformation: {best_results['transformation']}")
print(f"Best Feature Selection Method: {best_results['features']}")

# Print the best transformations for each column
print("\nBest Transformations for Each Column:")
for column, transformation in best_results['best_transformations'].items():
    print(f"Column: {column}, Transformation: {transformation}")

# Final model training on the entire training set and evaluation on the test set
final_pipeline = _build_search_pipeline(
    feature_selectors[best_results['features']],
    models[best_results['model']],
)

# Handle outliers in the full training set
X_train_outliers_handled = handle_outliers_columnwise(X_train.copy(), method=best_results['outlier_method'])
y_train_outliers_handled = y_train.loc[X_train_outliers_handled.index]

# Fit the final model
final_pipeline.fit(X_train_outliers_handled, y_train_outliers_handled)

# Predict and evaluate on the test set (the test rows are never outlier-filtered)
y_pred_test = final_pipeline.predict(X_test)
print("\nFinal Model Classification Report on Test Set:")
print(classification_report(y_test, y_pred_test))

### Checking outlier count 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
 
def count_outliers(df, columns=None):
    """Count outliers per column using a median +/- 3 * MAD rule.

    A column is examined when it is int64/float64, is not 'party_id', has
    more than 11 distinct values and is named in ``columns``. A value is an
    outlier when it lies outside ``median +/- 3 * MAD``, where MAD is the
    unscaled median absolute deviation. Missing values are ignored when the
    median and MAD are computed and are never counted as outliers.

    Args:
        df (pd.DataFrame): Frame to inspect. It is not modified.
        columns (iterable, optional): Names of the columns to consider. When
            omitted, the notebook-level ``Imputation_data_dictionary['Median']``
            list is used, which keeps the original one-argument call working.

    Returns:
        dict: Column name -> number of outlying rows.
    """
    if columns is None:
        columns = Imputation_data_dictionary['Median']

    outlier_counts = {}

    for col in df.select_dtypes(include=[np.number]).columns:
        if df[col].dtype in ['int64', 'float64'] and col != 'party_id' and df[col].nunique() > 11 and col in columns:
            median = df[col].median()
            # Computed with pandas so NaNs are skipped; a NaN-propagating MAD
            # would give NaN bounds and silently report zero outliers.
            mad = (df[col] - median).abs().median()
            lower_bound = median - 3 * mad
            upper_bound = median + 3 * mad

            is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
            outlier_counts[col] = int(is_outlier.sum())

    return outlier_counts
 
def plot_outlier_counts(outlier_counts):
    """Show a bar chart with one bar per column of ``outlier_counts``.

    Args:
        outlier_counts (dict): Column name -> number of outliers.
    """
    labels = list(outlier_counts.keys())
    heights = list(outlier_counts.values())

    plt.figure(figsize=(10, 6))
    plt.bar(labels, heights, color='skyblue')

    # Titles and axis labels
    plt.title('Count of Outliers in Each Column')
    plt.xlabel('Columns')
    plt.ylabel('Number of Outliers')

    # Column names are rotated to stand vertically under their bars.
    plt.xticks(rotation=90)
    plt.show()
 
# # Example usage
# data = {
#     'A': [1, 2, 2, 3, 4, 100, 6, 7, 8, 9],
#     'B': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
#     'C': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
# }
# df = pd.DataFrame(data)
 
# Count and plot outliers on a copy of `df_xfr`.
# NOTE(review): `df_xfr` is not created in any cell shown above - confirm where it comes from.
df_test_2 = df_xfr.copy()
 
outlier_counts = count_outliers(df_test_2)
print(outlier_counts)
plot_outlier_counts(outlier_counts)

### Using isolation forest for outlier removal 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest

def isolation_forest_outlier_removal(df, initial_contamination=0.1, max_attempts=5, contamination_adjustment=0.02):
    """
    Replace Isolation Forest outliers with the column median, column by column.

    A column is processed when it is int64/float64, is listed in the
    notebook-level ``Imputation_data_dictionary['Median']`` and is not one of
    the hard-coded exclusions. Every attempt starts again from the column's
    original values: values that a single-column IsolationForest labels -1 are
    replaced with the original median. If the absolute skewness does not drop,
    the contamination is lowered by ``contamination_adjustment`` (never below
    0.01) and the attempt is repeated, up to ``max_attempts`` times. A column
    is only overwritten when an attempt reduced its skewness; otherwise it is
    left exactly as it was.

    Args:
        df (pd.DataFrame): Frame to clean. Successful columns are replaced in
            place, so pass a copy if the original must be preserved.
        initial_contamination (float): Contamination used on the first attempt.
        max_attempts (int): Maximum number of attempts per column.
        contamination_adjustment (float): Amount subtracted from the
            contamination after an unsuccessful attempt.

    Returns:
        pd.DataFrame: The same DataFrame object.
    """
    excluded_cols = ('party_id', 'health_envrnmnt_index', 'comp_ptnt_cnt', 'ethnic_hispanic_pct',
                     'org_hcp_cnt', 'race_black_pct', 'popltn_wo_health_insrnc')
    median_cols = Imputation_data_dictionary['Median']

    for col in list(df.columns):
        if df[col].dtype not in ['int64', 'float64'] or col not in median_cols or col in excluded_cols:
            continue

        original = df[col].copy()
        # IsolationForest rejects NaN, so it is fitted on the observed rows only.
        observed = original.notna().to_numpy()
        skew_before = original.skew()
        if pd.isna(skew_before):
            print(f"Skipping column '{col}': skewness is undefined.")
            continue

        median = original.median()
        skew_after = skew_before
        contamination = initial_contamination
        candidate = original
        attempts = 0
        reduced = False

        while not reduced and attempts < max_attempts:
            attempts += 1
            print(f"Attempt {attempts}: Adjusting contamination level for column '{col}' (current contamination={contamination})")
            print(f"Skewness of {col} before outlier removal: {skew_before:.4f}")

            # Apply Isolation Forest to the ORIGINAL values. Retrying on data
            # already overwritten by an earlier attempt would compound the
            # replacements.
            iso_forest = IsolationForest(contamination=contamination, random_state=42)
            flags = iso_forest.fit_predict(original[observed].to_frame())
            is_outlier = np.zeros(len(original), dtype=bool)
            is_outlier[observed] = flags == -1

            # Series.mask upcasts when needed, so a fractional median can be
            # written into an integer column.
            candidate = original.mask(is_outlier, median)

            # Recalculate skewness after outlier removal
            skew_after = candidate.skew()
            print(f"Skewness of {col} after outlier removal: {skew_after:.4f}")

            # A NaN skewness compares False and therefore counts as "not reduced".
            reduced = abs(skew_after) < abs(skew_before)
            if not reduced:
                # Adjust the contamination level and retry
                contamination = max(0.01, contamination - contamination_adjustment)  # Ensure contamination stays positive
                print(f"Skewness did not reduce. Adjusting contamination level to {contamination}.")

        if reduced:
            df[col] = candidate
            print(f"Success: Skewness reduced for column '{col}' after {attempts} attempts. Final skewness: {skew_after:.4f}")
        else:
            print(f"Warning: Even after {attempts} attempts, skewness for column '{col}' did not reduce; column left unchanged. Final skewness: {skew_before:.4f}")

    return df

# Example usage: run the outlier replacement on a copy of `df_xfr`.
# NOTE(review): `df_xfr` is not created in any cell shown above - confirm where it comes from.
df_test_1 = df_xfr.copy()
df_transformed_engagers = isolation_forest_outlier_removal(df_test_1)

### Choosing best transformation per column 

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import boxcox
from sklearn.preprocessing import FunctionTransformer

def robust_transformation(df, skew_threshold=0.5, columns=None):
    """
    Apply the first skew-reducing transformation that works to each eligible column.

    For every eligible int64/float64 column whose absolute skewness exceeds
    ``skew_threshold``, candidate transformations are tried in a fixed order
    that depends on the sign and size of the skewness. The first candidate
    that (a) does not turn a finite value into NaN/inf and (b) lowers the
    absolute skewness is kept. If none qualifies the column is left unchanged.

    Parameters:
    - df: DataFrame, the data to be transformed. It is not modified.
    - skew_threshold: float, the threshold above which transformations are considered for reducing skewness.
    - columns: optional iterable of column names to consider. When omitted,
      the notebook-level ``Imputation_data_dictionary['Median']`` list is used
      minus 'party_id' and the hard-coded exclusions, as before.

    Returns:
    - transformed_df: DataFrame, the transformed data.
    - transformation_log: dict, column name -> name of the transformation applied ('None' if none).
    """
    default_excluded = ('party_id', 'health_envrnmnt_index', 'comp_ptnt_cnt', 'ethnic_hispanic_pct',
                        'org_hcp_cnt', 'race_black_pct', 'popltn_wo_health_insrnc')
    if columns is None:
        eligible = [name for name in Imputation_data_dictionary['Median'] if name not in default_excluded]
    else:
        eligible = list(columns)

    transformation_log = {}

    def log_transform(col):
        return np.log1p(col)

    def sqrt_transform(col):
        return np.sqrt(col)

    def inverse_transform(col):
        return 1 / (col + 1e-6)  # Small epsilon to avoid division by zero

    def boxcox_transform(col):
        # Box-Cox can only be applied to positive values
        transformed_col, _ = boxcox(col + 1e-6)  # Adding a small epsilon to avoid zero values
        return transformed_col

    def square_transform(col):
        return np.power(col, 2)

    def cube_transform(col):
        return np.power(col, 3)

    def apply_transformation(col, skewness):
        """Return the ordered (name, function) candidates for this skewness."""
        transformation_methods = []

        if skewness > 2:
            if (col > 0).all():
                transformation_methods.append(('Box-Cox', boxcox_transform))
            transformation_methods.append(('Log', log_transform))
        if skewness > 1:
            transformation_methods.append(('Square Root', sqrt_transform))
        if skewness < -1:
            transformation_methods.append(('Inverse', inverse_transform))
        if skewness < -0.5:
            transformation_methods.append(('Cube', cube_transform))
        if -0.5 < skewness < 0:
            transformation_methods.append(('Square', square_transform))

        return transformation_methods

    def introduces_invalid_values(before, after):
        """True when a finite input value became NaN or infinite."""
        before_ok = np.isfinite(np.asarray(before, dtype='float64'))
        after_ok = np.isfinite(np.asarray(after, dtype='float64'))
        return bool((before_ok & ~after_ok).any())

    transformed_df = df.copy()

    for col in transformed_df.columns:
        if transformed_df[col].dtype not in ['int64', 'float64'] or col not in eligible:
            continue

        skew_before = transformed_df[col].skew()
        print(f"Initial skewness for column '{col}': {skew_before:.4f}")

        if not abs(skew_before) > skew_threshold:
            print(f"No transformation applied to column '{col}' (skewness: {skew_before:.4f})")
            transformation_log[col] = 'None'
            continue

        # Work on floats so squaring/cubing an integer column cannot overflow.
        source = transformed_df[col].astype('float64')

        for method_name, transform_func in apply_transformation(source, skew_before):
            print(f"Attempting {method_name} transformation on column '{col}'...")
            try:
                with np.errstate(invalid='ignore', divide='ignore', over='ignore'):
                    transformed_col = transform_func(source)
            except ValueError as exc:
                print(f"{method_name} transformation failed for column '{col}': {exc}")
                continue

            # log/sqrt/inverse leave their domain for negative or near-zero
            # inputs. .skew() ignores the resulting NaNs, so without this check
            # a transformation that corrupts values could look like a success.
            if introduces_invalid_values(source, transformed_col):
                print(f"{method_name} transformation produced invalid values for column '{col}'; skipping it.")
                continue

            skew_after = pd.Series(transformed_col).skew()
            print(f"Skewness after {method_name}: {skew_after:.4f}")

            if abs(skew_after) < abs(skew_before):
                transformed_df[col] = transformed_col
                transformation_log[col] = method_name
                print(f"Success: {method_name} transformation reduced skewness for column '{col}' to {skew_after:.4f}")
                break
            print(f"{method_name} transformation did not reduce skewness for column '{col}'.")
        else:
            print(f"Warning: None of the transformations reduced skewness for column '{col}'.")
            transformation_log[col] = 'None'

    return transformed_df, transformation_log

# Example usage: transform a copy of the frame returned by isolation_forest_outlier_removal.
df_test_3 = df_transformed_engagers.copy()  # replace with the DataFrame of choice
df_transformed_engagers_xfr, transformation_log = robust_transformation(df_test_3)

# View the log of transformations applied
print("\nTransformation Log:")
for col, trans in transformation_log.items():
    print(f"{col}: {trans}")

### Quick feature selection code 

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif, RFE
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score

def feature_selection(df, target_column, variance_threshold=0.1, k_range=range(6, 31)):
    """
    Perform feature selection with variance threshold, SelectKBest, and RFE.
    The best value of k in SelectKBest is chosen based on the highest F1-weighted score.

    Steps:
      1. Label-encode object/category columns, then drop features whose
         variance does not exceed ``variance_threshold``.
      2. For each k, keep the k features with the highest mutual information
         and score an XGBoost classifier with 3-fold cross-validation.
      3. Run RFE with XGBoost on the winning feature set.

    Parameters:
    - df: DataFrame, input data. It is not modified.
    - target_column: str, name of the target column
    - variance_threshold: float, threshold for variance threshold selection
    - k_range: iterable of int, k values to try for SelectKBest. Values larger
      than the number of features left after step 1 are capped to that number.

    Returns:
    - df_selected: DataFrame, data with selected features followed by the target column
    - selected_features: pandas Index with the names of the selected features

    Raises:
    - ValueError: if k_range contains no positive value.
    - RuntimeError: if no k produced a valid cross-validation score.
    """
    # Separate features and target. 'party_id' is dropped when present but is
    # not required to exist.
    X = df.drop(columns=[target_column, 'party_id'], errors='ignore')
    y = df[target_column]

    # Encode categorical columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    for col in categorical_cols:
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))

    # 1. Remove features with low variance
    selector = VarianceThreshold(threshold=variance_threshold)
    X_var = selector.fit_transform(X)
    selected_features_var = X.columns[selector.get_support(indices=True)]

    # 2. Select K best features based on mutual information
    # Cap k at the number of available features and drop duplicates so the
    # same "keep everything" fit is not repeated for every oversized k.
    n_available = X_var.shape[1]
    candidate_ks = sorted({min(k, n_available) for k in k_range if k >= 1})
    if not candidate_ks:
        raise ValueError("k_range must contain at least one positive value.")

    def seeded_mutual_info(features, target):
        # mutual_info_classif is randomised; fix the seed so that reruns
        # select the same features.
        return mutual_info_classif(features, target, random_state=42)

    best_k = None
    best_score = -np.inf
    selected_features_kbest = None
    f1_scorer = make_scorer(f1_score, average='weighted')

    for k in candidate_ks:
        selector = SelectKBest(score_func=seeded_mutual_info, k=k)
        X_kbest = selector.fit_transform(X_var, y)
        scores = cross_val_score(XGBClassifier(random_state=42, eval_metric='mlogloss'), X_kbest, y, cv=3, scoring=f1_scorer)
        mean_score = np.mean(scores)
        print(f"F1-weighted score for k={k}: {mean_score:.4f}")
        if mean_score > best_score:
            best_score = mean_score
            best_k = k
            selected_features_kbest = selected_features_var[selector.get_support(indices=True)]

    if best_k is None:
        raise RuntimeError("No k value produced a valid F1-weighted score; cross-validation failed for every k.")

    print(f"\nBest k value based on F1-weighted score: {best_k}")

    # 3. Recursive Feature Elimination (RFE) with XGBoost
    # NOTE(review): RFE is asked to keep best_k features out of the best_k it
    # is given, so it cannot eliminate anything. Choose a smaller
    # n_features_to_select if RFE is meant to prune further.
    model = XGBClassifier(random_state=42, eval_metric='mlogloss')
    selector = RFE(model, n_features_to_select=best_k)
    selector.fit(X[selected_features_kbest], y)
    selected_features = selected_features_kbest[selector.get_support(indices=True)]

    # Build the result as an independent copy; assigning the target into a
    # slice of df triggers SettingWithCopyWarning.
    df_selected = df[list(selected_features) + [target_column]].copy()

    return df_selected, selected_features
 
# Example usage: select features from the transformed frame, using 'Target' as the label column.
target_column = 'Target'
df_selected, selected_features = feature_selection(df_transformed_engagers_xfr, target_column)
print("Selected features:", selected_features)

## Multi class 

## Plotting and sampling functions

In [155]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import numpy as np
import matplotlib.pyplot as plt


def prepare_data_with_sampling(X, y, test_size=0.2, random_state=42, sampling_strategy='auto'):
    """
    Hold out a test split, then oversample the training split with SMOTE.

    Only the training portion is resampled; the test portion is returned
    exactly as produced by the split. No undersampling is performed.

    Parameters:
    X (DataFrame or ndarray): Feature matrix.
    y (Series or ndarray): Target vector.
    test_size (float): Proportion of the dataset to include in the test split.
    random_state (int): Seed passed to both the split and SMOTE.
    sampling_strategy (str, float, dict): Sampling strategy passed to SMOTE.

    Returns:
    X_train: Training features before resampling.
    y_train: Training labels before resampling.
    X_test: Test features.
    y_test: Test labels.
    X_resampled: Training features after SMOTE.
    y_resampled: Training labels after SMOTE.
    """
    # Hold-out split
    split_parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = split_parts

    # Oversample the training portion only
    oversampler = SMOTE(sampling_strategy=sampling_strategy, random_state=random_state)
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

    return X_train, y_train, X_test, y_test, X_resampled, y_resampled

# Example usage
# X, y = your_data  # Replace with your actual data
# X_train, y_train, X_test, y_test, X_resampled, y_resampled = prepare_data_with_sampling(X, y)

def plot_feature_importance(importance, features, model_name):
    """Show a horizontal bar chart of feature importances, largest on top.

    Args:
        importance (array-like): One importance value per feature. Lists,
            tuples, pandas Series and numpy arrays are accepted.
        features (sequence): Feature names, in the same order as ``importance``.
        model_name (str): Name shown in the plot title.

    Raises:
        ValueError: If ``features`` and ``importance`` differ in length.
    """
    # Convert up front: fancy indexing below needs an ndarray, and positional
    # lookup of names must not depend on a pandas index.
    importance = np.asarray(importance)
    features = list(features)

    # Ensure the features list matches the number of importances
    print("Number of features",len(features))
    print("Number of importance",len(importance))
    if len(features) != len(importance):
        raise ValueError("The length of features does not match the number of importances")

    # Sort feature importances in descending order
    indices = np.argsort(importance)[::-1]

    # Rearrange feature names so they match the sorted feature importances
    names = [features[i] for i in indices]

    # Create plot
    plt.figure(figsize=(5, 15))
    plt.title(f"Feature Importance for {model_name}")
    plt.barh(range(len(importance)), importance[indices])

    # Add feature names as y-axis tick labels
    plt.yticks(range(len(importance)), names)

    # Invert y-axis to have the most important features on top
    plt.gca().invert_yaxis()

    # Show plot
    plt.show()

### Creating test and train with sampling

In [157]:
# Build the modelling matrices from `df` and create the train/test/SMOTE splits.
# NOTE(review): this cell reads `df`, not `df_transformed_engagers` or
# `df_selected` produced by the cells above - confirm which frame is intended.

X = df.drop(columns=['Target'])
y = df['Target']

X_train, y_train, X_test, y_test, X_train_smote, y_train_smote = prepare_data_with_sampling(X, y)


## Modelling function 

In [159]:
# Global variables to store the trained models.
# They start as None and are assigned by train_and_evaluate, which declares them `global`.
Random_forest_model = None
Adaboost_model = None
GB_model = None
XGB_model = None


def train_and_evaluate(models, param_grids, X_train, y_train, X_test, y_test, feature_names,
                       class_weights=None, scoring=None, cv=3):
    """Grid-search every model, report train/test performance and keep the best estimators.

    For each entry of ``models`` a ``GridSearchCV`` is fitted on the training
    data, classification reports and confusion matrices are shown for the
    training and the test set, feature importances are plotted when the
    estimator exposes them, and the refitted best estimator is cross-validated
    once more on the training data.

    The best estimator is stored in a module-level global chosen by the model
    name: ``'Random Forest'`` -> ``Random_forest_model``, ``'Gradient Boosting'``
    -> ``GB_model``, ``'Extreme Gradient Boost'`` -> ``XGB_model``; any other
    name is stored in ``Adaboost_model``.

    Parameters
    ----------
    models : dict
        Display name -> unfitted estimator.
    param_grids : dict
        Display name -> parameter grid for ``GridSearchCV``; must contain every
        key of ``models``.
    X_train, y_train : training features / labels used for the search.
    X_test, y_test : hold-out features / labels used only for reporting.
    feature_names : list of str
        Column names used to label the feature-importance plot.
    class_weights : dict or str, optional
        When given, it is set as the ``class_weight`` parameter on a clone of
        every estimator that has such a parameter. (It cannot be passed to
        ``fit``: scikit-learn estimators do not accept it there.)
    scoring : str or callable, optional
        Scorer for the grid search and the final cross-validation. By default
        ``'f1'`` is used for targets with at most two classes and
        ``'f1_weighted'`` otherwise, because plain ``'f1'`` is binary-only.
    cv : int or CV splitter, default 3
        Cross-validation strategy for the search and the final check.

    Returns
    -------
    None
        Results are printed / plotted and the estimators are stored in globals.
    """
    global Random_forest_model, Adaboost_model, GB_model, XGB_model

    # sklearn is already a notebook dependency; imported locally so this cell
    # does not rely on another cell having imported `clone`.
    from sklearn.base import clone

    if scoring is None:
        n_classes = len(np.unique(y_train))
        scoring = 'f1' if n_classes <= 2 else 'f1_weighted'

    for name, model in models.items():
        print(f"Training and tuning {name}...")

        # Class weights are an estimator parameter, not a fit argument.
        # Work on a clone so the caller's estimator is left untouched.
        estimator = model
        if class_weights and 'class_weight' in model.get_params(deep=False):
            estimator = clone(model).set_params(class_weight=class_weights)

        grid_search = GridSearchCV(estimator=estimator, param_grid=param_grids[name], verbose=1,
                                   n_jobs=-1, return_train_score=True, cv=cv, scoring=scoring)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        # Publish the tuned estimator in the matching module-level slot.
        if name == 'Random Forest':
            Random_forest_model = best_model
        elif name == 'Gradient Boosting':
            GB_model = best_model
        elif name == 'Extreme Gradient Boost':
            XGB_model = best_model
        else:
            Adaboost_model = best_model

        # Predictions (GridSearchCV delegates to the refitted best estimator)
        y_pred_train = grid_search.predict(X_train)
        y_pred_test = grid_search.predict(X_test)

        # Print classification reports
        print(f"\nClassification Report for Training Set ({name}):")
        print(classification_report(y_train, y_pred_train))
        print("\n")
        train_cnf_matrix = confusion_matrix(y_train, y_pred_train)
        disp_train = ConfusionMatrixDisplay(confusion_matrix=train_cnf_matrix)
        disp_train.plot(cmap='viridis', values_format='d')
        plt.show()

        print(f"\nClassification Report for Test Set ({name}):")
        print(classification_report(y_test, y_pred_test))
        print("\n")
        test_cnf_matrix = confusion_matrix(y_test, y_pred_test)
        disp_test = ConfusionMatrixDisplay(confusion_matrix=test_cnf_matrix)
        disp_test.plot(cmap='inferno', values_format='d')
        plt.show()

        # Plot feature importance if the model has the attribute
        if hasattr(best_model, 'feature_importances_'):
            plot_feature_importance(best_model.feature_importances_, feature_names, name)
        elif hasattr(best_model, 'coef_'):
            # Linear models: use the magnitude of the first coefficient row.
            plot_feature_importance(np.abs(best_model.coef_[0]), feature_names, name)

        # Evaluate the tuned model using cross-validation on the training data
        cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring=scoring)
        print(f"Cross-validation scores for {name}: {cv_scores}")
        print(f"Mean cross-validation score for {name}: {np.mean(cv_scores)}")

        # Print best parameters
        print(f"Best parameters for {name}: {grid_search.best_params_}")

# Estimators to tune, keyed by the display name that train_and_evaluate
# matches on (the same keys are used to look up param_grids).
models = {}
models['Extreme Gradient Boost'] = XGBClassifier(eval_metric='mlogloss', random_state=12)

# models = {
#     'Random Forest': RandomForestClassifier(random_state=12, class_weight='balanced'),   
#     'Gradient Boosting': GradientBoostingClassifier(random_state=12),
#     'Extreme Gradient Boost': XGBClassifier(random_state=12,eval_metric='mlogloss')
# }

In [160]:
# Define the parameter grids for each model
# Keys must match the names used in `models`: train_and_evaluate looks each
# grid up with param_grids[name].
# NOTE: every list below is an empty placeholder and has to be filled in with
# the candidate values to search before the grid is handed to GridSearchCV
# (an empty candidate list is not a valid grid).
# NOTE(review): 'lambda' and 'alpha' are presumably meant as XGBoost's L2 / L1
# regularisation terms (`reg_lambda` / `reg_alpha` in the sklearn wrapper) —
# confirm the spelling the installed xgboost version expects.
param_grids = {
    'Extreme Gradient Boost': {
        'n_estimators': [ ],
        'learning_rate': [  ],
        'max_depth': [ ],
        'min_child_weight': [ ],
        'gamma': [ ],
        'lambda': [ ],
        'alpha': [ ]
    }
}
# Column names of the (un-resampled) training frame; passed to
# train_and_evaluate to label the feature-importance plot.
feature_names = X_train.columns.tolist()

#### Using SMOTE

In [None]:
# Tune on the SMOTE-resampled training data, report on the untouched test split.
train_and_evaluate(
    models=models,
    param_grids=param_grids,
    X_train=X_train_smote,
    y_train=y_train_smote,
    X_test=X_test,
    y_test=y_test,
    feature_names=feature_names,
)

In [162]:
# Display the tuned XGBoost estimator that train_and_evaluate stored in the
# module-level `XGB_model` global.
XGB_model

In [None]:

# `shap` and `plt` are already imported in the notebook's first cell, so they
# are not re-imported here.

# Explain the tuned XGBoost model (`XGB_model`, set by train_and_evaluate) on
# the SMOTE-resampled training data.
explainer = shap.TreeExplainer(XGB_model)
SHAP_values = explainer.shap_values(X_train_smote)

# Normalise the result to shape (n_samples, n_features, n_classes):
#  - older shap releases return a list with one 2-D array per class,
#  - a binary model yields a single 2-D array.
if isinstance(SHAP_values, list):
    SHAP_values = np.stack(SHAP_values, axis=-1)
elif SHAP_values.ndim == 2:
    SHAP_values = SHAP_values[:, :, np.newaxis]
shap.initjs()

# Define the target classes (placeholder: replace with one label per model
# output; the position in this list is used as the class index below).
target_classes = ['list the target classes as required '] #### change as per preference

# Plot SHAP values for each target class
for cnt, target_class in enumerate(target_classes):
    plt.title(
        f"==============================================\nSHAP Values for {target_class}\n==============================================")
    shap.summary_plot(SHAP_values[:, :, cnt], X_train_smote, plot_type='dot')
    plt.show()

### Check SHAP feature importance

In [None]:

# Index of the model output (class) whose SHAP values are summarised.
i = 2
explainer = shap.TreeExplainer(XGB_model)  # XGB_model is the best estimator stored by train_and_evaluate
SHAP_values = explainer.shap_values(X_train_smote)

# Older shap releases return a list with one (n_samples, n_features) array per
# class; stack it so the [:, :, class] indexing below works either way.
if isinstance(SHAP_values, list):
    SHAP_values = np.stack(SHAP_values, axis=-1)

class_shap = SHAP_values[:, :, i]

# Create DataFrame for absolute mean SHAP values (overall strength of each feature)
df_absmeanshap = pd.DataFrame(list(zip(X_train.columns, np.abs(class_shap).mean(axis=0))), columns=['features', 'absmeanshap'])

# Create DataFrame for mean SHAP values. This is the *signed* mean (direction
# of the average effect); taking np.abs here would just duplicate 'absmeanshap'.
df_meanshap = pd.DataFrame(list(zip(X_train.columns, class_shap.mean(axis=0))), columns=['features', 'meanshap'])

# Merge the two DataFrames on 'features'
df_f = pd.merge(df_absmeanshap, df_meanshap, on='features', how='left')

# Sort by 'absmeanshap' in descending order and select top 20 rows
sorted_df = df_f.sort_values(by='absmeanshap', ascending=False).head(20)

sorted_df

### Exhaustive feature selection

In [None]:
class FeatureSelector:
    """Run and compare several feature-selection strategies on one hold-out split.

    On construction the data is split 70/30 (``random_state=0``) and
    standardised with a scaler fitted on the training part only. Every
    selection method works on the scaled arrays and returns positional column
    indices into ``X``.

    NOTE(review): ``StandardScaler``, ``clone`` and ``SFS`` (expected to be
    mlxtend's ``SequentialFeatureSelector``) are not among the imports of the
    notebook's first cell — confirm they are imported before this class is used.
    """

    def __init__(self, X, y, random_state=None):
        """Split and scale the data.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix (``embedded_method`` reads ``X.columns``).
        y : array-like
            Target labels.
        random_state : int, optional
            Seed for the random forests created by the selection methods.
            ``None`` (default) leaves them unseeded.
        """
        self.X = X
        self.y = y
        self.random_state = random_state
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.3, random_state=0
        )
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

    def _make_forest(self):
        """Return the 100-tree random forest used as the default evaluation model."""
        return RandomForestClassifier(
            n_estimators=100, random_state=getattr(self, 'random_state', None)
        )

    def evaluate_model(self, model, X_train_subset, X_test_subset):
        """Fit ``model`` on the training subset and score it on the test subset.

        Returns
        -------
        (accuracy, f1) : tuple of float
            Accuracy and weighted F1 on the hold-out labels.
        """
        model.fit(X_train_subset, self.y_train)
        y_pred = model.predict(X_test_subset)
        accuracy = accuracy_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred, average='weighted')
        return accuracy, f1

    def filter_method(self, k=25):
        """Select the ``k`` best features by ANOVA F-score and evaluate them.

        Returns the indices of the selected columns.
        """
        selector = SelectKBest(score_func=f_classif, k=k)
        selector.fit(self.X_train_scaled, self.y_train)
        selected_features = selector.get_support(indices=True)
        model = self._make_forest()
        accuracy, f1 = self.evaluate_model(
            model,
            self.X_train_scaled[:, selected_features],
            self.X_test_scaled[:, selected_features],
        )
        print(f"Filter Method - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")
        return selected_features

    def wrapper_rfe(self, n_features_to_select=25):
        """Select features by recursive feature elimination with a random forest.

        Returns the indices of the columns kept by RFE.
        """
        model = self._make_forest()
        rfe = RFE(model, n_features_to_select=n_features_to_select)
        fit = rfe.fit(self.X_train_scaled, self.y_train)
        rfe_features = np.where(fit.support_)[0]
        accuracy, f1 = self.evaluate_model(
            model,
            self.X_train_scaled[:, rfe_features],
            self.X_test_scaled[:, rfe_features],
        )
        print(f"RFE Method - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")
        return rfe_features

    def exhaustive_feature_selection(self, model, min_features=4, max_features=20):
        """Evaluate every feature subset of an allowed size and keep the best one.

        Parameters
        ----------
        model : estimator
            Cloned for every subset, so the passed instance is never fitted.
        min_features, max_features : int
            Inclusive bounds on the subset size. They are clamped to
            ``[1, n_features]``.

        Returns
        -------
        tuple of int or None
            Column indices of the subset with the highest weighted F1 on the
            hold-out split, or ``None`` when no subset scored above 0.

        Notes
        -----
        The number of subsets grows combinatorially with the number of
        features; keep the size range narrow on wide data.
        """
        from itertools import combinations
        from math import comb

        n_features = self.X_train_scaled.shape[1]
        smallest = max(1, min_features)
        largest = min(max_features, n_features)
        sizes = range(smallest, largest + 1)

        # Count the subsets arithmetically instead of materialising them all.
        total_combinations = sum(comb(n_features, k) for k in sizes)

        best_score = 0
        best_features = None
        with tqdm(total=total_combinations, desc="Feature Selection Progress") as pbar:
            for k in sizes:
                for subset in combinations(range(n_features), k):
                    columns = list(subset)
                    _, f1 = self.evaluate_model(
                        clone(model),
                        self.X_train_scaled[:, columns],
                        self.X_test_scaled[:, columns],
                    )
                    # Strict '>' keeps the first subset found on ties.
                    if f1 > best_score:
                        best_score = f1
                        best_features = subset
                    pbar.update(1)

        print(f"Exhaustive Feature Selection - Best f1: {best_score:.4f}")
        return best_features

    def wrapper_sfs(self, k_features='best'):
        """Run floating sequential forward selection once and evaluate the result.

        The selector is fitted a single time. With the default
        ``k_features='best'`` it is expected to score every subset size and keep
        the subset with the best cross-validated weighted F1, so
        ``sfs_results`` still contains one row per size for ``plot_sfs``.

        Returns
        -------
        (sfs_features, sfs_results)
            Selected column indices and the per-size metric table.
        """
        model = self._make_forest()
        sfs = SFS(model,
                  k_features=k_features,
                  floating=True,
                  scoring='f1_weighted',
                  cv=5,
                  n_jobs=-1,
                  pre_dispatch='2*n_jobs',
                  verbose=0)
        sfs = sfs.fit(self.X_train_scaled, self.y_train)

        self.sfs_results = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
        self.sfs_features = sfs.k_feature_idx_

        # Evaluate model performance on the selected columns
        columns = list(self.sfs_features)
        accuracy, f1 = self.evaluate_model(
            model,
            self.X_train_scaled[:, columns],
            self.X_test_scaled[:, columns],
        )
        print(f"SFS Method - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")
        return self.sfs_features, self.sfs_results

    def embedded_method(self):
        """Rank features by random-forest importance, plot and evaluate them.

        Stores ``feature_importances`` (name -> importance) and
        ``embedded_indices`` (column indices, most important first) on the
        instance and returns the latter. The reported scores are for a model
        trained on *all* columns in ranked order, not on a reduced subset.
        """
        model = self._make_forest()
        model.fit(self.X_train_scaled, self.y_train)
        importances = model.feature_importances_
        self.feature_importances = dict(zip(self.X.columns, importances))
        self.embedded_indices = np.argsort(importances)[::-1]

        # Plot feature importances
        feature_names = np.array(self.X.columns)[self.embedded_indices]
        sorted_importances = importances[self.embedded_indices]

        plt.figure(figsize=(10, 6))
        plt.barh(feature_names, sorted_importances, color='skyblue')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.title('Feature Importances')
        plt.gca().invert_yaxis()
        plt.show()

        # Evaluate model performance
        X_train_subset = self.X_train_scaled[:, self.embedded_indices]
        X_test_subset = self.X_test_scaled[:, self.embedded_indices]
        accuracy, f1 = self.evaluate_model(model, X_train_subset, X_test_subset)
        print(f"Embedded Method - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

        return self.embedded_indices

    def plot_sfs(self):
        """Plot the average CV score per subset size from the last SFS run.

        Raises
        ------
        ValueError
            If ``wrapper_sfs`` has not been run yet.
        """
        if not hasattr(self, 'sfs_results'):
            raise ValueError("SFS has not been run yet.")

        plt.figure(figsize=(14, 7))
        plt.plot(range(1, len(self.sfs_results) + 1), self.sfs_results['avg_score'],
                 marker='o', color='b', label='F1 Score')

        plt.title('Sequential Feature Selection (SFS) Performance')
        plt.xlabel('Number of Features')
        plt.ylabel('Score')
        plt.grid(True)
        plt.legend()
        plt.show()

    def get_selected_features(self):
        """Run every selection method and return their results keyed by method name.

        Also writes the SFS metric table to ``sfs_results.csv``.
        """
        filter_features = self.filter_method()
        rfe_features = self.wrapper_rfe()
        sfs_features, sfs_results = self.wrapper_sfs()
        # Persist the SFS table right away so it is not lost if the (much
        # slower) exhaustive search below is interrupted.
        sfs_results.to_csv("sfs_results.csv", index=False)
        embedded_features = self.embedded_method()
        exhaustive_features = self.exhaustive_feature_selection(self._make_forest())

        return {
            'Filter Method': filter_features,
            'RFE Method': rfe_features,
            'SFS Method': sfs_features,
            'Exhaustive Method': exhaustive_features,
            'Embedded Method': embedded_features[:25] # Top 25 features
        }

    def list_feature_importances(self):
        """Return the name -> importance mapping produced by ``embedded_method``.

        Raises
        ------
        ValueError
            If ``embedded_method`` has not been run yet.
        """
        if not hasattr(self, 'feature_importances'):
            raise ValueError("Embedded method has not been run yet.")

        return self.feature_importances
    
 
# ##Example Usage

# selector = FeatureSelector(X, y)
# selected_features = selector.get_selected_features()
# print("Selected Features:")
# for method, features in selected_features.items():
#     print(f"{method}: {features}")

# # Print feature importances
# feature_importances = selector.list_feature_importances()
# print("\nFeature Importances:")
# for feature, importance in feature_importances.items():
#     print(f"{feature}: {importance:.4f}")

# print("Feature Importances:")
# print(selector.list_feature_importances())

# # Plot SFS performance
# selector.wrapper_sfs()  # Run SFS to populate sfs_results
# selector.plot_sfs()  # Plot SFS performance
    