In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import os 
pd.set_option('mode.chained_assignment', None)

class FeatureExtractor:
    """Split the columns of a DataFrame into categorical and continuous features.

    A column is classified as categorical when its dtype is not numeric
    (object, string, category, datetime, ...) or when it holds fewer than
    ``max_categorical_unique`` distinct values (a missing value counts as one
    distinct value). Every other column is classified as continuous. The
    target column is never classified.

    :param df: Source DataFrame; it is only read, never modified.
    :param target_column: Name of the column to leave out of both lists.
    :param max_categorical_unique: Numeric columns with strictly fewer distinct
        values than this are treated as categorical (default 10).
    """

    def __init__(self, df, target_column, max_categorical_unique=10):
        self.df = df
        self.target_column = target_column
        self.max_categorical_unique = max_categorical_unique
        self.categorical_features = []
        self.continuous_features = []
        self._identify_features()

    def _identify_features(self):
        """Rebuild both feature lists from the current DataFrame columns."""
        # Build fresh lists so that calling this method again never duplicates entries.
        categorical, continuous = [], []
        for column in self.df.columns:
            if column == self.target_column:
                continue
            series = self.df[column]
            # Non-numeric dtypes cannot be quantile-binned, so they are always
            # categorical; the distinct-value count is only needed for numeric columns.
            is_categorical = (
                not pd.api.types.is_numeric_dtype(series.dtype)
                or series.nunique(dropna=False) < self.max_categorical_unique
            )
            (categorical if is_categorical else continuous).append(column)
        self.categorical_features = categorical
        self.continuous_features = continuous

    def get_categorical_features(self):
        """Return the list of column names classified as categorical."""
        return self.categorical_features

    def get_continuous_features(self):
        """Return the list of column names classified as continuous."""
        return self.continuous_features


class CategoricalFeature:
    """Categorical feature whose own values are used directly as bins.

    :param df: Source DataFrame; it is only read, never modified.
    :param feature: Name of the categorical column.
    :param target_column: Name of the target column.
    """

    def __init__(self, df, feature, target_column):
        self.df = df
        self.feature = feature
        self.target_column = target_column

    @property
    def df_lite(self):
        """Return a new two-column frame: ``'bin'`` and the target column.

        ``'bin'`` holds the feature values with missing entries replaced by the
        string ``'MISSING'``. The frame is rebuilt on every access.
        """
        values = self.df[self.feature]
        # A pandas categorical refuses fillna with a label that is not one of
        # its categories, so fall back to plain objects before filling.
        if isinstance(values.dtype, pd.CategoricalDtype):
            values = values.astype(object)
        # Copy only the target column instead of the whole (possibly wide) frame.
        df_lite = self.df[[self.target_column]].copy()
        df_lite.insert(0, 'bin', values.fillna('MISSING'))
        return df_lite


class ContinuousFeature:
    """Numeric feature that is quantile-binned before WOE/IV analysis.

    :param df: Source DataFrame; it is only read, never modified.
    :param feature: Name of the numeric column to bin.
    :param target_column: Name of the target column.
    :param min_bin_share: Fraction of the rows that every bin must exceed for a
        binning to be accepted (default 0.05).
    """

    def __init__(self, df, feature, target_column, min_bin_share=0.05):
        self.df = df
        self.feature = feature
        self.target_column = target_column
        # Minimum bin population expressed as an absolute row count.
        self.bin_min_size = int(len(self.df) * min_bin_share)

    def __generate_bins(self, bins_num):
        """Return a copy of the feature/target columns plus a ``'bin'`` column.

        ``'bin'`` is the left edge (as float) of the quantile interval each row
        falls into; rows with a missing feature value get NaN. Duplicate
        quantile edges are dropped, so fewer than ``bins_num`` bins may result.
        """
        df = self.df[[self.feature, self.target_column]].copy()
        quantile_bins = pd.qcut(df[self.feature], bins_num, duplicates='drop')
        df['bin'] = quantile_bins.apply(lambda x: x.left).astype(float)
        return df

    def __generate_correct_bins(self, bins_max=20):
        """Search from ``bins_max`` down to 2 quantile bins for an acceptable binning.

        A binning is accepted when the per-bin target sums are perfectly
        rank-correlated with the bin edges, every bin holds more than
        ``bin_min_size`` rows, and no bin has a target sum equal to its row
        count. When no candidate is accepted, the 2-bin frame is returned.

        :raises ValueError: if ``bins_max`` is smaller than 2.
        """
        if bins_max < 2:
            raise ValueError(f"bins_max must be at least 2, got {bins_max}")

        for bins_num in range(bins_max, 1, -1):
            df = self.__generate_bins(bins_num)
            df_grouped = (
                df.groupby('bin')
                .agg({self.feature: 'count', self.target_column: 'sum'})
                .reset_index()
            )
            # A rank correlation needs at least two bins to be defined.
            if len(df_grouped) < 2:
                continue
            r, _ = stats.spearmanr(df_grouped['bin'], df_grouped[self.target_column])
            # Compare with a tolerance: the coefficient is a float and may miss
            # 1.0 by a rounding error even for a perfectly monotone relation.
            # A NaN coefficient (constant sums) is never close to 1.
            is_monotonic = bool(np.isclose(abs(r), 1.0))
            bins_large_enough = df_grouped[self.feature].min() > self.bin_min_size
            has_pure_bin = (df_grouped[self.feature] == df_grouped[self.target_column]).any()
            if is_monotonic and bins_large_enough and not has_pure_bin:
                break
        return df

    @property
    def df_lite(self):
        """Return a two-column frame: ``'bin'`` and the target column.

        Rows whose bin is missing are labelled ``'MISSING'``. The bin search is
        re-run on every access.
        """
        df_lite = self.__generate_correct_bins()
        df_lite['bin'] = df_lite['bin'].fillna('MISSING')
        return df_lite[['bin', self.target_column]]


class IV:
    """Weight-of-evidence (WOE) and information-value (IV) calculator.

    Works on any feature object exposing ``df_lite`` (a frame with a ``'bin'``
    column and the target column), ``feature`` and ``target_column``.
    """

    @staticmethod
    def __perc_share(df, group_name):
        """Return column ``group_name`` of ``df`` divided by its own total."""
        column = df[group_name]
        return column / column.sum()

    def __calculate_perc_share(self, feat):
        """Build the per-bin table with row count, 'good' and 'bad' columns.

        'good' is the sum of the target column within the bin and 'bad' is the
        row count minus 'good' (presumably the target is a 0/1 flag - confirm
        against the data being analysed).
        """
        per_bin = feat.df_lite.groupby('bin')[feat.target_column].agg(['count', 'sum'])
        table = per_bin.reset_index()
        table.columns = [feat.feature, 'count', 'good']
        table['bad'] = table['count'] - table['good']
        return table

    def __calculate_woe(self, feat):
        """Add 'perc_good', 'perc_bad' and 'woe' columns to the per-bin table."""
        table = self.__calculate_perc_share(feat)

        # Tiny constant added to numerators and denominators so that an empty
        # class in a bin (or overall) never yields a zero share.
        epsilon = 1e-10
        good_total = table['good'].sum()
        bad_total = table['bad'].sum()

        with np.errstate(divide='ignore', invalid='ignore'):
            table['perc_good'] = (table['good'] + epsilon) / (good_total + epsilon)
            table['perc_bad'] = (table['bad'] + epsilon) / (bad_total + epsilon)
            table['woe'] = np.log(table['perc_good'] / table['perc_bad'])

        return table

    def calculate_iv(self, feat):
        """Return ``(table, total_iv)`` for the given feature.

        The table gains an 'iv' column holding each bin's contribution,
        ``(perc_good - perc_bad) * woe``; ``total_iv`` is the column sum.
        """
        iv_df = self.__calculate_woe(feat)
        share_gap = iv_df['perc_good'] - iv_df['perc_bad']
        iv_df['iv'] = share_gap * iv_df['woe']
        total_iv = iv_df['iv'].sum()
        return iv_df, total_iv

    @staticmethod
    def interpretation(iv):
        """Map an IV value to a label using the thresholds 0.02 / 0.1 / 0.3 / 0.5."""
        # Each pair is (exclusive upper bound, label); anything at or above the
        # last bound - or not comparable, such as NaN - falls through.
        bands = (
            (0.02, 'useless'),
            (0.1, 'weak'),
            (0.3, 'medium'),
            (0.5, 'strong'),
        )
        for upper_bound, label in bands:
            if iv < upper_bound:
                return label
        return 'suspicious'


class Analysis:
    """Runs the IV calculation over a collection of feature objects."""

    def analyze_features(self, feats):
        """Return a dict keyed by feature name.

        Each value is a dict with the keys ``"IV"`` (total information value),
        ``"Interpretation"`` (label for that value) and ``"WOE DataFrame"``
        (the per-bin table returned by the IV calculation).
        """
        def summarise(feat):
            # A fresh calculator per feature, as each call is independent.
            calculator = IV()
            woe_table, total_iv = calculator.calculate_iv(feat)
            return {
                "IV": total_iv,
                "Interpretation": calculator.interpretation(total_iv),
                "WOE DataFrame": woe_table,
            }

        return {feat.feature: summarise(feat) for feat in feats}


def export_iv_to_excel(results_dict, filename='IV_results.xlsx'):
    """
    Export IV results to an Excel file, one worksheet per feature.

    Worksheet names are derived from the feature names: the name is converted
    to a string, characters that Excel forbids in sheet names are replaced by
    ``_``, the result is cut to 30 characters, and a numeric suffix is added
    when two features would otherwise end up with the same sheet name.

    :param results_dict: Dictionary containing IV results; each value must
        provide a "WOE DataFrame" entry.
    :param filename: Name of the output Excel file.
    """
    # Resolve the path once so the file written and the path reported match.
    full_path = os.path.abspath(filename)

    # Characters Excel does not allow in worksheet names.
    forbidden_chars = str.maketrans({char: '_' for char in '[]:*?/\\'})
    # Excel compares sheet names case-insensitively, so track them lower-cased.
    used_names = set()

    with pd.ExcelWriter(full_path) as writer:
        for feature, result in results_dict.items():
            base_name = str(feature).translate(forbidden_chars)[:30] or 'Sheet'
            sheet_name = base_name
            duplicate_no = 1
            while sheet_name.lower() in used_names:
                duplicate_no += 1
                tag = f"_{duplicate_no}"
                # Keep the suffixed name within Excel's 31-character limit.
                sheet_name = base_name[:31 - len(tag)] + tag
            used_names.add(sheet_name.lower())

            result_df = pd.DataFrame(result["WOE DataFrame"])
            result_df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Excel file saved at: {full_path}")

In [None]:

# Read the churn dataset; 'Exited' is the target column to explain
target_column_name = 'Exited'
df = pd.read_csv(r"Churn_Modelling.csv")

# Split the remaining columns into categorical and continuous features
feature_extractor = FeatureExtractor(df, target_column_name)
continuous_columns = feature_extractor.get_continuous_features()
categorical_columns = feature_extractor.get_categorical_features()

print("Categorical Features:", categorical_columns)
print("Continuous Features:", continuous_columns)

# Wrap every column in the feature object matching its kind
cat_features = [
    CategoricalFeature(df, name, target_column_name)
    for name in categorical_columns
]
cont_features = [
    ContinuousFeature(df, name, target_column_name)
    for name in continuous_columns
]

# Compute the WOE table and IV for each group of features
analysis_instance = Analysis()
results_cat = analysis_instance.analyze_features(cat_features)
results_cont = analysis_instance.analyze_features(cont_features)

# Merge both result sets into one dictionary (continuous entries win on a key clash)
combined_results = dict(results_cat)
combined_results.update(results_cont)

# Write one worksheet per feature; the function prints the full output path
export_iv_to_excel(combined_results)

print("IV results exported to Excel.")