In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

pd.set_option('display.max_columns', None)

import pickle
import joblib
import plotly.graph_objects as go
import plotly.express as px

from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz

from scipy.stats import ks_2samp, variation

import string

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.utils import class_weight
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Compute descriptive statistics for each variable across the different BASE groups
def _build_statistics_record(variable, current_cat, next_cat, values, psi):
    """Build one statistics record for `values`, the `variable` column of a single BASE group."""
    return {
        'Variable': variable,
        'Current_Category': current_cat,
        'Next_Category': next_cat,
        'min': values.min(),
        '1st_quartile': values.quantile(0.25),
        'mean': values.mean(),
        'median': values.median(),
        '3rd_quartile': values.quantile(0.75),
        'max': values.max(),
        'std_dev': values.std(),
        'count': values.count(),
        'null_count': values.isnull().sum(),
        'PSI': psi
    }


def calculate_statistics(df, variable):
    """
    Describe `variable` within each BASE group and compare consecutive groups with PSI.

    Groups are visited in the order returned by ``df['BASE'].unique()`` (order of
    first appearance in the frame). Each group is paired with the group that
    follows it; the last group has no successor.

    Parameters:
        df (pandas.DataFrame): Frame containing a 'BASE' column and `variable`.
        variable (str): Name of the numeric column to summarise.

    Returns:
        list[dict]: One record per BASE group with the descriptive statistics of
        that group and the PSI against the following group. 'Next_Category' and
        'PSI' are NaN for the last group. 'PSI' is also NaN when either side of
        a pair has no non-null values. An empty list is returned when the frame
        has no BASE values at all.
    """
    categories = df['BASE'].unique()
    if len(categories) == 0:
        # Nothing to describe; avoids an IndexError on categories[-1].
        return []

    # Evaluate the BASE filter once per group instead of once per pairing.
    subsets = [df.loc[df['BASE'] == category, variable] for category in categories]
    last_position = len(categories) - 1

    statistics = []
    for position, (category, values) in enumerate(zip(categories, subsets)):
        if position == last_position:
            # The last group has no successor to compare against.
            next_category = np.nan
            psi = np.nan
        else:
            next_category = categories[position + 1]
            # Drop NaN values before computing the PSI.
            current_clean = values.dropna()
            next_clean = subsets[position + 1].dropna()
            if len(current_clean) > 0 and len(next_clean) > 0:
                psi = calculate_psi(current_clean, next_clean)
            else:
                # With no observations on one side there is no distribution to
                # compare, so report "undefined" rather than a numeric PSI.
                psi = np.nan

        statistics.append(
            _build_statistics_record(variable, category, next_category, values, psi)
        )

    return statistics

def calculate_psi(current_dist, next_dist, bins=10, epsilon=1e-6):
    """
    Calculate the Population Stability Index (PSI) between current and next distributions.

    Bin edges are derived from `current_dist` and reused for `next_dist`, so both
    samples are compared on the same grid. The PSI is the sum over bins of
    ``(next - current) * ln(next / current)``, where ``next`` and ``current`` are
    the per-bin proportions of each sample.

    Parameters:
        current_dist (array-like): Values observed in the current period.
        next_dist (array-like): Values observed in the next period.
        bins (int): Number of bins to use for calculating distributions.
        epsilon (float): Lower bound applied to every bin proportion so that
            empty bins contribute a large finite term instead of an infinite
            one.

    Returns:
        float: Population Stability Index (PSI) value, or NaN when either
        sample has no non-NaN values.
    """
    current_values = np.asarray(current_dist, dtype=float).ravel()
    next_values = np.asarray(next_dist, dtype=float).ravel()

    # NaN cannot be binned (np.histogram rejects a non-finite range).
    current_values = current_values[~np.isnan(current_values)]
    next_values = next_values[~np.isnan(next_values)]

    if current_values.size == 0 or next_values.size == 0:
        # No distribution to compare; 0.0 would wrongly read as "perfectly stable".
        return float('nan')

    # Bin edges come from the current sample and are shared by both samples.
    current_counts, bin_edges = np.histogram(current_values, bins=bins)

    # Next-period values outside the current range would be dropped by
    # np.histogram while still counting in the denominator. Clipping assigns
    # them to the first/last bin so the proportions sum to 1.
    next_clipped = np.clip(next_values, bin_edges[0], bin_edges[-1])
    next_counts, _ = np.histogram(next_clipped, bins=bin_edges)

    # Floor the proportions: a bin that is empty on one side is the strongest
    # evidence of drift and must not vanish from the sum as an inf/NaN term.
    current_props = np.maximum(current_counts / current_values.size, epsilon)
    next_props = np.maximum(next_counts / next_values.size, epsilon)

    psi_values = (next_props - current_props) * np.log(next_props / current_props)
    return float(np.sum(psi_values))

# Driver: collect per-BASE statistics for every feature and tabulate them.
# NOTE(review): `lista` and `df_mod_1o` are not defined in this cell and must
# exist from an earlier one. The original comment referred to the frame as
# `df_mod_churn` — confirm which name is intended.
features_number = [lista]

all_statistics = []
for feature in features_number:
    # 'BASE' is the grouping column itself, so it is not summarised.
    if feature == 'BASE':
        continue
    all_statistics += calculate_statistics(df_mod_1o, feature)

# One row per (feature, BASE) record.
statistics_df = pd.DataFrame(all_statistics)

In [None]:
# Preview the first ten statistics records.
statistics_df.head(n=10)