# LABORATORIO SEMANA 7
Realizado por:

**- Ruelas Flores, César Diego**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.svm import SVC
import statsmodels.api as sm

**Mi método tradicional de cargar datos desde ucimlrepo**

In [2]:
from ucimlrepo import fetch_ucirepo 
# Fetch dataset id=15 through ucimlrepo and keep its feature and target tables.
breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 
# NOTE: X and y are rebound further down from the cleaned CSV frame.
X = breast_cancer_wisconsin_original.data.features 
y = breast_cancer_wisconsin_original.data.targets 

In [3]:
# Combine features and targets column-wise into a single DataFrame
data_directa_ucimlrepo = pd.concat([X, y], axis=1)

#### **Variables Globales**

In [4]:
# Location of the raw breast-cancer-wisconsin data file read by get_data_from_url.
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
# Column labels handed to pd.read_csv as `names`; clean_data later drops 'id'
# and recodes 'class' to 0/1.
COLUMN_NAMES = ['id', 'clump_thickness', 'uniformity_cell_size', 'uniformity_cell_shape',
                'marginal_adhesion', 'single_epithelial_cell_size', 'bare_nuclei',
                'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']

## FUNCIONES DE UTILIDAD

Función para cargar los datos

In [5]:
def get_data_from_url(url):
    """
    Load a CSV into a dataframe, labelling its columns with COLUMN_NAMES.

    input:
    - url: location of the csv file (anything pd.read_csv accepts)

    output:
    - dataframe: pd.DataFrame whose columns are named after COLUMN_NAMES
    """
    raw_frame = pd.read_csv(url, names=COLUMN_NAMES)
    return raw_frame

Limpieza del dataset

In [6]:
def clean_data(df):
    """
    Prepare the raw breast-cancer frame for modelling.

    Steps: treat '?' as missing and drop incomplete rows, cast 'bare_nuclei'
    to int, recode 'class' (4 -> 1, any other value -> 0) and remove 'id'.

    input:
    - df: Raw dataframe

    output:
    - df: Cleaned dataframe (a new object; the argument is left untouched)
    """
    complete_rows = df.replace('?', np.nan).dropna()
    # 'class' is a Python keyword, so the new columns are passed as a dict.
    recoded = complete_rows.assign(**{
        'bare_nuclei': lambda frame: frame['bare_nuclei'].astype(int),
        'class': lambda frame: frame['class'].apply(lambda label: 1 if label == 4 else 0),
    })
    return recoded.drop(columns=['id'])

Cálculo de IV

In [7]:
def calculate_iv(df, target):
    """
    Compute the (unsmoothed) Information Value of every feature.

    For each distinct value of a feature the share of events (target == 1)
    and of non-events (target == 0) is computed, then
    WOE = ln(Dist_Event / Dist_NonEvent) and
    IV = sum((Dist_Event - Dist_NonEvent) * WOE).

    No smoothing is applied: a value that has zero events or zero non-events
    makes the feature's IV infinite.

    input:
    - df: Cleaned dataframe
    - target: Target variable name (str)

    output:
    - pd.Series: IV values per feature, sorted in descending order
    """
    def woe_iv(df, feature, target):
        is_event = df[target] == 1
        is_non_event = df[target] == 0

        rows = []
        for level in np.sort(df[feature].unique()):
            at_level = df[feature] == level
            rows.append([level, (at_level & is_event).sum(), (at_level & is_non_event).sum()])
        table = pd.DataFrame(rows, columns=['Value', 'Event', 'NonEvent'])

        event_share = table['Event'] / table['Event'].sum()
        non_event_share = table['NonEvent'] / table['NonEvent'].sum()
        woe = np.log(event_share / non_event_share)
        return ((event_share - non_event_share) * woe).sum()

    scores = {}
    for column in df.columns:
        if column == target:
            continue
        scores[column] = woe_iv(df, column, target)
    return pd.Series(scores).sort_values(ascending=False)

El mismo código de arriba, pero optimizado (con IA)

In [8]:
import pandas as pd
import numpy as np

def calculate_iv_optimized(df: pd.DataFrame, target: str) -> pd.Series:
    """
    Compute the Information Value (IV) of every feature against a binary target.

    For each feature, rows are counted per (feature value, target) pair and
    the share of events (target == 1) and non-events (target == 0) falling on
    each value is derived. Shares equal to zero are replaced by a small
    epsilon so that WOE = ln(Dist_Event / Dist_NonEvent) stays finite, and
    IV = sum((Dist_Event - Dist_NonEvent) * WOE).

    Input:
    - df: cleaned DataFrame holding the feature columns and the target column.
    - target: name of the binary (0/1) target column.

    Output:
    - pd.Series: IV per feature, sorted in descending order. Every feature
      scores 0.0 when the target contains only one of the two classes.
    """
    epsilon = 1e-6  # stand-in for empty cells so the logarithm is defined

    def woe_iv_optimized_single_feature(df: pd.DataFrame, feature: str, target: str) -> float:
        count_table = df.groupby([feature, target]).size().unstack(fill_value=0)

        # A class that never occurs has no column after unstack. Add it with
        # zero counts so the single-class guard below can detect the case
        # (filling with 1 would invent one event per feature value).
        for label in (0, 1):
            if label not in count_table.columns:
                count_table[label] = 0

        non_event = count_table[0]
        event = count_table[1]

        total_event = event.sum()
        total_non_event = non_event.sum()

        # IV is undefined without both classes; report "no information".
        if total_event == 0 or total_non_event == 0:
            return 0.0

        dist_event = (event / total_event).replace(0, epsilon)
        dist_non_event = (non_event / total_non_event).replace(0, epsilon)

        # Both shares are strictly positive here, so the log is always finite.
        woe = np.log(dist_event / dist_non_event)

        return float(((dist_event - dist_non_event) * woe).sum())

    iv_dict = {col: woe_iv_optimized_single_feature(df, col, target)
               for col in df.columns if col != target}

    return pd.Series(iv_dict, dtype=float).sort_values(ascending=False)

Selección de variables fuertes

In [None]:
def select_strong_predictors(iv_series, threshold=0.02):
    """
    Return the names of the features whose IV reaches the threshold.

    input:
    - iv_series: Series with IV per variable (indexed by variable name)
    - threshold: lowest IV that still counts as strong (inclusive)

    output:
    - list: index labels of the retained features, in the order of iv_series
    """
    is_strong = iv_series.ge(threshold)
    strong = iv_series.loc[is_strong]
    return strong.index.tolist()

## A. Calcular IV, excluir variables débiles, separar target y dividir en entrenamiento/prueba

**Cargar, limpiar y preparar datos**

In [10]:
# Read the raw file from DATA_URL, then clean it: rows containing '?' are
# dropped, 'class' is recoded to 0/1 and the 'id' column is removed.
df = get_data_from_url(DATA_URL)
df = clean_data(df)

In [11]:
# IV from the unsmoothed implementation, then feature selection with the
# default threshold of select_strong_predictors.
iv_series = calculate_iv(df, 'class')
print("Information Value por variable:\n", iv_series)
SELECTED_FEATURES = select_strong_predictors(iv_series)

Information Value por variable:
 clump_thickness                inf
uniformity_cell_size           inf
uniformity_cell_shape          inf
marginal_adhesion              inf
single_epithelial_cell_size    inf
bare_nuclei                    inf
bland_chromatin                inf
normal_nucleoli                inf
mitoses                        inf
dtype: float64


In [12]:
# Recompute IV with the epsilon-smoothed implementation; this rebinds
# iv_series and SELECTED_FEATURES.
iv_series = calculate_iv_optimized(df, 'class')
print("Information Value por variable:\n", iv_series)
SELECTED_FEATURES = select_strong_predictors(iv_series)

Information Value por variable:
 uniformity_cell_size           10.388630
uniformity_cell_shape           8.601120
clump_thickness                 6.532306
bland_chromatin                 6.286301
normal_nucleoli                 6.085052
bare_nuclei                     5.463221
marginal_adhesion               4.880258
single_epithelial_cell_size     4.235413
mitoses                         2.331167
dtype: float64


**DIFERENCIAS????**

Esto ocurre por lo siguiente:

Mi código original produce inf porque no maneja explícitamente el caso donde una categoría de una columna tiene **cero observaciones para una clase objetivo**, lo que lleva a un **log(0) o una división por cero en la fórmula del WOE**. En cambio, **el código optimizado sustituye por un pequeño valor (epsilon) las distribuciones que son cero antes de tomar el logaritmo, evitando el resultado infinito y proporcionando un valor numérico finito para el IV.** Este manejo de ceros es una **práctica estándar para hacer el cálculo de IV más robusto**.

Recordemos que el IV sirve para saber qué tan buena es una columna para predecir nuestro objetivo (si el cáncer es benigno o maligno). Un **IV alto significa que la columna es muy útil para predecir**; en cambio, un resultado infinito no permite comparar ni ordenar las variables, por lo que a primera vista no nos resulta útil. Con esto concluyo que el cálculo optimizado del IV es el más adecuado para esta situación.

**Separar entrenamiento y prueba**

In [13]:
# Feature matrix restricted to the IV-selected columns, plus the binary target.
X = df[SELECTED_FEATURES]
y = df['class']
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## B. Modelo de regresión logística + significancia + métricas

### Significancia estadística con statsmodels

In [78]:
# Cast the predictors to float and add an explicit constant (intercept) column.
X_train_sm = sm.add_constant(X_train.astype(float))
# Fit the logit model on the training split and print its summary table.
logit_model = sm.Logit(y_train, X_train_sm).fit()
print(logit_model.summary())

Optimization terminated successfully.
         Current function value: 0.073760
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                  class   No. Observations:                  512
Model:                          Logit   Df Residuals:                      502
Method:                           MLE   Df Model:                            9
Date:                Sat, 03 May 2025   Pseudo R-squ.:                  0.8842
Time:                        20:36:48   Log-Likelihood:                -37.765
converged:                       True   LL-Null:                       -326.13
Covariance Type:            nonrobust   LLR p-value:                2.070e-118
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                         -10.0640      1.408     -7.150      0.000   

### Modelo con sklearn

In [15]:
# sklearn logistic regression (solver capped at 200 iterations), fitted on the
# unscaled training features and used to predict the held-out rows.
lr = LogisticRegression(max_iter=200)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [16]:
# Overall accuracy plus the per-class classification report on the test split.
# NOTE: the value labelled "Precisión" comes from accuracy_score (accuracy).
print("\n--- Métricas Regresión Logística ---")
print("Precisión:", metrics.accuracy_score(y_test, y_pred_lr))
print(metrics.classification_report(y_test, y_pred_lr))


--- Métricas Regresión Logística ---
Precisión: 0.9532163742690059
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       103
           1       0.98      0.90      0.94        68

    accuracy                           0.95       171
   macro avg       0.96      0.94      0.95       171
weighted avg       0.95      0.95      0.95       171



## C. Modelo SVM y comparación

In [80]:
from sklearn.preprocessing import StandardScaler

In [81]:
# Fit the scaler on the training split only, then apply the same fitted
# transformation to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [82]:
# SVC with its default hyperparameters, trained and evaluated on the scaled features.
svm = SVC()
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)

In [83]:
# Overall accuracy plus the per-class classification report for the SVM.
# NOTE: the value labelled "Precisión SVM" comes from accuracy_score (accuracy).
print("\n--- Métricas SVM ---")
print("Precisión SVM:", metrics.accuracy_score(y_test, y_pred_svm))
print("Reporte de clasificación SVM:\n", metrics.classification_report(y_test, y_pred_svm))


--- Métricas SVM ---
Precisión SVM: 0.9590643274853801
Reporte de clasificación SVM:
               precision    recall  f1-score   support

           0       0.95      0.98      0.97       103
           1       0.97      0.93      0.95        68

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171



## **Comparación**

In [21]:
# Test-set accuracy of the logistic regression, rounded to four decimals.
print("Comparación de modelos:")
print(f"Logistic Regression Accuracy: {metrics.accuracy_score(y_test, y_pred_lr):.4f}")

Comparación de modelos:
Logistic Regression Accuracy: 0.9532


In [22]:
# Test-set accuracy of the SVM, rounded to four decimals.
# NOTE(review): the saved output of this cell (0.9474) differs from the accuracy
# printed by the SVM metrics cell (0.9591); presumably stale output from an
# earlier run. Restart the kernel and run all cells to refresh it.
print(f"SVM Accuracy: {metrics.accuracy_score(y_test, y_pred_svm):.4f}")

SVM Accuracy: 0.9474


## Pytest ZONE

In [86]:
%%writefile main_module.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder

def clean_data(df):
    """
    Prepare the raw breast-cancer frame for modelling.

    Steps: treat '?' as missing and drop incomplete rows, cast 'bare_nuclei'
    to int, recode 'class' (4 -> 1, any other value -> 0) and remove 'id'.

    input:
    - df: Raw dataframe

    output:
    - df: Cleaned dataframe (a new object; the argument is left untouched)
    """
    complete_rows = df.replace('?', np.nan).dropna()
    # 'class' is a Python keyword, so the new columns are passed as a dict.
    recoded = complete_rows.assign(**{
        'bare_nuclei': lambda frame: frame['bare_nuclei'].astype(int),
        'class': lambda frame: frame['class'].apply(lambda label: 1 if label == 4 else 0),
    })
    return recoded.drop(columns=['id'])

def calculate_iv_optimized(df: pd.DataFrame, target: str) -> pd.Series:
    """
    Compute the Information Value (IV) of every feature against a binary target.

    For each feature, rows are counted per (feature value, target) pair and
    the share of events (target == 1) and non-events (target == 0) falling on
    each value is derived. Shares equal to zero are replaced by a small
    epsilon so that WOE = ln(Dist_Event / Dist_NonEvent) stays finite, and
    IV = sum((Dist_Event - Dist_NonEvent) * WOE).

    Input:
    - df: cleaned DataFrame holding the feature columns and the target column.
    - target: name of the binary (0/1) target column.

    Output:
    - pd.Series: IV per feature, sorted in descending order. Every feature
      scores 0.0 when the target contains only one of the two classes.
    """
    epsilon = 1e-6  # stand-in for empty cells so the logarithm is defined

    def woe_iv_optimized_single_feature(df: pd.DataFrame, feature: str, target: str) -> float:
        count_table = df.groupby([feature, target]).size().unstack(fill_value=0)

        # A class that never occurs has no column after unstack. Add it with
        # zero counts so the single-class guard below can detect the case
        # (filling with 1 would invent one event per feature value).
        for label in (0, 1):
            if label not in count_table.columns:
                count_table[label] = 0

        non_event = count_table[0]
        event = count_table[1]

        total_event = event.sum()
        total_non_event = non_event.sum()

        # IV is undefined without both classes; report "no information".
        if total_event == 0 or total_non_event == 0:
            return 0.0

        dist_event = (event / total_event).replace(0, epsilon)
        dist_non_event = (non_event / total_non_event).replace(0, epsilon)

        # Both shares are strictly positive here, so the log is always finite.
        woe = np.log(dist_event / dist_non_event)

        return float(((dist_event - dist_non_event) * woe).sum())

    iv_dict = {col: woe_iv_optimized_single_feature(df, col, target)
               for col in df.columns if col != target}

    return pd.Series(iv_dict, dtype=float).sort_values(ascending=False)

def select_strong_predictors(iv_series, threshold=0.02):
    """
    Return the names of the features whose IV reaches the threshold.

    input:
    - iv_series: Series with IV per variable (indexed by variable name)
    - threshold: lowest IV that still counts as strong (inclusive)

    output:
    - list: index labels of the retained features, in the order of iv_series
    """
    is_strong = iv_series.ge(threshold)
    strong = iv_series.loc[is_strong]
    return strong.index.tolist()


Overwriting main_module.py


**Validando...**

In [87]:
%%writefile test_data.py

import pytest
import pandas as pd
from main_module import clean_data, calculate_iv_optimized, select_strong_predictors

@pytest.fixture
def sample_df():
    """
    Provide a small raw-style dataframe for the cleaning and IV tests.

    'class' uses the raw 2/4 coding because clean_data recodes 4 -> 1 and any
    other value -> 0; feeding it 0/1 labels would collapse every row to class 0
    and leave the IV tests running on a single-class target. The 'id' column is
    included so that its removal by clean_data can be checked.
    """
    data = {
        'clump_thickness': [1, 2, 3, 4],
        'uniformity_cell_size': [1, 2, 1, 2],
        'bare_nuclei': [1, 2, 3, 4],
        'class': [2, 4, 2, 4],
        'id': [123, 456, 789, 101]
    }
    return pd.DataFrame(data)

def test_clean_data(sample_df):
    """
    Check that cleaning drops 'id', leaves 'bare_nuclei' as int and
    produces only 0/1 labels in 'class'.
    """
    result = clean_data(sample_df)
    remaining_columns = result.columns
    assert 'id' not in remaining_columns
    assert result['bare_nuclei'].dtype == int
    labels = set(result['class'].unique())
    assert labels.issubset({0, 1})

def test_calculate_iv_optimized(sample_df):
    """
    Check that every IV computed on the cleaned sample is non-negative.
    """
    cleaned = clean_data(sample_df)
    iv_values = calculate_iv_optimized(cleaned, 'class')
    assert all(value >= 0 for value in iv_values)

def test_select_strong_predictors(sample_df):
    """
    Check that at least one variable is selected when the IV threshold is 0.0.
    """
    cleaned = clean_data(sample_df)
    iv_values = calculate_iv_optimized(cleaned, 'class')
    chosen = select_strong_predictors(iv_values, threshold=0.0)
    assert len(chosen) > 0

Overwriting test_data.py


In [88]:
# Run the test module written above with pytest in verbose mode.
!pytest test_data.py -v


platform win32 -- Python 3.12.5, pytest-8.3.5, pluggy-1.5.0 -- C:\Users\AzShet\AppData\Local\Programs\Python\Python312\python.exe
cachedir: .pytest_cache
rootdir: c:\Users\AzShet\Documents\Jupyter_LAB\jupyter_projects\5to_ciclo\DataMining\lab7
plugins: anyio-4.3.0, dash-3.0.1
[1mcollecting ... [0mcollected 3 items

test_data.py::test_clean_data [32mPASSED[0m[32m                                     [ 33%][0m
test_data.py::test_calculate_iv_optimized [32mPASSED[0m[32m                         [ 66%][0m
test_data.py::test_select_strong_predictors [32mPASSED[0m[32m                       [100%][0m

