In [13]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


# 1. Load the breast-cancer dataset bundled with scikit-learn.
data = load_breast_cancer()
# 2. Preprocessing: wrap the measurements in a DataFrame and keep six columns.
df = pd.DataFrame(data.data, columns=data.feature_names)
features = ['mean radius', 'mean texture', 'mean perimeter', 
            'mean area', 'mean smoothness', 'mean compactness']
X = df[features]
y = data.target                 # 1: benign, 0: malignant
# 3. Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
# 4. Standardise: (x - mean) / standard deviation. Both statistics come from
#    the training split only and are reused for the test split.
u = X_train.mean()
s = X_train.std()
X_train_norm = (X_train - u) / s
X_test_norm = (X_test - u) / s 
def calcular_vif(df_input):                                              # 5. VIF analysis
    """Return the variance inflation factor of every column of ``df_input``.

    Each column is regressed (OLS with an added constant) on all the other
    columns; its VIF is ``1 / (1 - R^2)`` of that auxiliary regression.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Predictor matrix, one column per predictor.

    Returns
    -------
    pandas.Series
        VIF values indexed by column name, in column order.
    """
    matriz = df_input.values

    def _vif_de_columna(pos):
        # Auxiliary regression: column ``pos`` explained by the remaining ones.
        regresores = sm.add_constant(np.delete(matriz, pos, axis=1))
        ajuste = sm.OLS(matriz[:, pos], regresores).fit()
        return 1 / (1 - ajuste.rsquared)

    return pd.Series({nombre: _vif_de_columna(pos)
                      for pos, nombre in enumerate(df_input.columns)})
print("Análisis VIF previo:")
print(calcular_vif(X_train_norm))    
# 6. Build the model: add the intercept column and set up the logistic regression.
X_train_sm = sm.add_constant(X_train_norm)
model = sm.Logit(y_train, X_train_sm)                                    # coefficients theta
result = model.fit()                                                     # Newton-Raphson fit
# 7. Statistical summary of the fit (coefficients, standard errors, p-values).
print(result.summary())
# 8. Evaluate on the standardised test split.
X_test_sm = sm.add_constant(X_test_norm)
probs = result.predict(X_test_sm)
y_pred = (probs >= 0.5).astype(int)                                      # decision threshold is 0.5
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred))                             # classification report
# 9. Coefficient interpretation: exponentiating each coefficient gives its odds ratio.
odds_ratios = np.exp(result.params)
print("\nOdds Ratios:")
print(odds_ratios)
print(f"PREGUNTAS DE REFLEXION\n")
# - El p-valor (P>|z|) es la probabilidad de observar un estadístico z al menos tan extremo como el obtenido
#   si el coeficiente fuera realmente cero (es decir, si la variable no aportara al modelo).
# Si p-valor < 0.05: rechazamos la hipótesis de que la variable no aporta; es <estadísticamente significativa>.
# Si p-valor > 0.05: no tenemos pruebas suficientes para decir que esa variable ayuda al modelo, posiblemente porque
# su VIF es muy alto y otra variable correlacionada le está robando su importancia.
# De modo que, según la columna P>|z| del resumen, los predictores con p < 0.05 son MEAN TEXTURE y MEAN SMOOTHNESS,
# que son los más significativos; también mean area y mean radius en cierta medida.

# - El VIF indica que los predictores RADIO y PERÍMETRO están tan correlacionados entre sí que el modelo no puede
# distinguir el efecto individual de cada uno, lo cual es esperable pues el perímetro se calcula a partir del radio;
# por eso MEAN RADIUS tiene un VIF muy alto: variables como el área y el perímetro dependen de él.

# - El coeficiente (theta) es el cambio en el <logaritmo de la oportunidad> (log-odds = log(p/(1 - p))) cuando el
# predictor aumenta una unidad (una desviación estándar, porque los datos están normalizados).
# Si es positivo, aumenta la probabilidad de ser Benigno (clase 1). Si es negativo, aumenta la probabilidad de ser Maligno (clase 0).

Análisis VIF previo:
mean radius         1460.924793
mean texture           1.144166
mean perimeter      1737.092968
mean area             44.049377
mean smoothness        2.064559
mean compactness      11.866719
dtype: float64
Optimization terminated successfully.
         Current function value: 0.159583
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  455
Model:                          Logit   Df Residuals:                      448
Method:                           MLE   Df Model:                            6
Date:                Fri, 30 Jan 2026   Pseudo R-squ.:                  0.7581
Time:                        10:35:44   Log-Likelihood:                -72.610
converged:                       True   LL-Null:                       -300.17
Covariance Type:            nonrobust   LLR p-value:                 3.887e-95
                       coef    std er

In [3]:
import numpy as np 
# Integers 0..4 (start 0 and step 1 are np.arange's defaults).
a = np.arange(5)
print(a)

[0 1 2 3 4]


In [28]:
from sklearn.datasets import load_breast_cancer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import math
import random as rd


 
def separar_datos(X, y, p_, seed=None):
    """Randomly split the rows of ``X`` and ``y`` into train and test subsets.

    A partial Fisher-Yates shuffle draws ``int((1 - p_) * n_rows)`` distinct
    row indices for the training set; every remaining row goes to the test set.

    Parameters
    ----------
    X : array-like of shape (n_rows, n_features)
        Feature matrix.
    y : array-like of shape (n_rows,)
        Targets, aligned row by row with ``X``.
    p_ : float
        Fraction of rows reserved for the test set, between 0 and 1.
    seed : optional
        When given, a private ``random.Random(seed)`` drives the shuffle so the
        split is reproducible. When ``None`` the module-level generator is used,
        so seeding it with ``rd.seed(...)`` beforehand keeps working.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        Float arrays holding the selected rows.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, ``y`` is not 1-D with one entry per row of ``X``,
        or ``p_`` is outside ``[0, 1]``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_rows, n_features)")
    n_rows = X.shape[0]
    if y.ndim != 1 or y.shape[0] != n_rows:
        raise ValueError("y must be 1-D with one entry per row of X")
    if not 0 <= p_ <= 1:
        raise ValueError("p_ must be between 0 and 1")

    n_train = int((1 - p_) * n_rows)
    rng = rd if seed is None else rd.Random(seed)

    # Partial Fisher-Yates: each pass moves one uniformly chosen, not yet
    # selected index to the end of the still-unselected prefix of ``order``.
    order = np.arange(n_rows)
    for drawn in range(n_train):
        last = n_rows - 1 - drawn
        pick = math.floor((last + 1) * rng.uniform(0, 1))
        order[last], order[pick] = order[pick], order[last]

    n_test = n_rows - n_train
    train_idx = order[n_test:][::-1]   # in the order the indices were drawn
    test_idx = order[:n_test]          # whatever was never drawn

    return (X[train_idx].astype(float), X[test_idx].astype(float),
            y[train_idx].astype(float), y[test_idx].astype(float))
    
def sigmoide(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Only ``exp(-|z|)`` is computed, which always lies in ``(0, 1]``; the result
    is then assembled per sign of ``z``:

    * ``z >= 0``: ``1 / (1 + exp(-z))``
    * ``z <  0``: ``exp(z) / (1 + exp(z))``

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar for scalar input, otherwise an array shaped like ``z``.
    """
    z_arr = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z_arr))
    out = np.where(z_arr >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # np.where returns a 0-d array for scalar input; unwrap it to a scalar.
    return out[()] if out.ndim == 0 else out

def gradiente_logistica(X,y,theta):
    """Return ``X.T @ (sigmoide(X @ theta) - y)``.

    This is the un-normalised (not divided by the number of rows) gradient
    term of the logistic loss.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix; ``X @ theta`` must be defined.
    y : numpy.ndarray
        Targets, one per row of ``X``.
    theta : numpy.ndarray
        Current coefficient vector.
    """
    residuo = sigmoide(X @ theta) - y
    return X.T @ residuo

def descenso_gradiente_logistica_L2(X,y,alpha=0.01,N=1000,lam =1):
    """Fit logistic-regression coefficients by gradient descent with an L2 term.

    Starting from a zero vector, ``N`` steps of size ``alpha`` are taken along
    the negative of ``X.T @ (sigmoide(X @ theta) - y) / n``; every coefficient
    except the first additionally receives ``(lam / n) * theta[j]``, so the
    coefficient of column 0 is not penalised.

    Parameters
    ----------
    X : numpy.ndarray of shape (n, m)
        Design matrix.
    y : numpy.ndarray of shape (n,)
        Targets.
    alpha : float
        Step size.
    N : int
        Number of iterations.
    lam : float
        Penalty strength.

    Returns
    -------
    numpy.ndarray of shape (m,)
        Coefficients after ``N`` iterations.
    """
    filas, columnas = X.shape
    theta = np.zeros(columnas)
    paso = 0
    while paso < N:
        error = sigmoide(X @ theta) - y
        grad = (X.T @ error) / filas
        # Penalty applies to every coefficient except index 0.
        grad[1:] = grad[1:] + (lam / filas) * theta[1:]
        theta = theta - alpha * grad
        paso += 1
    return theta

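# Newton-Raphson (IRLS) update for logistic regression:
#   H = X^T W X,  W = diag(p_i (1 - p_i)),  p_i = sigma(x_i^T theta)
#   theta_new = theta + (X^T W X)^{-1} X^T (y - p)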

def evaluar_modelo(X,y_real,theta):
    """Score a fitted logistic model against known 0/1 labels.

    Rows whose predicted probability ``sigmoide(X @ theta)`` is strictly
    greater than 0.5 are classified as 1, all others as 0.

    Parameters
    ----------
    X : numpy.ndarray of shape (n, m)
        Design matrix, laid out like the one used to fit ``theta``.
    y_real : array-like of n labels
        True labels (0 or 1). Lists and column vectors are accepted; the
        labels are flattened to 1-D before comparison.
    theta : numpy.ndarray of shape (m,)
        Model coefficients.

    Returns
    -------
    dict
        ``"matriz"``: ``[[tn, fp], [fn, tp]]``, plus ``"accuracy"``,
        ``"precision"``, ``"recall"`` and ``"f1"``. A metric whose denominator
        is zero is reported as 0.
    """
    # Coerce to a flat array: comparing a plain list with ``== 1`` yields a
    # single False, and an (n, 1) column would broadcast against y_pred.
    y_real = np.asarray(y_real).ravel()
    probs = sigmoide(X @ theta)
    y_pred = (probs > 0.5).astype(int)

    real_pos, real_neg = (y_real == 1), (y_real == 0)
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)
    tp = np.count_nonzero(real_pos & pred_pos)  # true positives
    tn = np.count_nonzero(real_neg & pred_neg)  # true negatives
    fp = np.count_nonzero(real_neg & pred_pos)  # false positives
    fn = np.count_nonzero(real_pos & pred_neg)  # false negatives

    total = y_real.size
    accuracy = (tp + tn)/total if total > 0 else 0
    precision = tp/(tp + fp) if (tp + fp) > 0 else 0
    recall = tp/(tp + fn) if (tp + fn) > 0 else 0
    f1 = 2*(precision * recall)/(precision + recall) if (precision + recall) > 0 else 0
    return {"matriz":[[tn,fp],[fn,tp]],
            "accuracy":accuracy,
            "precision":precision,
            "recall":recall,
            "f1":f1}


def imprimir_metricas(metrics, nombres_clases=['-', '+']):
    """Print a confusion matrix and the summary metrics held in ``metrics``.

    Parameters
    ----------
    metrics : dict
        Must provide ``'matriz'`` (a 2x2 nested sequence) and the numeric
        entries ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``.
    nombres_clases : sequence of two labels
        Row labels for the printed matrix; the columns are always
        ``'Pred 0'`` and ``'Pred 1'``.

    Returns
    -------
    None
        Output goes to stdout only.
    """
    tabla = pd.DataFrame(metrics['matriz'],
                         index=nombres_clases,
                         columns=['Pred 0', 'Pred 1'])
    print("Matriz de confusión:", tabla, sep="\n")
    print()
    print("Métricas del modelo:")
    # Each metric on its own line, four decimals.
    for clave in ('accuracy', 'precision', 'recall', 'f1'):
        print("{}: {:.4f}".format(clave, metrics[clave]))


def vif_(X_):
    """Variance inflation factor of every column of ``X_`` using only numpy.

    Each column is regressed by least squares on an intercept plus all the
    other columns. Its VIF is ``1 / (1 - R^2)``, computed here in the
    equivalent form ``SST / SSR`` to avoid the cancellation in ``1 - R^2``
    when ``R^2`` is close to 1.

    Parameters
    ----------
    X_ : pandas.DataFrame
        Predictor matrix, one column per predictor.

    Returns
    -------
    pandas.Series
        VIF per column, indexed by column name. A column that the others
        reproduce to within floating-point precision gets ``inf``; a constant
        column (zero total sum of squares, ``R^2`` undefined) gets ``nan``.
    """
    cols = X_.columns
    datos = X_.to_numpy(dtype=float)
    n_filas = datos.shape[0]
    eps = np.finfo(float).eps
    vif = {}
    for i, col in enumerate(cols):
        objetivo = datos[:, i]
        diseno = np.column_stack([np.ones(n_filas), np.delete(datos, i, axis=1)])
        # lstsq works on the design matrix itself; forming X^T X first would
        # square its condition number, which hurts when columns are collinear.
        coef = np.linalg.lstsq(diseno, objetivo, rcond=None)[0]
        residuo = objetivo - diseno @ coef
        ssr = float(residuo @ residuo)
        centrado = objetivo - objetivo.mean()
        sst = float(centrado @ centrado)
        if sst == 0.0:
            vif[col] = np.nan
        elif ssr <= eps * sst:
            # R^2 equals 1 to working precision: perfect collinearity.
            vif[col] = np.inf
        else:
            vif[col] = sst / ssr
    return pd.Series(vif)

# Driver: load the data, split and standardise it, report VIFs and fit the
# regularised logistic model with the hand-written gradient descent.
if __name__=='__main__':
    datos = load_breast_cancer()
    df = pd.DataFrame(datos.data,columns=datos.feature_names)
    features = ['mean radius', 'mean texture', 'mean perimeter', 
            'mean area', 'mean smoothness', 'mean compactness']
    # NOTE(review): both prints below dump the complete arrays to the output.
    X = df[features].values ; print(f"X\n{X}")
    y = datos.target ; print(f"y\n{y}")
    # 3. Split into X_train, X_test, y_train, y_test with the hand-written
    #    splitter (0.2 = test fraction); it stands in for
    #    train_test_split(X, y, test_size=0.20, random_state=42).
    # NOTE(review): no seed is set here, so the split differs between runs.
    X_train, X_test, y_train, y_test = separar_datos(X,y,0.2)
    # 4. Standardise column-wise, (x - mean) / std, using training statistics
    #    for both splits; u holds the column means, s the column deviations.
    u = X_train.mean(axis=0)
    s = X_train.std(axis=0)
    X_train_norm = (X_train - u) / s
    # NOTE(review): X_test_norm, X_test and y_test are not used again in this cell.
    X_test_norm = (X_test - u) / s 
    print(vif_(pd.DataFrame(X_train_norm,columns= features)))
    # Prepend a column of ones so theta[0] acts as the intercept.
    m ,n = X_train_norm.shape
    X1 = np.ones((m,n+1))
    X1[:,1:] = X_train_norm
    # Fit with the default step size, iteration count and penalty.
    theta = descenso_gradiente_logistica_L2(X1,y_train)
    print(f"theta:\n{theta}")

X
[[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01]
 [2.057e+01 1.777e+01 1.329e+02 1.326e+03 8.474e-02 7.864e-02]
 [1.969e+01 2.125e+01 1.300e+02 1.203e+03 1.096e-01 1.599e-01]
 ...
 [1.660e+01 2.808e+01 1.083e+02 8.581e+02 8.455e-02 1.023e-01]
 [2.060e+01 2.933e+01 1.401e+02 1.265e+03 1.178e-01 2.770e-01]
 [7.760e+00 2.454e+01 4.792e+01 1.810e+02 5.263e-02 4.362e-02]]
y
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1