In [91]:
import io
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from urllib.request import urlopen
from urllib.error import URLError

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score, roc_curve,
                             precision_recall_fscore_support)

import statsmodels.api as sm

pd.set_option('display.max_columns', 100)
np.set_printoptions(suppress=True)

## EXPLORACIÓN DE LOS DATOS



In [66]:
#https://www.kaggle.com/datasets/erdemtaha/cancer-data
df = pd.read_csv('/content/Cancer_Data.csv')
r, c = df.shape
print(f'Filas:\t  {r}\nColumnas: {c}')

Filas:	  569
Columnas: 33


Dataset Information:

Target Variable (y):

Diagnosis (M = malignant, B = benign)

Ten features (X) are computed for each cell nucleus:

- radius (mean of distances from center to points on the perimeter)
- texture (standard deviation of gray-scale values)
- perimeter
- area
- smoothness (local variation in radius lengths)
- compactness (perimeter^2 / area - 1.0)
- concavity (severity of concave portions of the contour)
- concave points (number of concave portions of the contour)
- symmetry
- fractal dimension (coastline approximation - 1)

For each characteristic three measures are given:

a. Mean

b. Standard error

c. Largest/ Worst

In [82]:
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [83]:
data_dict = pd.DataFrame({
  'Columna': df.columns,
  'Tipo': df.dtypes.astype(str),
  'Valores_Nulos': df.isnull().sum(),
  'Valores_Únicos': df.nunique()
})
data_dict

Unnamed: 0,Columna,Tipo,Valores_Nulos,Valores_Únicos
diagnosis,diagnosis,object,0,2
radius_mean,radius_mean,float64,0,456
texture_mean,texture_mean,float64,0,479
perimeter_mean,perimeter_mean,float64,0,522
area_mean,area_mean,float64,0,539
smoothness_mean,smoothness_mean,float64,0,474
compactness_mean,compactness_mean,float64,0,537
concavity_mean,concavity_mean,float64,0,537
concave points_mean,concave points_mean,float64,0,542
symmetry_mean,symmetry_mean,float64,0,432


### - Limpieza de dataset

Eliminamos la columna Unnamed: 32, ya que no tiene datos en todos las filas. Asimismo, eliminamos duplicados.

In [71]:

df = df.drop(['Unnamed: 32','id'], axis=1)

In [84]:
# Eliminar duplicados
df = df.drop_duplicates()

## ¿Está balanciado nuestro dataset?

In [76]:
total = len(df)
is_balanced = df['diagnosis'].value_counts()
is_balanced = is_balanced.to_frame()
is_balanced['percentage'] = round((is_balanced['count'] / total) * 100, 2)
is_balanced

Unnamed: 0_level_0,count,percentage
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
B,357,62.74
M,212,37.26


In [77]:
y = df['diagnosis'].map({'B': 0, 'M': 1})
X = df.drop(['diagnosis'], axis=1)
print('Distribución de y:', np.bincount(y))
X.describe().T


Distribución de y: [357 212]


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
radius_mean,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.11
texture_mean,569.0,19.289649,4.301036,9.71,16.17,18.84,21.8,39.28
perimeter_mean,569.0,91.969033,24.298981,43.79,75.17,86.24,104.1,188.5
area_mean,569.0,654.889104,351.914129,143.5,420.3,551.1,782.7,2501.0
smoothness_mean,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
compactness_mean,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
concavity_mean,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
concave points_mean,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
symmetry_mean,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304
fractal_dimension_mean,569.0,0.062798,0.00706,0.04996,0.0577,0.06154,0.06612,0.09744



## Features Selection

In [78]:
# Ranking univariado por |correlación| con y
corrs = X.apply(lambda col: np.corrcoef(col, y)[0,1])
rank_univar = corrs.abs().sort_values(ascending=False)
rank_univar

Unnamed: 0,0
concave points_worst,0.793566
perimeter_worst,0.782914
concave points_mean,0.776614
radius_worst,0.776454
perimeter_mean,0.742636
area_worst,0.733825
radius_mean,0.730029
area_mean,0.708984
concavity_mean,0.69636
concavity_worst,0.65961


## Creación de nuevos Feature

In [89]:
X_eng = X.copy()
X_eng['compactness_ratio'] = (X['compactness_mean'] / (X['concavity_mean'] + 1e-6))
X_eng['area_perimeter_ratio'] = X['area_mean'] / (X['perimeter_mean'] + 1e-6)

X_eng['large_radius'] = (X['radius_mean']>14).astype(int)
X_eng['texture_avg'] = (X['texture_mean']+ X['texture_mean']) / 2

X_eng.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
radius_mean,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.11
texture_mean,569.0,19.289649,4.301036,9.71,16.17,18.84,21.8,39.28
perimeter_mean,569.0,91.969033,24.298981,43.79,75.17,86.24,104.1,188.5
area_mean,569.0,654.889104,351.914129,143.5,420.3,551.1,782.7,2501.0
smoothness_mean,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
compactness_mean,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
concavity_mean,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
concave points_mean,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
symmetry_mean,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304
fractal_dimension_mean,569.0,0.062798,0.00706,0.04996,0.0577,0.06154,0.06612,0.09744


## Estructurando Dataset

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X_eng, y, test_size=0.25, random_state=42, stratify=y)

base_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=500, solver='lbfgs'))
])

base_pipe.fit(X_train, y_train)
y_pred = base_pipe.predict(X_test)
y_proba = base_pipe.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97        90
           1       0.98      0.91      0.94        53

    accuracy                           0.96       143
   macro avg       0.96      0.95      0.95       143
weighted avg       0.96      0.96      0.96       143

Confusion matrix:
 [[89  1]
 [ 5 48]]
AUC-ROC: 0.9966457023060797


##

In [99]:
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_sm = sm.add_constant(X_train_s)
logit = sm.Logit(y_train, X_sm)
res = logit.fit(disp=False)
print(res.summary())

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


LinAlgError: Singular matrix