**Librerías**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import numpy as np

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, confusion_matrix, classification_report
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

# **Obtención de datos**

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#Los datos se importan desde una carpeta de google drive
data = pd.read_csv('/content/drive/MyDrive/Curso Data Science/Desafío 13/Set de datos/data.csv')

# **Limpieza y preparación de datos**

In [None]:
#Visualización de los datos tal como fueron descargados de la fuente
data.head()

In [6]:
#Preparación de los datos para su posterior análisis
data = data.drop(columns = ["id"]) #no es una columna que sirva para el análisis
data.rename(columns = {"diagnosis": "diagnóstico","radius_mean":"radio",	"texture_mean":"textura", "perimeter_mean":"perímetro",	"area_mean":"área",
                       "smoothness_mean":"suavidad", "compactness_mean":"compactibilidad",	"concavity_mean":"concavidad", "concave points_mean":"puntos_cóncavos",
                       "symmetry_mean":"simetría","fractal_dimension_mean":"dimensión_fractal"}, inplace = True)

In [7]:
data.diagnóstico.unique()

array(['M', 'B'], dtype=object)

In [8]:
#Mapeo de la variable diagnóstico para que sea cuantitativa y poder aplicar los modelos de machine learning
data["diagnóstico"] = data["diagnóstico"].map({'M':1,'B':0})

In [None]:
data.info()

**Datos ausentes**

In [None]:
data.isnull().sum()

In [None]:
len(data)

In [12]:
data.describe()

Unnamed: 0,diagnóstico,radio,textura,perímetro,área,suavidad,compactibilidad,concavidad,puntos_cóncavos,simetría,dimensión_fractal
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,0.372583,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798
std,0.483918,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706
min,0.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996
25%,0.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577
50%,0.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154
75%,1.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612
max,1.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744


**Outliers**

In [None]:
fig, ax = plt.subplots(9,2, figsize = (10,35))

#RADIO
ax[0,0].hist(data["radio"], bins = 100, color = "#F0534D")
ax[0,0].set(title = "Histograma",
       xlabel = "radio [mm]",
       ylabel = "Frecuencia")
x = data["radio"]
ax[0,1].boxplot(x.values, vert = False)
ax[0,1].set(title = "Boxplot",
       xlabel = "radio [mm]")

#TEXTURA
ax[1,0].hist(data["textura"], bins = 100, color = "#E455D0")
ax[0,0].set(xlabel = "textura", ylabel = "Frecuencia")
ax[1,1].boxplot(x.values, vert = False)
ax[1,1].set(xlabel = "textura")

#PERÍMETRO
ax[2,0].hist(data["perímetro"], bins = 100, color = "#FA508E")
ax[2,0].set(xlabel = "perímetro [mm]",
       ylabel = "frecuencia")
x = data["perímetro"]
x.dropna(inplace = True) #Se eliminan los registros vaciós
ax[2,1].boxplot(x.values, vert = False)
ax[2,1].set(xlabel = "perímetro [mm]")

#ÁREA
ax[3,0].hist(data["área"], bins = 100, color = "#D850FA")
ax[3,0].set(xlabel = "área [mm]",
       ylabel = "frecuencia")
x = data["área"]
ax[3,1].boxplot(x.values, vert = False)
ax[3,1].set(xlabel = "área [mm]")

#COMPACTIBILIDAD
ax[4,0].hist(data["compactibilidad"], bins = 100, color = "#A34DF0")
ax[4,0].set(xlabel = "compactibilidad",
       ylabel = "frecuencia")
x = data["compactibilidad"]
ax[4,1].boxplot(x.values, vert = False)
ax[4,1].set(xlabel = "compactibilidad")

#CONCAVIDAD
ax[5,0].hist(data["concavidad"], bins = 100, color = "#EF81C0")
ax[5,0].set(xlabel = "concavidad [mm]",
       ylabel = "frecuencia")
x = data["concavidad"]
ax[5,1].boxplot(x.values, vert = False)
ax[5,1].set(xlabel = "concavidad [mm]")

#PUNTOS_CÓNCAVOS
ax[6,0].hist(data["puntos_cóncavos"], bins = 100, color = "#E450FA")
ax[6,0].set(xlabel = "puntos_cóncavos [mm]",
       ylabel = "frecuencia")
#Análisis de la variable duración
x = data["puntos_cóncavos"]
ax[6,1].boxplot(x.values, vert = False)
ax[6,1].set(xlabel = "puntos_cóncavos")

#SIMETRÍA
ax[7,0].hist(data["simetría"], bins = 100, color = "#FA2ADD")
ax[7,0].set(xlabel = "simetría",
       ylabel = "frecuencia")
x = data["simetría"]
ax[7,1].boxplot(x.values, vert = False)
ax[7,1].set(title = "Boxplot de la variable 'simetría'",
       xlabel = "simetría [mm]")

#DIMENSIÓN FRACTAL
ax[8,0].hist(data["dimensión_fractal"], bins = 100, color = "#E354D0")
ax[8,0].set(xlabel = "dimensión fractal",
       ylabel = "frecuencia")
x = data["dimensión_fractal"]
ax[8,1].boxplot(x.values, vert = False)
ax[8,1].set(xlabel = "simetría [mm]")

plt.show()

# **EDA**

**Encoding**

In [26]:
features_mean=list(data.columns[1:11])
#Se separan los datos en función de su diagnóstico
dfM=data[data["diagnóstico"] == 1]
dfB=data[data["diagnóstico"] == 0]

In [None]:
#Stack the data
plt.rcParams.update({"font.size": 8})
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(8,10))
axes = axes.ravel()
for idx,ax in enumerate(axes):
    ax.figure
    binwidth= (max(data[features_mean[idx]]) - min(data[features_mean[idx]]))/50
    ax.hist([dfM[features_mean[idx]],dfB[features_mean[idx]]], bins=np.arange(min(data[features_mean[idx]]), max(data[features_mean[idx]]) + binwidth, binwidth) , alpha=0.8,stacked=True, label=["M","B"],color=["#943424","#239464"])
    ax.legend(loc="upper right")
    ax.set_title(features_mean[idx])
plt.tight_layout()
plt.show()

Creación del modelo

In [32]:
traindf, testdf = train_test_split(data, test_size = 0.3)

In [30]:
#Función genérica para el modelo de clasificación y para evaluar su performance
def classification_model(model, data, predictors, outcome):
  #Modelo:
  model.fit(data[predictors],data[outcome])

  #predicciones en training set:
  predictions = model.predict(data[predictors])

  #Accuracy
  accuracy = metrics.accuracy_score(predictions,data[outcome])
  print("Accuracy : %s" % "{0:.3%}".format(accuracy))

  #K-fold validación cruzada
  X = np.array(data)
  kf = KFold(n_splits=5, shuffle = True)

  error = []

  for train, test in kf.split(X):
    #Filtro train
    train_predictors = (data[predictors].iloc[train,:])

    #Objetivo
    train_target = data[outcome].iloc[train]

    #Entrenamiento
    model.fit(train_predictors, train_target)

    #Errores
    error.append(model.score(data[predictors].iloc[test,:], data[outcome].iloc[test]))

    print("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error)))

  #Ajuste del modelo
  model.fit(data[predictors],data[outcome])

In [None]:
predictor_var = ["radio","perímetro","área","puntos_cóncavos","concavidad"]
outcome_var="diagnóstico"
model=LogisticRegression()
classification_model(model,traindf,predictor_var,outcome_var)

In [34]:
predictor_var = ["radio"]
model=LogisticRegression()
classification_model(model,traindf,predictor_var,outcome_var)

Accuracy : 88.191%
Cross-Validation Score : 88.750%
Cross-Validation Score : 87.500%
Cross-Validation Score : 88.333%
Cross-Validation Score : 87.136%
Cross-Validation Score : 88.190%


In [None]:
predictor_var = ['radius_mean','perimeter_mean','area_mean','compactness_mean','concave points_mean']
model = DecisionTreeClassifier()
classification_model(model,traindf,predictor_var,outcome_var)

Accuracy : 100.000%
Cross-Validation Score : 90.000%
Cross-Validation Score : 87.500%
Cross-Validation Score : 86.250%
Cross-Validation Score : 88.422%
Cross-Validation Score : 89.978%


In [None]:
# Use all the features of the nucleus
predictor_var = features_mean
model = RandomForestClassifier(n_estimators=100,min_samples_split=25, max_depth=7, max_features=2)
classification_model(model, traindf,predictor_var,outcome_var)

Accuracy : 95.980%
Cross-Validation Score : 91.250%
Cross-Validation Score : 93.125%
Cross-Validation Score : 92.917%
Cross-Validation Score : 93.105%
Cross-Validation Score : 92.712%


In [None]:
#Create a series with feature importances:
featimp = pd.Series(model.feature_importances_, index=predictor_var).sort_values(ascending=False)
print(featimp)

concave points_mean       0.265121
perimeter_mean            0.230853
radius_mean               0.140601
concavity_mean            0.128710
area_mean                 0.107303
compactness_mean          0.067544
texture_mean              0.031740
smoothness_mean           0.014222
symmetry_mean             0.007971
fractal_dimension_mean    0.005935
dtype: float64


In [None]:
# Using top 5 features
predictor_var = ['concave points_mean','area_mean','radius_mean','perimeter_mean','concavity_mean',]
model = RandomForestClassifier(n_estimators=100, min_samples_split=25, max_depth=7, max_features=2)
classification_model(model,traindf,predictor_var,outcome_var)

Accuracy : 94.975%
Cross-Validation Score : 92.500%
Cross-Validation Score : 91.875%
Cross-Validation Score : 92.500%
Cross-Validation Score : 92.793%
Cross-Validation Score : 92.715%


In [None]:
predictor_var =  ['radius_mean']
model = RandomForestClassifier(n_estimators=100)
classification_model(model, traindf,predictor_var,outcome_var)

Accuracy : 97.739%
Cross-Validation Score : 85.000%
Cross-Validation Score : 81.250%
Cross-Validation Score : 80.833%
Cross-Validation Score : 81.511%
Cross-Validation Score : 80.399%


In [None]:
# Use all the features of the nucleus
predictor_var = features_mean
model = RandomForestClassifier(n_estimators=100,min_samples_split=25, max_depth=7, max_features=2)
classification_model(model, testdf,predictor_var,outcome_var)