# Importación de Librerías

In [None]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Carga de Datos

In [None]:
# Read the Wisconsin breast-cancer CSV from the project's data folder.
df_data = pd.read_csv(filepath_or_buffer="data/breast-cancer-wisconsin.data")
# Trailing bare expression so the notebook renders the frame as the cell output.
df_data

# Observación y Preprocesamiento

In [None]:
# Print each column's dtype and non-null count to spot columns that need cleaning.
df_data.info()

In [None]:
# Flag every row whose ID_Number occurs more than once; keep=False marks all
# occurrences (not only the repeats) so the full groups can be inspected.
duplicates = df_data.duplicated(subset="ID_Number", keep=False)
print('duplicados', df_data.loc[duplicates])

# Retain only the first row seen for each ID_Number.
df_data = df_data.drop_duplicates(subset=["ID_Number"], keep="first")
df_data

In [None]:
# Names of the columns that contain at least one null value.
null_columns = df_data.columns[df_data.isna().any()]
# Number of nulls in each of those columns.
print(df_data.loc[:, null_columns].isna().sum())


In [None]:
# Index labels of the rows whose Bare_Nuclei value is the literal string "?"
# (presumably the dataset's missing-value marker -- confirm against the data
# description).
Index_BareNuclei = df_data.loc[df_data["Bare_Nuclei"] == "?"].index
# Remove those rows from the frame in place.
df_data.drop(index=Index_BareNuclei, inplace=True)
df_data



In [None]:
# Pairwise correlation heatmap of the dataset columns.
#
# Bare_Nuclei is only cast to an integer dtype in a later cell, so at this point
# it still holds strings. Calling DataFrame.corr() on a frame with a string
# column raises on pandas >= 2.0 and silently omits the column on older
# versions. Coercing every column to numeric first keeps Bare_Nuclei in the
# matrix; values that cannot be parsed become NaN and are ignored pairwise.
numeric_data = df_data.apply(pd.to_numeric, errors="coerce")
corr = numeric_data.corr()
sn.heatmap(corr, annot=True)
plt.show()

In [None]:
# Cast Bare_Nuclei to 64-bit integers now that the "?" rows have been removed,
# then print the schema again to confirm the new dtype.
df_data["Bare_Nuclei"] = df_data["Bare_Nuclei"].astype("int64")
df_data.info()

# MODELOS

### K-VECINOS (KNN):

In [None]:
# Sweep the number of neighbours for a k-NN classifier and report the training
# and test error for each value.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Predictor columns used by the model.
X = df_data[['Clump_Thickness', 'Uniformity_Cell_Size', 'Uniformity_Cell_Shape','Marginal_Adhesion','Single_Epithelial_Cell_Size','Bare_Nuclei','Bland_Chromatin']]
# Select the target as a 1-D Series. A single-column DataFrame (df[['Class']])
# makes scikit-learn emit a DataConversionWarning on every fit() call.
y = df_data['Class']

# 70/30 split, stratified on the class label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=33)

# NOTE(review): the test split is used here to compare values of k; consider
# cross-validation on the training split so the test error stays unbiased.
for p in range(1, 30):
    model = KNeighborsClassifier(n_neighbors=p)
    model.fit(X_train, y_train)

    # score() returns accuracy, so 1 - score is the misclassification rate.
    print('p = ',p)
    print(f"Error training: {(1 - model.score(X_train, y_train)) * 100} %")
    print(f"Error test: {(1 - model.score(X_test, y_test)) * 100} %")
    print("")


In [None]:
# Fit the k-NN model with the chosen number of neighbours (6) and predict the
# held-out split.
model = KNeighborsClassifier(n_neighbors=6)
# np.ravel flattens the target to 1-D; fitting on a single-column DataFrame
# makes scikit-learn emit a DataConversionWarning. It is a no-op for a target
# that is already 1-D.
model.fit(X_train, np.ravel(y_train))
y_predict = model.predict(X_test)

In [None]:
def plot_confusion_matrix(X, y, model):
    """Display the confusion matrix of ``model``'s predictions on ``X``.

    Args:
        X: Feature data passed to ``model.predict``.
        y: True labels compared against the predictions.
        model: Fitted classifier exposing ``predict`` and ``classes_``.

    Returns:
        None. The matrix is drawn and shown with matplotlib.
    """
    predicted = model.predict(X)
    # Use the classifier's own class order for both the matrix rows/columns
    # and the axis tick labels.
    class_labels = model.classes_
    matrix = confusion_matrix(y, predicted, labels=class_labels)
    ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=class_labels,
    ).plot()
    plt.show()


In [None]:
# Show the confusion matrix of the fitted model on each split, training first.
for heading, features, target in (
    ("Training confusion matrix", X_train, y_train),
    ("Test confusion matrix", X_test, y_test),
):
    print(heading)
    plot_confusion_matrix(features, target, model)

### ÁRBOL DE DECISIÓN:

In [None]:
# Sweep the maximum depth of a decision tree and report the training and test
# error for each depth.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Predictor columns and target column.
feature_columns = [
    'Clump_Thickness',
    'Uniformity_Cell_Size',
    'Uniformity_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
]
X = df_data[feature_columns]
y = df_data[['Class']]

# 70/30 split, stratified on the class label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=33
)

for p in range(1, 30):
    # fit() returns the estimator itself, so construction and fitting chain.
    model = DecisionTreeClassifier(
        criterion="gini", max_depth=p, random_state=33
    ).fit(X_train, y_train)

    print('p = ',p)
    # score() returns accuracy, so 1 - score is the misclassification rate.
    train_error = (1 - model.score(X_train, y_train)) * 100
    print(f"Error training: {train_error} %")
    test_error = (1 - model.score(X_test, y_test)) * 100
    print(f"Error test: {test_error} %")
    print("")

In [None]:
# Fit the decision tree at the chosen depth (5) and predict the held-out split.
# random_state=33 matches the seed used in the depth sweep, so this is the same
# tree that was evaluated there; without a seed, ties between equally good
# splits may be broken differently on each run.
model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=33)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [None]:
# Show the confusion matrix of the fitted model on each split, training first.
for heading, features, target in (
    ("Training confusion matrix", X_train, y_train),
    ("Test confusion matrix", X_test, y_test),
):
    print(heading)
    plot_confusion_matrix(features, target, model)

### RANDOM FOREST

In [None]:
# Sweep the maximum tree depth of a random forest and report the training and
# test error for each depth.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Predictor columns used by the model.
X = df_data[['Clump_Thickness', 'Uniformity_Cell_Size', 'Uniformity_Cell_Shape','Marginal_Adhesion','Single_Epithelial_Cell_Size','Bare_Nuclei','Bland_Chromatin']]
# Select the target as a 1-D Series. A single-column DataFrame (df[['Class']])
# makes RandomForestClassifier.fit emit a DataConversionWarning on every call.
y = df_data['Class']

# 70/30 split, stratified on the class label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=33)

for p in range(1, 30):
    # Seed the forest (same seed as the split) so errors are reproducible and
    # differences between depths are not just bootstrap noise.
    model = RandomForestClassifier(max_depth=p, random_state=33)
    model.fit(X_train, y_train)

    # score() returns accuracy, so 1 - score is the misclassification rate.
    print('p = ',p)
    print(f"Error training: {(1 - model.score(X_train, y_train)) * 100} %")
    print(f"Error test: {(1 - model.score(X_test, y_test)) * 100} %")
    print("")

In [None]:
# Fit the random forest at the chosen depth (6) and predict the held-out split.
# random_state=33 (the same seed as the train/test split) makes the forest, and
# therefore the confusion matrices derived from it, reproducible across runs.
model = RandomForestClassifier(max_depth=6, random_state=33)
# np.ravel flattens the target to 1-D; fitting on a single-column DataFrame
# makes scikit-learn emit a DataConversionWarning. It is a no-op for a target
# that is already 1-D.
model.fit(X_train, np.ravel(y_train))
y_predict = model.predict(X_test)

In [None]:
# Show the confusion matrix of the fitted model on each split, training first.
for heading, features, target in (
    ("Training confusion matrix", X_train, y_train),
    ("Test confusion matrix", X_test, y_test),
):
    print(heading)
    plot_confusion_matrix(features, target, model)

# **CONCLUSIÓN**

Se aplicaron 3 modelos distintos al dataset. En las matrices de confusión se observa que el modelo con la menor cantidad de falsos negativos es Random Forest con profundidad 6, que en los datos de prueba arroja 2 falsos negativos.