# Clasificación

Este notebook es para ejemplificar la teoría de clasificación.

Vamos a utilizar el dataset `Diabetes`, disponible en este [link](https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv). De todas formas, en la celda de abajo se descarga el CSV directamente desde dicho link 👇

## 1. Cargamos el dataset

In [None]:
!wget https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv

In [None]:
import pandas as pd

# Load the CSV fetched by the download cell; the last expression previews the first 3 rows.
csv_path = 'diabetes.csv'
data = pd.read_csv(csv_path)
data.head(n=3)

## 2. Analizamos el dataset

In [None]:
# Print the DataFrame summary: column names, non-null counts and dtypes.
data.info()

In [None]:
import matplotlib.pyplot as plt

# Scatter of the Pregnancies column against the Outcome column, coloured by Outcome.
# Axis labels and a title are set so the figure can be read on its own.
plt.scatter(data['Pregnancies'], data['Outcome'], c=data['Outcome'])
plt.xlabel('Pregnancies')
plt.ylabel('Outcome')
plt.title('Pregnancies vs Outcome');

In [None]:
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn from here on.
sns.set()
# Pairwise plot of every column coloured by Outcome. It is left disabled here;
# the reason is not stated (possibly rendering time) -- uncomment to draw it.
# sns.pairplot(data,hue='Outcome');

In [None]:
# Target vector first, then the feature matrix made of every remaining column.
y = data['Outcome']
X = data.drop(columns=['Outcome'])

In [None]:
# Distribution of the target values, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 7))
ax.hist(y, color='green')
ax.set_xlabel("No Diabetes/Si Diabetes")
ax.set_title("Histograma de valores de y");

## 3. Separamos los datos

In [None]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split: stratify=y keeps the class proportions of y in both
# partitions; 25% of the rows go to the test set and the seed is fixed.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Report the resulting shapes of both partitions.
print(f"Set de entrenamiento {Xtrain.shape}, {ytrain.shape}")
print(f"Set de testeo {Xtest.shape}, {ytest.shape}")

## 4. Entrenamos el modelo de Regresión Logística

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [None]:
# Standardise the features, then fit a logistic regression on the training split.
# fit() returns the fitted pipeline itself, so construction and fitting can be chained.
model_logreg = make_pipeline(StandardScaler(), LogisticRegression()).fit(Xtrain, ytrain)
# Last expression: display the fitted pipeline.
model_logreg

## 5. Evaluamos el modelo de Regresión Logística

In [None]:
# Class-membership probabilities for the test set; columns follow model_logreg.classes_.
# Only the first five rows are shown so the cell output stays short.
model_logreg.predict_proba(Xtest)[:5]

In [None]:
# For a classifier pipeline, score() returns mean accuracy (fraction of correctly
# classified samples), not the R2 of a regressor, so the output is labelled accordingly.
print(f"Accuracy en entrenamiento: {model_logreg.score(Xtrain,ytrain)}")
print(f"Accuracy en testeo: {model_logreg.score(Xtest,ytest)}")

In [None]:
from sklearn import metrics as ms

In [None]:
# Hard class predictions on the test set and the resulting confusion matrix
# (rows: true labels, columns: predicted labels).
y_pred_logreg = model_logreg.predict(Xtest)
confusion_matrix = ms.confusion_matrix(y_true=ytest, y_pred=y_pred_logreg)
# fmt='d' prints the integer counts in full; the default '.2g' annotation format
# would render counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(confusion_matrix, annot=True, fmt='d', cbar=False)
plt.xlabel("Predicted labels")
plt.ylabel("Ground truth labels")
plt.title("Confusion Matrix");

In [None]:
# Usual test-set scores for the logistic-regression predictions:
# accuracy, precision, recall and F1.
acc_logreg = ms.accuracy_score(ytest, y_pred_logreg)
precision_logreg = ms.precision_score(ytest, y_pred_logreg)
recall_logreg = ms.recall_score(ytest, y_pred_logreg)
f1_logreg = ms.f1_score(ytest, y_pred_logreg)

# Gather the scores in a dict keyed by metric name and display it.
logreg_metrics = dict(
    zip(
        ("Accuracy", "Precision", "Recall", "F1"),
        (acc_logreg, precision_logreg, recall_logreg, f1_logreg),
    )
)
logreg_metrics


In [None]:
# ROC (Receiver Operating Characteristic) curve for the logistic regression.
# The stray `from matplotlib.font_manager import MSFontDirectories` line was dropped:
# nothing in this notebook uses that name.
# Column 1 of predict_proba is the score of the second entry of classes_
# (presumably Outcome == 1 -- verify against model_logreg.classes_).
y_pred_proba = model_logreg.predict_proba(Xtest)[:, 1]
fpr, tpr, _ = ms.roc_curve(ytest, y_pred_proba)
auc = ms.roc_auc_score(ytest, y_pred_proba)

# Draw the curve; the AUC shown in the legend is rounded to three decimals for legibility.
plt.plot(fpr, tpr, c='purple', label=f"AUC={auc:.3f}")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.title("Curva ROC-AUC");

> **Nota:** para más información sobre la curva ROC y el AUC, consultar este [link](https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/)

## 6. Entrenamos modelo de Naive Bayes

In [None]:
# GaussianNB models every feature with a per-class normal distribution, which suits
# continuous-valued inputs. MultinomialNB (used here before) is intended for discrete
# count features such as word frequencies.
# NOTE(review): assumes the feature columns are continuous measurements -- confirm with data.info().
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
model_naive = make_pipeline(MinMaxScaler(), GaussianNB())
model_naive.fit(Xtrain, ytrain)

## 7. Evaluamos el modelo de Naive Bayes

In [None]:
# For a classifier pipeline, score() returns mean accuracy (fraction of correctly
# classified samples), not the R2 of a regressor, so the output is labelled accordingly.
print(f"Accuracy en entrenamiento: {model_naive.score(Xtrain,ytrain)}")
print(f"Accuracy en testeo: {model_naive.score(Xtest,ytest)}")

In [None]:
# Hard class predictions on the test set and the resulting confusion matrix
# (rows: true labels, columns: predicted labels).
y_pred_naive = model_naive.predict(Xtest)
confusion_matrix = ms.confusion_matrix(y_true=ytest, y_pred=y_pred_naive)
# fmt='d' prints the integer counts in full; the default '.2g' annotation format
# would render counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(confusion_matrix, annot=True, fmt='d', cbar=False)
plt.xlabel("Predicted labels")
plt.ylabel("Ground truth labels")
plt.title("Confusion Matrix");

In [None]:
# Usual test-set scores for the Naive Bayes predictions: accuracy, precision, recall, F1.
# The same truth/prediction pair feeds every scorer, so it is packed once.
naive_eval_args = dict(y_true=ytest, y_pred=y_pred_naive)

acc_naive = ms.accuracy_score(**naive_eval_args)
# zero_division=0 makes these scores 0 (without a warning) when their denominator is zero,
# e.g. when the positive class is never predicted.
precision_naive = ms.precision_score(zero_division=0, **naive_eval_args)
recall_naive = ms.recall_score(zero_division=0, **naive_eval_args)
f1_naive = ms.f1_score(zero_division=0, **naive_eval_args)

# Gather the scores in a dict keyed by metric name and display it.
naive_metrics = {
    "Accuracy": acc_naive,
    "Precision": precision_naive,
    "Recall": recall_naive,
    "F1": f1_naive,
}
naive_metrics

In [None]:
# ROC (Receiver Operating Characteristic) curve for the Naive Bayes model.
# The stray `from matplotlib.font_manager import MSFontDirectories` line was dropped:
# nothing in this notebook uses that name.
# Column 1 of predict_proba is the score of the second entry of classes_
# (presumably Outcome == 1 -- verify against model_naive.classes_).
y_pred_proba = model_naive.predict_proba(Xtest)[:, 1]
fpr, tpr, _ = ms.roc_curve(ytest, y_pred_proba)
auc = ms.roc_auc_score(ytest, y_pred_proba)

# Draw the curve; the AUC shown in the legend is rounded to three decimals for legibility.
plt.plot(fpr, tpr, c='purple', label=f"AUC={auc:.3f}")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.title("Curva ROC-AUC");

## 8. Entrenamos el modelo Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
# A fixed random_state makes the forest's bootstrap samples and feature sub-sampling
# repeatable, so the metrics below are identical on every Restart & Run All.
# 42 matches the seed already used for the train/test split.
model_random = make_pipeline(MinMaxScaler(), RandomForestClassifier(random_state=42))
model_random.fit(Xtrain, ytrain)

## 9. Evaluamos el modelo Random Forest

In [None]:
# For a classifier pipeline, score() returns mean accuracy (fraction of correctly
# classified samples), not the R2 of a regressor, so the output is labelled accordingly.
print(f"Accuracy en entrenamiento: {model_random.score(Xtrain,ytrain)}")
print(f"Accuracy en testeo: {model_random.score(Xtest,ytest)}")

In [None]:
# Hard class predictions on the test set and the resulting confusion matrix
# (rows: true labels, columns: predicted labels).
y_pred_random = model_random.predict(Xtest)
confusion_matrix = ms.confusion_matrix(y_true=ytest, y_pred=y_pred_random)
# fmt='d' prints the integer counts in full; the default '.2g' annotation format
# would render counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(confusion_matrix, annot=True, fmt='d', cbar=False)
plt.xlabel("Predicted labels")
plt.ylabel("Ground truth labels")
plt.title("Confusion Matrix");

In [None]:
# Usual test-set scores for the Random Forest predictions, computed in one pass
# over the four scorers (accuracy, precision, recall, F1) and unpacked in that order.
acc_random, precision_random, recall_random, f1_random = (
    scorer(ytest, y_pred_random)
    for scorer in (ms.accuracy_score, ms.precision_score, ms.recall_score, ms.f1_score)
)

# Gather the scores in a dict keyed by metric name and display it.
random_metrics = dict(
    Accuracy=acc_random,
    Precision=precision_random,
    Recall=recall_random,
    F1=f1_random,
)
random_metrics

In [None]:
# ROC (Receiver Operating Characteristic) curve for the Random Forest model.
# The stray `from matplotlib.font_manager import MSFontDirectories` line was dropped:
# nothing in this notebook uses that name.
# Column 1 of predict_proba is the score of the second entry of classes_
# (presumably Outcome == 1 -- verify against model_random.classes_).
y_pred_proba = model_random.predict_proba(Xtest)[:, 1]
fpr, tpr, _ = ms.roc_curve(ytest, y_pred_proba)
auc = ms.roc_auc_score(ytest, y_pred_proba)

# Draw the curve; the AUC shown in the legend is rounded to three decimals for legibility.
plt.plot(fpr, tpr, c='purple', label=f"AUC={auc:.3f}")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.title("Curva ROC-AUC");

## 10. Comparamos los tres modelos

In [None]:
import pandas as pd

# One column per model, one row per metric; drawn as a grouped bar chart.
comparison = pd.DataFrame(
    {
        "Logistic": logreg_metrics,
        "NBayes": naive_metrics,
        "RandomF": random_metrics,
    }
)
comparison.plot(kind='bar', figsize=(10, 7));