# Proyecto: Clasificación con el dataset Iris


## 1. Librerías requeridas


In [None]:
# Librerías necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns

# Settings
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Apply seaborn's default theme to the plots drawn from here on.
sns.set()

## 2. Cargar dataset Iris


In [None]:
# Load the Iris dataset bundled with scikit-learn
iris = datasets.load_iris()

# Feature matrix (one column per measurement) and integer-encoded target
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(data=iris.target, name='species')

# Single exploration frame: features, numeric label and readable species name
code_to_name = {code: name for code, name in enumerate(iris.target_names)}
df = pd.concat([X, y], axis=1).assign(species_name=y.map(code_to_name))
df.head()

## 3. Identificación y preparación de los datos


In [None]:
# Column dtypes and non-null counts, followed by per-column summary statistics.
df.info()
# Only the last expression of a cell is rendered automatically, so the summary
# table is shown explicitly (`display` is provided by IPython inside notebooks).
display(df.describe().T)

# Missing-value and duplicate-row checks
print('Nulos por columna:\n', df.isnull().sum())
print('\nDuplicados:', df.duplicated().sum())

# Exploratory plot: pairwise scatter plots of the four features, coloured by species.
# pairplot builds its own figure, so no plt.figure() call is made beforehand
# (it would only leave an empty extra figure in the cell output). That figure is
# the current one afterwards, which is why plt.suptitle targets it.
sns.pairplot(df, hue='species_name', vars=iris.feature_names)
plt.suptitle('Pairplot de características del Iris', y=1.02)
plt.show()

## 4. Procesos de limpieza, selección y normalización

In [None]:
# Features used by the models
feature_names = iris.feature_names
print(feature_names)

# Train/test split: 70% training, 30% test, stratified so every species keeps
# the same proportion in both partitions; random_state makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
# A bare expression in the middle of a cell is not displayed, so print the shapes.
print('Dimensiones de entrenamiento:', X_train.shape)
print('Dimensiones de prueba:', X_test.shape)

# Standardisation (z-score): the scaler is fitted on the training data only and
# then applied to the test data, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 4. Modelado, entrenamiento y prueba


In [None]:
def show_confusion_matrix(cm, title, labels):
    """Draw a confusion matrix as an annotated heatmap on its own figure.

    Parameters
    ----------
    cm : array-like of int
        Matrix as returned by ``confusion_matrix(y_true, y_pred)``
        (rows are true classes, columns are predicted classes).
    title : str
        Title placed above the heatmap.
    labels : sequence of str
        Class names used for both axes, in the same order as the matrix.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    plt.show()


# Plain list of class names, shared by the reports and the plots below.
class_names = list(iris.target_names)

# 1) Logistic regression (baseline), trained on the standardised features.
# `multi_class` is not passed: 'auto' was already the default and the parameter
# is deprecated in recent scikit-learn releases (it triggers a FutureWarning).
lr = LogisticRegression(max_iter=200, solver='lbfgs', random_state=42)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
acc_lr = accuracy_score(y_test, y_pred_lr)
print('Accuracy Logistic Regression:', acc_lr)
print('\nClassification report:\n', classification_report(y_test, y_pred_lr, target_names=class_names))

# Confusion matrix - Logistic Regression
cm_lr = confusion_matrix(y_test, y_pred_lr)
show_confusion_matrix(cm_lr, 'Confusion matrix - Logistic Regression', class_names)

# 2) Decision tree, trained on the unscaled features so the split thresholds
# in the plotted tree stay in the original measurement units.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)
print('Accuracy Decision Tree:', acc_dt)
print('\nClassification report:\n', classification_report(y_test, y_pred_dt, target_names=class_names))

# Tree visualisation; class names are passed as a plain list rather than a NumPy array.
fig, ax = plt.subplots(figsize=(12, 6))
plot_tree(dt, feature_names=feature_names, class_names=class_names, filled=True, rounded=True, ax=ax)
ax.set_title('Árbol de Decisión entrenado')
plt.show()

# Confusion matrix - Decision Tree
cm_dt = confusion_matrix(y_test, y_pred_dt)
show_confusion_matrix(cm_dt, 'Confusion matrix - Decision Tree', class_names)

## 5. Búsqueda de hiperparámetros (GridSearch) para Decision Tree

In [None]:
# Hyperparameter grid for the decision tree: maximum depth (None = unlimited)
# and minimum number of samples required to split a node.
params = {'max_depth': [2, 3, 4, 5, None], 'min_samples_split': [2, 3, 4, 5]}

# Exhaustive search with 5-fold cross-validation on the training data only.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, cv=5)
gs.fit(X_train, y_train)
# A bare expression in the middle of a cell is not displayed, so print the outcome.
print('Best params:', gs.best_params_)
print('Best CV score:', gs.best_score_)

# Evaluate the best model on the held-out test set
best_dt = gs.best_estimator_
y_pred_best = best_dt.predict(X_test)
acc_best = accuracy_score(y_test, y_pred_best)  # computed once, reused in the summary
print('Accuracy Best Decision Tree:', acc_best)
print(classification_report(y_test, y_pred_best, target_names=iris.target_names))

# Side-by-side test accuracy of the three models (rendered as the cell output)
results = {
    'Model': ['Logistic Regression', 'Decision Tree', 'Decision Tree (GridSearch)'],
    'Accuracy': [acc_lr, acc_dt, acc_best]
}
pd.DataFrame(results)