# Preparación de los datos

## Cargamos el Dataset de Iris

In [1]:
import pandas as pd
# Load the Iris CSV from the sibling datasets folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../datasets/Iris.csv")

# Print the (rows, columns) tuple, then end the cell on the transposed first
# five rows so the notebook renders them as the cell's output.
print(df.shape)
df.head(n=5).transpose()

(150, 6)


Unnamed: 0,0,1,2,3,4
Id,1,2,3,4,5
SepalLengthCm,5.1,4.9,4.7,4.6,5.0
SepalWidthCm,3.5,3.0,3.2,3.1,3.6
PetalLengthCm,1.4,1.4,1.3,1.5,1.4
PetalWidthCm,0.2,0.2,0.2,0.2,0.2
Species,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa


## Usamos solo las columnas relacionadas con los pétalos

In [2]:
# Feature matrix: only the two petal measurement columns.
feature_columns = [
    'PetalLengthCm',
    'PetalWidthCm',
]
X = df.loc[:, feature_columns].values

# Target vector: the species label of every row.
y = df.loc[:, 'Species'].values

## Separamos en conjuntos de entrenamiento (70%) y test (30%)

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. random_state is fixed so the split is
# repeatable, and stratify=y asks for a split stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# One length per line, in the same order as the assignment above.
print(len(X_train), len(X_test), len(y_train), len(y_test), sep="\n")

105
45
105
45


## Normalizamos con un escalado estándar

In [4]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

# Show the first five rows of each split before and after scaling.
# NOTE(review): the helper cells that follow fit and score on X_train / X_test,
# not on these scaled arrays - confirm whether that is intended.
print(X_train[:5], X_train_std[:5], X_test[:5], X_test_std[:5], sep="\n")

[[1.4 0.2]
 [1.7 0.2]
 [5.3 2.3]
 [5.7 2.1]
 [1.2 0.2]]
[[-1.33269725 -1.30380366]
 [-1.16537974 -1.30380366]
 [ 0.84243039  1.44465434]
 [ 1.0655204   1.18289644]
 [-1.44424226 -1.30380366]]
[[5.4 2.3]
 [1.7 0.4]
 [1.4 0.3]
 [4.5 1.7]
 [4.4 1.4]]
[[ 0.89820289  1.44465434]
 [-1.16537974 -1.04204575]
 [-1.33269725 -1.17292471]
 [ 0.39625036  0.65938063]
 [ 0.34047786  0.26674377]]


In [5]:
def train_model(model, X=None, y=None):
    """Fit ``model`` and return the result of its ``fit`` call.

    Parameters
    ----------
    model : object
        Any object exposing ``fit(X, y)``.
    X : array-like, optional
        Training features. When omitted, the notebook-level ``X_train`` is
        used, which keeps existing ``train_model(model)`` calls unchanged.
    y : array-like, optional
        Training labels. When omitted, the notebook-level ``y_train`` is used.

    Returns
    -------
    object
        Whatever ``model.fit(X, y)`` returns.

    Notes
    -----
    NOTE(review): the default features are the *unscaled* ``X_train``. The
    notebook also builds ``X_train_std``; pass it explicitly to train on the
    standardized features.
    """
    # Resolve the defaults lazily so the globals are only needed when the
    # caller does not supply its own data.
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    return model.fit(X, y)


In [6]:
def predict_model(model, X=None, y=None):
    """Return the accuracy of ``model`` on a labelled set, rounded to 2 decimals.

    Despite its name, this helper does not return the predictions themselves:
    it predicts on ``X`` and returns the fraction of predictions equal to ``y``.

    Parameters
    ----------
    model : object
        Any fitted object exposing ``predict(X)``.
    X : array-like, optional
        Features to predict on. When omitted, the notebook-level ``X_test`` is
        used, which keeps existing ``predict_model(model)`` calls unchanged.
    y : array-like, optional
        True labels for ``X``. When omitted, the notebook-level ``y_test`` is
        used. The element-wise comparison below needs an array type whose
        ``==`` result has ``.mean()``.

    Returns
    -------
    float
        Share of correct predictions, rounded to two decimal places.

    Notes
    -----
    NOTE(review): the default features are the *unscaled* ``X_test``. If a
    model was trained on ``X_train_std``, pass ``X_test_std`` here.
    """
    if X is None:
        X = X_test
    if y is None:
        y = y_test
    y_pred = model.predict(X)
    # Mean of the boolean match mask is the fraction of correct predictions.
    return round((y == y_pred).mean(), 2)


# Configuración y entrenamiento del modelo de regresión logística

In [7]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier through the notebook's train_model helper.
logistic_regression_model = train_model(
    LogisticRegression(random_state=1),
)
# Last expression of the cell: the score returned by predict_model is displayed.
predict_model(logistic_regression_model)

0.98

El 98% de las predicciones realizadas por el modelo son correctas en el conjunto de pruebas.

# Configuración y entrenamiento del modelo SVM

In [8]:
from sklearn.svm import SVC

# Fit a linear-kernel SVC through the notebook's train_model helper.
svm_model = train_model(
    SVC(
        kernel='linear',
        C=1.0,
        random_state=1,
        probability=True,
    ),
)
# Last expression of the cell: the score returned by predict_model is displayed.
predict_model(svm_model)

0.98

El 98% de las predicciones realizadas por el modelo son correctas en el conjunto de pruebas.

# Configuración y entrenamiento del modelo de árboles de decisión

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Fit a depth-limited decision tree through the notebook's train_model helper.
tree_model = train_model(
    DecisionTreeClassifier(
        criterion="gini",
        max_depth=4,
        random_state=1,
    ),
)
# Last expression of the cell: the score returned by predict_model is displayed.
predict_model(tree_model)

0.98

El 98% de las predicciones realizadas por el modelo son correctas en el conjunto de pruebas.

# Configuración y entrenamiento del modelo KNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbours classifier through the notebook's train_model helper.
knn_model = train_model(
    KNeighborsClassifier(
        n_neighbors=3,
        p=2,
        metric='minkowski',
    ),
)
# Last expression of the cell: the score returned by predict_model is displayed.
predict_model(knn_model)

0.98

El 98% de las predicciones realizadas por el modelo son correctas en el conjunto de pruebas.

In [10]:
import pickle

def save_model(model_name, model):
    """Pickle ``model`` to ``../models/iris-<model_name>.pck``.

    The path is relative to the current working directory. The target
    directory is created when it is missing, so a fresh checkout without a
    ``models`` folder no longer fails with ``FileNotFoundError``.

    Parameters
    ----------
    model_name : str
        Identifier interpolated into the output file name.
    model : object
        Any picklable object.

    Returns
    -------
    None
    """
    # Local import keeps this helper self-contained within its cell.
    import os

    target = f'../models/iris-{model_name}.pck'
    # exist_ok=True makes this a no-op when the directory is already there.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, 'wb') as file:
        pickle.dump(model, file)

def load_model(model_name):
    """Unpickle and return the object stored at ``../models/iris-<model_name>.pck``.

    The path is relative to the current working directory.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserializing.
    Only load files that this project produced itself.

    Parameters
    ----------
    model_name : str
        Identifier interpolated into the file name to read.

    Returns
    -------
    object
        The unpickled object.
    """
    source = f'../models/iris-{model_name}.pck'
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [13]:
# Persist each trained estimator under the name of the variable that holds it.
save_model(model_name="logistic_regression_model", model=logistic_regression_model)
save_model(model_name="svm_model", model=svm_model)
save_model(model_name="tree_model", model=tree_model)
save_model(model_name="knn_model", model=knn_model)