In [4]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn import datasets as db
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA

In [6]:
def plot_confusion_matrix(cm, labels):
    """Show a confusion matrix as an annotated Plotly heatmap.

    Args:
        cm: Matrix of counts handed to ``px.imshow``.
        labels: Tick labels applied to both the x and the y axis.

    The figure is displayed via ``show()``; nothing is returned.
    """
    axis_titles = dict(x="Predicted", y="Actual", color="Count")
    fig_cm = px.imshow(
        cm,
        labels=axis_titles,
        x=labels,
        y=labels,
        color_continuous_scale='Viridis',
        text_auto=True,
        title="Confusion Matrix",
    )
    # The color bar is switched off; text_auto already annotates the cells.
    fig_cm.update_layout(coloraxis_showscale=False)
    fig_cm.show()

In [12]:
from sklearn import datasets as db

In [13]:
# Load the Iris dataset
iris = db.load_iris()
# Reshape the data into a DataFrame: four measurement columns plus a
# categorical Species column built from the integer target codes
df = pd.DataFrame(
    iris['data'],
    columns=['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width'],
).assign(
    Species=pd.Categorical.from_codes(iris.target, categories=iris.target_names)
)
print(df)

     Sepal.Length  Sepal.Width  Petal.Length  Petal.Width    Species
0             5.1          3.5           1.4          0.2     setosa
1             4.9          3.0           1.4          0.2     setosa
2             4.7          3.2           1.3          0.2     setosa
3             4.6          3.1           1.5          0.2     setosa
4             5.0          3.6           1.4          0.2     setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  virginica
146           6.3          2.5           5.0          1.9  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          1.8  virginica

[150 rows x 5 columns]


In [14]:
# Features and labels, taken directly from the loaded dataset
X, y = iris.data, iris.target

In [15]:
# Standardize the full feature matrix; this scaled copy feeds the exploratory PCA
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [16]:
# Project the standardized data onto its first two principal components
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)
# Attach the labels so the projection can be colored by species
pca_df_2d = pd.DataFrame(X_pca_2d, columns=['PC1', 'PC2']).assign(Species=y)
pca_df_2d.head()

Unnamed: 0,PC1,PC2,Species
0,-2.264703,0.480027,0
1,-2.080961,-0.674134,0
2,-2.364229,-0.341908,0
3,-2.299384,-0.597395,0
4,-2.389842,0.646835,0


In [17]:
# Scatter plot of the two principal components.
# NOTE(review): 'Species' holds the integer target codes here, so plotly will
# presumably render a continuous color scale rather than three discrete colors — confirm.
fig_2d = px.scatter(
    pca_df_2d,
    x='PC1',
    y='PC2',
    color='Species',
    template='plotly_white',
    title='PCA - 2 Components',
)
fig_2d.show()

In [18]:
# Share of the total variance captured by each of the two retained components
explained_variance_2d = pca_2d.explained_variance_ratio_
print("Explained Variance Ratio (2D):", explained_variance_2d)

Explained Variance Ratio (2D): [0.72962445 0.22850762]


In [20]:
# Absolute loadings: one row per principal component, one column per original
# feature. The sign is dropped so only the magnitude of each contribution shows.
components = pd.DataFrame(
    np.abs(pca_2d.components_),
    columns=iris.feature_names,
)
components

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.521066,0.269347,0.580413,0.564857
1,0.377418,0.923296,0.024492,0.066942


###Clasificación###

In [21]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# Features are every column except the last; the target is the Species column.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df['Species'],
    test_size=0.2,
    random_state=7,
)

In [22]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Two-component PCA for the classifiers: the projection is learned from the
# scaled training split and then re-applied, unchanged, to the scaled test split.
pca = PCA(n_components=2)
X_train_pca_2d = pca.fit_transform(X_train_scaled)
X_test_pca_2d = pca.transform(X_test_scaled)

##Regresión Logística##

In [69]:
# Logistic regression (default hyperparameters) on the 2-component projection
lr_pca_2d = LogisticRegression().fit(X_train_pca_2d, y_train)
lr_pca_2d_pred = lr_pca_2d.predict(X_test_pca_2d)

In [70]:
# Test-set scores for logistic regression on 2 components.
lr_pca_2d_accuracy = accuracy_score(y_test, lr_pca_2d_pred)
# NOTE(review): with average="micro" these three scores come out equal to the
# accuracy (see the summary table); "macro" or "weighted" may be more informative.
lr_pca_2d_precision, lr_pca_2d_recall, lr_pca_2d_f1 = (
    scorer(y_test, lr_pca_2d_pred, average="micro")
    for scorer in (precision_score, recall_score, f1_score)
)
# Per-class breakdown
lr_pca_2d_report = classification_report(y_test, lr_pca_2d_pred)
print("Logistic Regression PCA 2D Classification Report:")
print(lr_pca_2d_report)

Logistic Regression PCA 2D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.67      0.83      0.74        12
   virginica       0.75      0.55      0.63        11

    accuracy                           0.77        30
   macro avg       0.81      0.79      0.79        30
weighted avg       0.78      0.77      0.76        30



In [71]:
# Confusion matrix for logistic regression on 2 components.
# Assumes the tick labels are listed in the same class order that
# confusion_matrix uses — TODO confirm.
lr_pca_2d_cm = confusion_matrix(y_test, lr_pca_2d_pred)
plot_confusion_matrix(
    lr_pca_2d_cm,
    ['setosa', 'versicolor', 'virginica'],
)

##KNN##

In [72]:
# K-nearest neighbors (default hyperparameters) on the 2-component projection
knn_pca_2d = KNeighborsClassifier().fit(X_train_pca_2d, y_train)
knn_pca_2d_pred = knn_pca_2d.predict(X_test_pca_2d)

In [73]:
# Test-set scores for KNN on 2 components.
knn_pca_2d_accuracy = accuracy_score(y_test, knn_pca_2d_pred)
# NOTE(review): with average="micro" these three scores come out equal to the
# accuracy (see the summary table); "macro" or "weighted" may be more informative.
knn_pca_2d_precision, knn_pca_2d_recall, knn_pca_2d_f1 = (
    scorer(y_test, knn_pca_2d_pred, average="micro")
    for scorer in (precision_score, recall_score, f1_score)
)
# Per-class breakdown
knn_pca_2d_report = classification_report(y_test, knn_pca_2d_pred)
print("KNN PCA 2D Classification Report:")
print(knn_pca_2d_report)

KNN PCA 2D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.73      0.92      0.81        12
   virginica       0.88      0.64      0.74        11

    accuracy                           0.83        30
   macro avg       0.87      0.85      0.85        30
weighted avg       0.85      0.83      0.83        30



In [74]:
# Confusion matrix for KNN on 2 components.
# Assumes the tick labels are listed in the same class order that
# confusion_matrix uses — TODO confirm.
knn_pca_2d_cm = confusion_matrix(y_test, knn_pca_2d_pred)
plot_confusion_matrix(
    knn_pca_2d_cm,
    ['setosa', 'versicolor', 'virginica'],
)

##SVM##

In [75]:
# Support vector classifier (default hyperparameters) on the 2-component projection
svm_pca_2d = SVC().fit(X_train_pca_2d, y_train)
svm_pca_2d_pred = svm_pca_2d.predict(X_test_pca_2d)

In [76]:
# Test-set scores for the SVM on 2 components.
svm_pca_2d_accuracy = accuracy_score(y_test, svm_pca_2d_pred)
# NOTE(review): with average="micro" these three scores come out equal to the
# accuracy (see the summary table); "macro" or "weighted" may be more informative.
svm_pca_2d_precision, svm_pca_2d_recall, svm_pca_2d_f1 = (
    scorer(y_test, svm_pca_2d_pred, average="micro")
    for scorer in (precision_score, recall_score, f1_score)
)
# Per-class breakdown
svm_pca_2d_report = classification_report(y_test, svm_pca_2d_pred)
print("SVM PCA 2D Classification Report:")
print(svm_pca_2d_report)

SVM PCA 2D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.71      0.83      0.77        12
   virginica       0.78      0.64      0.70        11

    accuracy                           0.80        30
   macro avg       0.83      0.82      0.82        30
weighted avg       0.80      0.80      0.80        30



In [77]:
# Confusion matrix for the SVM on 2 components.
# Assumes the tick labels are listed in the same class order that
# confusion_matrix uses — TODO confirm.
svm_pca_2d_cm = confusion_matrix(y_test, svm_pca_2d_pred)
plot_confusion_matrix(
    svm_pca_2d_cm,
    ['setosa', 'versicolor', 'virginica'],
)

##Naive Bayes##

In [78]:
# Gaussian naive Bayes (default hyperparameters) on the 2-component projection
nb_pca_2d = GaussianNB().fit(X_train_pca_2d, y_train)
nb_pca_2d_pred = nb_pca_2d.predict(X_test_pca_2d)

In [79]:
# Test-set scores for naive Bayes on 2 components.
nb_pca_2d_accuracy = accuracy_score(y_test, nb_pca_2d_pred)
# NOTE(review): with average="micro" these three scores come out equal to the
# accuracy (see the summary table); "macro" or "weighted" may be more informative.
nb_pca_2d_precision, nb_pca_2d_recall, nb_pca_2d_f1 = (
    scorer(y_test, nb_pca_2d_pred, average="micro")
    for scorer in (precision_score, recall_score, f1_score)
)
# Per-class breakdown
nb_pca_2d_report = classification_report(y_test, nb_pca_2d_pred)
print("Naive Bayes PCA 2D Classification Report:")
print(nb_pca_2d_report)

Naive Bayes PCA 2D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.67      0.83      0.74        12
   virginica       0.75      0.55      0.63        11

    accuracy                           0.77        30
   macro avg       0.81      0.79      0.79        30
weighted avg       0.78      0.77      0.76        30



In [80]:
# Confusion matrix for naive Bayes on 2 components.
# Assumes the tick labels are listed in the same class order that
# confusion_matrix uses — TODO confirm.
nb_pca_2d_cm = confusion_matrix(y_test, nb_pca_2d_pred)
plot_confusion_matrix(
    nb_pca_2d_cm,
    ['setosa', 'versicolor', 'virginica'],
)

##PCA con 3 componentes##

In [39]:
# Project the standardized data onto its first three principal components
pca_3d = PCA(n_components=3)
X_pca_3d = pca_3d.fit_transform(X_scaled)
# Attach the labels so the projection can be colored by species
pca_df_3d = pd.DataFrame(X_pca_3d, columns=['PC1', 'PC2', 'PC3']).assign(Species=y)
pca_df_3d.head()

Unnamed: 0,PC1,PC2,PC3,Species
0,-2.264703,0.480027,-0.127706,0
1,-2.080961,-0.674134,-0.234609,0
2,-2.364229,-0.341908,0.044201,0
3,-2.299384,-0.597395,0.09129,0
4,-2.389842,0.646835,0.015738,0


In [40]:
# 3D scatter plot of the three principal components.
# NOTE(review): 'Species' holds the integer target codes here, so plotly will
# presumably render a continuous color scale rather than three discrete colors — confirm.
fig_3d = px.scatter_3d(
    pca_df_3d,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Species',
    template='plotly_white',
    title='PCA - 3 Components',
)
fig_3d.show()

In [41]:
# Share of the total variance captured by each of the three retained components
explained_variance_3d = pca_3d.explained_variance_ratio_
print("Explained Variance Ratio (3D):", explained_variance_3d)

Explained Variance Ratio (3D): [0.72962445 0.22850762 0.03668922]


In [42]:
# Absolute loadings for the 3-component PCA: one row per principal component,
# one column per original feature. This rebinds `components` from the 2D section.
components = pd.DataFrame(
    np.abs(pca_3d.components_),
    columns=iris.feature_names,
)
components

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.521066,0.269347,0.580413,0.564857
1,0.377418,0.923296,0.024492,0.066942
2,0.719566,0.244382,0.142126,0.634273


In [44]:
# Same 80/20 split and seed as in the 2-component section, repeated so the
# 3-component section can run from here.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df['Species'],
    test_size=0.2,
    random_state=7,
)

In [45]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [46]:
# Three-component PCA for the classifiers: the projection is learned from the
# scaled training split and then re-applied, unchanged, to the scaled test split.
# This rebinds `pca`, which previously held the 2-component model.
pca = PCA(n_components=3)
X_train_pca_3d = pca.fit_transform(X_train_scaled)
X_test_pca_3d = pca.transform(X_test_scaled)

##Regresión Logística##

In [47]:
# Logistic regression (default hyperparameters) on the 3-component projection
lr_pca_3d = LogisticRegression().fit(X_train_pca_3d, y_train)
lr_pca_3d_pred = lr_pca_3d.predict(X_test_pca_3d)

In [50]:
# Test-set scores for logistic regression on 3 components.
lr_pca_3d_accuracy = accuracy_score(y_test, lr_pca_3d_pred)
# NOTE(review): with average="micro" these three scores come out equal to the
# accuracy (see the summary table); "macro" or "weighted" may be more informative.
lr_pca_3d_precision, lr_pca_3d_recall, lr_pca_3d_f1 = (
    scorer(y_test, lr_pca_3d_pred, average="micro")
    for scorer in (precision_score, recall_score, f1_score)
)
# Per-class breakdown
lr_pca_3d_report = classification_report(y_test, lr_pca_3d_pred)
print("Logistic Regression PCA 3D Classification Report:")
print(lr_pca_3d_report)

Logistic Regression PCA 3D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.83      0.83      0.83        12
   virginica       0.82      0.82      0.82        11

    accuracy                           0.87        30
   macro avg       0.88      0.88      0.88        30
weighted avg       0.87      0.87      0.87        30



In [51]:
# Confusion matrix for logistic regression on 3 components.
# Assumes the tick labels are listed in the same class order that
# confusion_matrix uses — TODO confirm.
lr_pca_3d_cm = confusion_matrix(y_test, lr_pca_3d_pred)
plot_confusion_matrix(
    lr_pca_3d_cm,
    ['setosa', 'versicolor', 'virginica'],
)

##KNN##

In [53]:
# K-nearest neighbors (default hyperparameters) on the 3-component projection
knn_pca_3d = KNeighborsClassifier().fit(X_train_pca_3d, y_train)
knn_pca_3d_pred = knn_pca_3d.predict(X_test_pca_3d)

In [64]:
# Test-set scores for KNN on 3 components.
knn_pca_3d_accuracy = accuracy_score(y_test, knn_pca_3d_pred)
# NOTE(review): with average="micro" these three scores come out equal to the
# accuracy (see the summary table); "macro" or "weighted" may be more informative.
knn_pca_3d_precision, knn_pca_3d_recall, knn_pca_3d_f1 = (
    scorer(y_test, knn_pca_3d_pred, average="micro")
    for scorer in (precision_score, recall_score, f1_score)
)
# Per-class breakdown
knn_pca_3d_report = classification_report(y_test, knn_pca_3d_pred)
print("KNN PCA 3D Classification Report:")
print(knn_pca_3d_report)

KNN PCA 3D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.79      0.92      0.85        12
   virginica       0.89      0.73      0.80        11

    accuracy                           0.87        30
   macro avg       0.89      0.88      0.88        30
weighted avg       0.87      0.87      0.87        30



In [56]:
# Confusion matrix for KNN on 3 components.
# Assumes the tick labels are listed in the same class order that
# confusion_matrix uses — TODO confirm.
knn_pca_3d_cm = confusion_matrix(y_test, knn_pca_3d_pred)
plot_confusion_matrix(
    knn_pca_3d_cm,
    ['setosa', 'versicolor', 'virginica'],
)

##SVM##

In [57]:
# Support vector classifier (default hyperparameters) on the 3-component projection
svm_pca_3d = SVC().fit(X_train_pca_3d, y_train)
svm_pca_3d_pred = svm_pca_3d.predict(X_test_pca_3d)

In [58]:
# Test-set scores for the SVM on 3 components.
svm_pca_3d_accuracy = accuracy_score(y_test, svm_pca_3d_pred)
# NOTE(review): with average="micro" these three scores come out equal to the
# accuracy (see the summary table); "macro" or "weighted" may be more informative.
svm_pca_3d_precision, svm_pca_3d_recall, svm_pca_3d_f1 = (
    scorer(y_test, svm_pca_3d_pred, average="micro")
    for scorer in (precision_score, recall_score, f1_score)
)
# Per-class breakdown
svm_pca_3d_report = classification_report(y_test, svm_pca_3d_pred)
print("SVM PCA 3D Classification Report:")
print(svm_pca_3d_report)

SVM PCA 3D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.83      0.83      0.83        12
   virginica       0.82      0.82      0.82        11

    accuracy                           0.87        30
   macro avg       0.88      0.88      0.88        30
weighted avg       0.87      0.87      0.87        30



In [59]:
# Confusion matrix for the SVM trained on 3 components. It must be built from
# the 3-component predictions (svm_pca_3d_pred) so the plot matches the
# "SVM PCA 3D" classification report, and stored under its own name so the
# 2-component matrix is not overwritten.
# Assumes the tick labels are listed in the same class order that
# confusion_matrix uses — TODO confirm.
svm_pca_3d_cm = confusion_matrix(y_test, svm_pca_3d_pred)
plot_confusion_matrix(svm_pca_3d_cm, ['setosa', 'versicolor', 'virginica'])

##Naive Bayes##

In [60]:
# Gaussian naive Bayes (default hyperparameters) on the 3-component projection
nb_pca_3d = GaussianNB().fit(X_train_pca_3d, y_train)
nb_pca_3d_pred = nb_pca_3d.predict(X_test_pca_3d)

In [61]:
# Test-set scores for naive Bayes on 3 components.
nb_pca_3d_accuracy = accuracy_score(y_test, nb_pca_3d_pred)
# NOTE(review): with average="micro" these three scores come out equal to the
# accuracy (see the summary table); "macro" or "weighted" may be more informative.
nb_pca_3d_precision, nb_pca_3d_recall, nb_pca_3d_f1 = (
    scorer(y_test, nb_pca_3d_pred, average="micro")
    for scorer in (precision_score, recall_score, f1_score)
)
# Per-class breakdown
nb_pca_3d_report = classification_report(y_test, nb_pca_3d_pred)
print("Naive Bayes PCA 3D Classification Report:")
print(nb_pca_3d_report)

Naive Bayes PCA 3D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.82      0.75      0.78        12
   virginica       0.75      0.82      0.78        11

    accuracy                           0.83        30
   macro avg       0.86      0.86      0.86        30
weighted avg       0.84      0.83      0.83        30



In [63]:
# Confusion matrix for naive Bayes on 3 components.
# Assumes the tick labels are listed in the same class order that
# confusion_matrix uses — TODO confirm.
nb_pca_3d_cm = confusion_matrix(y_test, nb_pca_3d_pred)
plot_confusion_matrix(
    nb_pca_3d_cm,
    ['setosa', 'versicolor', 'virginica'],
)

Resumen de los modelos con 2 componentes de la base de datos Iris

In [81]:
# Summary of the 2-component scores: one row per classifier, one column per
# metric. Each list follows the order of the 'Model' list.
models_data1 = {
    'Model': [
        'Logistic Regression',
        'K-Nearest Neighbors',
        'SVM',
        'Naive Bayes',
    ],
    'Accuracy': [
        lr_pca_2d_accuracy,
        knn_pca_2d_accuracy,
        svm_pca_2d_accuracy,
        nb_pca_2d_accuracy,
    ],
    'Precision': [
        lr_pca_2d_precision,
        knn_pca_2d_precision,
        svm_pca_2d_precision,
        nb_pca_2d_precision,
    ],
    'Recall': [
        lr_pca_2d_recall,
        knn_pca_2d_recall,
        svm_pca_2d_recall,
        nb_pca_2d_recall,
    ],
    'F1-Score': [
        lr_pca_2d_f1,
        knn_pca_2d_f1,
        svm_pca_2d_f1,
        nb_pca_2d_f1,
    ],
}

df2_models = pd.DataFrame(data=models_data1)
df2_models

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,0.766667,0.766667,0.766667,0.766667
1,K-Nearest Neighbors,0.833333,0.833333,0.833333,0.833333
2,SVM,0.8,0.8,0.8,0.8
3,Naive Bayes,0.766667,0.766667,0.766667,0.766667


Resumen de los modelos con 3 componentes de la base de datos Iris

In [67]:
# Summary of the 3-component scores: one row per classifier, one column per
# metric. Each list follows the order of the 'Model' list.
models_data = {
    'Model': [
        'Logistic Regression',
        'K-Nearest Neighbors',
        'SVM',
        'Naive Bayes',
    ],
    'Accuracy': [
        lr_pca_3d_accuracy,
        knn_pca_3d_accuracy,
        svm_pca_3d_accuracy,
        nb_pca_3d_accuracy,
    ],
    'Precision': [
        lr_pca_3d_precision,
        knn_pca_3d_precision,
        svm_pca_3d_precision,
        nb_pca_3d_precision,
    ],
    'Recall': [
        lr_pca_3d_recall,
        knn_pca_3d_recall,
        svm_pca_3d_recall,
        nb_pca_3d_recall,
    ],
    'F1-Score': [
        lr_pca_3d_f1,
        knn_pca_3d_f1,
        svm_pca_3d_f1,
        nb_pca_3d_f1,
    ],
}

dfp_models = pd.DataFrame(data=models_data)
dfp_models

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,0.866667,0.866667,0.866667,0.866667
1,K-Nearest Neighbors,0.866667,0.866667,0.866667,0.866667
2,SVM,0.866667,0.866667,0.866667,0.866667
3,Naive Bayes,0.833333,0.833333,0.833333,0.833333
