In [None]:
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import datasets
import matplotlib.pyplot as plt
sns.set_theme()

In [None]:
# Load the digits dataset from the CSV file and preview its first 15 rows.
digits_path = "../data/digits.csv"
df = pd.read_csv(digits_path)
df.head(15)

# EDAs

In [None]:
# Keep only the image feature columns by removing the label column.
pixels = df.drop(columns='number_label')

In [None]:
# Pick one sample (the first row) and inspect the shape of its flat pixel vector.
first_position = 0
single_img = pixels.iloc[first_position]
flat_pixels = single_img.to_numpy()
flat_pixels.shape

In [None]:
# Print the sample's pixel vector rearranged as an 8x8 matrix.
img_matrix = single_img.to_numpy().reshape((8, 8))
print(img_matrix)

In [None]:
# Render the 8x8 matrix as a 2D image with imshow.
img_matrix = single_img.to_numpy().reshape((8, 8))
plt.imshow(img_matrix, cmap='viridis')

In [None]:
# Draw the 8x8 matrix as a heatmap with each cell's value annotated.
img_matrix = single_img.to_numpy().reshape((8, 8))
sns.heatmap(img_matrix, annot=True, cmap='viridis')

# Modelado base

In [None]:
# define a helper that trains and evaluates several models
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

def calc_predictions(X_train, X_test, y_train, y_test, average='micro', random_state=42):
    """Fit a fixed set of classifiers and score each one on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for prediction, respectively.
    y_train, y_test : array-like
        Target labels matching ``X_train`` and ``X_test``.
    average : str, default 'micro'
        Averaging mode forwarded to ``precision_score``, ``recall_score`` and
        ``f1_score``. With single-label multiclass targets, micro averaging
        makes all three equal to accuracy; pass ``'macro'`` or ``'weighted'``
        to obtain class-sensitive summaries.
    random_state : int or None, default 42
        Seed handed to the tree-based and ensemble models so repeated calls
        on the same data return the same scores. ``None`` leaves them unseeded.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model_name``, ``accuracy``,
        ``precision``, ``recall_sensitivity`` and ``f1``.
    """
    result_columns = ['model_name', 'accuracy', 'precision', 'recall_sensitivity', 'f1']

    models = [
        LogisticRegression(max_iter=10000),
        KNeighborsClassifier(n_neighbors=11),
        SVC(),
        DecisionTreeClassifier(random_state=random_state),
        RandomForestClassifier(n_estimators=50, random_state=random_state),
        AdaBoostClassifier(n_estimators=100, random_state=random_state),
        GradientBoostingClassifier(n_estimators=100, random_state=random_state),
    ]

    # Collect plain records and build the frame once at the end: this avoids
    # growing a DataFrame row by row and gives the metric columns a numeric dtype.
    records = []
    for model in models:
        # training and predictions
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # classification metrics
        records.append({
            'model_name': model.__class__.__name__,
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, average=average),
            'recall_sensitivity': recall_score(y_test, y_pred, average=average),
            'f1': f1_score(y_test, y_pred, average=average),
        })

    return pd.DataFrame(records, columns=result_columns)

In [None]:
# Separate the target from the features, then hold out 25% of the rows for testing.
y = df['number_label']
X = df.drop(columns='number_label')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
%%time
# medir tiempo de ejecución para entrenamiento y evaluación
calc_predictions(X_train, X_test, y_train, y_test)

# Escalado

In [None]:
# Rescale the features with a MinMaxScaler fitted on the training split only.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

# The same fitted scaler is applied to both splits.
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [None]:
%%time
# medir tiempo de ejecución con datos escalados
calc_predictions(X_train_scaled, X_test_scaled, y_train, y_test)

# PCA 2d

In [None]:
# Reduce the scaled features to 2 principal components with PCA.
from sklearn.decomposition import PCA

# Seeded so the projection (and the metrics computed from it) is repeatable
# in case PCA selects its randomized SVD solver.
pca_2d = PCA(n_components=2, random_state=42)
pca_2d.fit(X_train_scaled)

# The components are learned from the training split and applied to both splits.
X_train_scaled_pca_2d = pca_2d.transform(X_train_scaled)
X_test_scaled_pca_2d = pca_2d.transform(X_test_scaled)

In [None]:
# Total share of variance captured by the two components.
pca_2d.explained_variance_ratio_.sum()

In [None]:
%%time
# medir tiempo de ejecución con PCA 2D
calc_predictions(X_train_scaled_pca_2d, X_test_scaled_pca_2d, y_train, y_test)

In [None]:
# Standardise the full feature matrix and project it onto 2 principal components.
from sklearn.preprocessing import StandardScaler

# Alternative scaling (kept disabled): X_scaled = MinMaxScaler().fit_transform(X)
standardizer = StandardScaler()
X_scaled = standardizer.fit_transform(X)

projector_2d = PCA(n_components=2)
X_scaled_pca = projector_2d.fit_transform(X_scaled)
X_scaled_pca

In [None]:
# Plot the 2D projection as a scatterplot, coloured by digit label.
labels = df['number_label']
fig, ax = plt.subplots(figsize=(10, 6), dpi=150)
# 'tab10' provides ten distinct colours; 'Set1' only has nine, so with ten
# digit classes (assumed: labels 0-9) the palette would cycle and two classes
# would share a colour.
sns.scatterplot(x=X_scaled_pca[:, 0], y=X_scaled_pca[:, 1], hue=labels, palette='tab10', ax=ax)
ax.set(xlabel='PC1', ylabel='PC2');

# PCA 3d

In [None]:
# Reduce the scaled features to 3 principal components with PCA.
# Seeded so the projection (and the metrics computed from it) is repeatable
# in case PCA selects its randomized SVD solver.
pca_3d = PCA(n_components=3, random_state=42)
pca_3d.fit(X_train_scaled)

# The components are learned from the training split and applied to both splits.
X_train_scaled_pca_3d = pca_3d.transform(X_train_scaled)
X_test_scaled_pca_3d = pca_3d.transform(X_test_scaled)

# Total share of variance captured by the three components.
sum(pca_3d.explained_variance_ratio_)

In [None]:
%%time
# medir tiempo de ejecución con PCA 3D
calc_predictions(X_train_scaled_pca_3d, X_test_scaled_pca_3d, y_train, y_test)

In [None]:
# Standardise the full feature matrix and project it onto 3 principal components.
# NOTE: this rebinds X_scaled and X_scaled_pca, replacing the earlier 2D projection.
# Alternative scaling (kept disabled): X_scaled = MinMaxScaler().fit_transform(X)
standardizer = StandardScaler()
X_scaled = standardizer.fit_transform(X)

projector_3d = PCA(n_components=3)
X_scaled_pca = projector_3d.fit_transform(X_scaled)
X_scaled_pca

In [None]:
# Plot the 3D projection with matplotlib, coloured by digit label.
labels = df['number_label']
plt.figure(figsize=(10, 6), dpi=150)
ax = plt.axes(projection='3d')
# 'tab10' has ten colours; the nine-colour 'Set1' map cannot give ten digit
# classes (assumed: labels 0-9) a distinct colour each.
ax.scatter3D(X_scaled_pca[:, 0], X_scaled_pca[:, 1], X_scaled_pca[:, 2], c=labels, cmap='tab10')
ax.set(xlabel='PC1', ylabel='PC2', zlabel='PC3');

In [None]:
# Wrap the three principal components in a DataFrame for plotting with plotly.
# NOTE: this rebinds `df`, which until this cell held the digits table read from CSV.
import plotly.express as px
pc_names = ['PC1', 'PC2', 'PC3']
df = pd.DataFrame(data=X_scaled_pca, columns=pc_names)
df

In [None]:
# Interactive 3D scatter of the principal components with plotly.
# The labels are cast to strings so plotly assigns one discrete colour per class;
# numeric values would be drawn on a continuous colour scale instead.
fig = px.scatter_3d(df, x='PC1', y='PC2', z='PC3', color=labels.astype(str))
fig.show()

# Dataset extendido

In [None]:
# Download the MNIST dataset from OpenML.
# The version is pinned so the notebook keeps loading the same data even if
# another version of "mnist_784" becomes active on OpenML.
mnist = datasets.fetch_openml("mnist_784", version=1)

In [None]:
# Print the description text that ships with the dataset.
mnist_description = mnist.DESCR
print(mnist_description)

In [None]:
# Build a DataFrame from the MNIST features and attach the target as a 'class' column.
# NOTE: this rebinds `df` to the MNIST table.
feature_columns = mnist.feature_names
df = pd.DataFrame(data=mnist.data, columns=feature_columns)
df['class'] = mnist.target
df.head(5)

In [None]:
# Report the frame's dimensions as (rows, columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

In [None]:
# Separate the target from the features, then hold out 25% of the rows for testing.
y = df['class']
X = df.drop(columns='class')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# %%time
# time the run on the unreduced features
# WARNING: this can take a long time
# calc_predictions(X_train, X_test, y_train, y_test)

In [None]:
# Rescale the features and reduce them to 50 principal components with PCA.
scaler = MinMaxScaler()
scaler.fit(X_train)

# The scaler is fitted on the training split and applied to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Seeded so the projection is repeatable in case PCA selects its randomized
# SVD solver for a matrix of this size.
pca = PCA(n_components=50, random_state=42)
pca.fit(X_train_scaled)

X_train_scaled_pca = pca.transform(X_train_scaled)
X_test_scaled_pca = pca.transform(X_test_scaled)

# Shape of the reduced training matrix and total share of variance retained.
print(X_train_scaled_pca.shape)
print(sum(pca.explained_variance_ratio_))

In [None]:
# %%time
# time the run on the PCA-reduced features
# WARNING: this can take a long time
# Check whether it takes less time than the previous run
# calc_predictions(X_train_scaled_pca, X_test_scaled_pca, y_train, y_test)