In [2]:
import pandas as pd                  # A fundamental package for linear algebra and multidimensional arrays
import numpy as np                   # Data analysis and data manipulating tool
import random                        # Library to generate random numbers
from collections import Counter      # Collection is a Python module that implements specialized container datatypes providing
                                     # alternatives to Python’s general purpose built-in containers, dict, list, set, and tuple.
                                     # Counter is a dict subclass for counting hashable objects
# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# To ignore warnings in the notebook
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import LeaveOneOut, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix

In [3]:
# This is a subset of the original data available at kaggle.
data = pd.read_csv("C:\\Users\\Polar\\Documents\\ESCUELA\\5TO_SEMESTRE\\MAKINITAS\\practica8\\glass.csv")

data.head()

Unnamed: 0,Id_number,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type_of_glass
0,1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


EUCLIDIANO

In [5]:

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
# Codificar la columna 'class' como valores numéricos
label_encoder = LabelEncoder()
data['Type_of_glass'] = label_encoder.fit_transform(data['Type_of_glass'])

# Separar características y etiquetas
X = data.drop(columns=['Type_of_glass']).values
y = data['Type_of_glass'].values

# Función para calcular la distancia Euclidiana
def euclidean_distance(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

# Función para clasificar un nuevo punto usando el algoritmo de distancia mínima
def classify_min_distance(new_point, centroids):
    min_distance = float('inf')
    closest_class = None
    
    for species, centroid in centroids.iterrows():
        distance = euclidean_distance(new_point, centroid)
        if distance < min_distance:
            min_distance = distance
            closest_class = species
    return closest_class

# --- Leave-One-Out Cross-Validation ---
loo = LeaveOneOut()
correct_predictions_loo = 0
y_pred_loo = []

for train_index, test_index in loo.split(X):
    # Dividir datos en entrenamiento y prueba
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Calcular centroides con los datos de entrenamiento
    train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
    train_data['Type_of_glass'] = y_train
    centroids = train_data.groupby('Type_of_glass').mean()
    
    # Clasificar el punto de prueba
    predicted_class = classify_min_distance(X_test[0], centroids)
    y_pred_loo.append(predicted_class)
    
    # Comparar con la clase real
    if predicted_class == y_test[0]:
        correct_predictions_loo += 1

# Calcular y mostrar la precisión de Leave-One-Out
accuracy_loo = correct_predictions_loo / len(X)
print(f"Precisión de Leave-One-Out Cross-Validation: {accuracy_loo:.2f}")

# Imprimir la matriz de confusión para Leave-One-Out
print("Matriz de Confusión Leave-One-Out:")
print(confusion_matrix(y, y_pred_loo))

# --- Hold-Out Validation (70-30 split) ---
# Dividir en entrenamiento y prueba con un 70-30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Calcular centroides con el conjunto de entrenamiento
train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
train_data['Type_of_glass'] = y_train
centroids = train_data.groupby('Type_of_glass').mean()

# Validar en el conjunto de prueba
correct_predictions_holdout = 0
y_pred_holdout = []

for i, test_point in enumerate(X_test):
    predicted_class = classify_min_distance(test_point, centroids)
    y_pred_holdout.append(predicted_class)
    if predicted_class == y_test[i]:
        correct_predictions_holdout += 1

# Calcular y mostrar la precisión de Hold-Out
accuracy_holdout = correct_predictions_holdout / len(X_test)
print(f"Precisión de Hold-Out Validation (70-30): {accuracy_holdout:.2f}")

# Imprimir la matriz de confusión para Hold-Out
print("Matriz de Confusión Hold-Out:")
print(confusion_matrix(y_test, y_pred_holdout))

# --- 10-Fold Cross-Validation Estratificado ---
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
correct_predictions_kfold = 0
total_test_samples = 0
y_pred_kfold = []

for train_index, test_index in skf.split(X, y):
    # Dividir datos en entrenamiento y prueba para el fold actual
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Calcular centroides con los datos de entrenamiento del fold actual
    train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
    train_data['Type_of_glass'] = y_train
    centroids = train_data.groupby('Type_of_glass').mean()
    
    # Validar en el conjunto de prueba
    for i, test_point in enumerate(X_test):
        predicted_class = classify_min_distance(test_point, centroids)
        y_pred_kfold.append(predicted_class)
        if predicted_class == y_test[i]:
            correct_predictions_kfold += 1
    
    # Actualizar el conteo de muestras de prueba
    total_test_samples += len(X_test)

# Calcular y mostrar la precisión de 10-Fold Cross-Validation
accuracy_kfold = correct_predictions_kfold / total_test_samples
print(f"Precisión de 10-Fold Cross-Validation Estratificado: {accuracy_kfold:.2f}")

# Imprimir la matriz de confusión para 10-Fold Cross-Validation
print("Matriz de Confusión 10-Fold Cross-Validation:")
print(confusion_matrix(y, y_pred_kfold))

Precisión de Leave-One-Out Cross-Validation: 0.89
Matriz de Confusión Leave-One-Out:
[[70  0  0  0  0  0]
 [ 2 59 15  0  0  0]
 [ 0  0 16  1  0  0]
 [ 0  0  0 12  1  0]
 [ 0  0  0  0  9  0]
 [ 0  0  0  0  5 24]]
Precisión de Hold-Out Validation (70-30): 0.89
Matriz de Confusión Hold-Out:
[[19  0  0  0  0  0]
 [ 0 18  5  0  0  0]
 [ 0  0  4  0  0  0]
 [ 0  0  0  5  1  0]
 [ 0  0  0  0  3  0]
 [ 0  0  0  0  1  9]]
Precisión de 10-Fold Cross-Validation Estratificado: 0.89
Matriz de Confusión 10-Fold Cross-Validation:
[[27 19  9  3  5  7]
 [24 23 13  6  3  7]
 [ 7  5  0  1  1  3]
 [ 4  0  4  0  3  2]
 [ 3  6  0  0  0  0]
 [ 7  6  6  2  4  4]]


SMOTE

In [6]:
from imblearn.over_sampling import SMOTE
from collections import Counter

counter = Counter(y_train)
print('Before', counter)

# oversampling the train dataset using SMOTE
smt = SMOTE()
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

counter = Counter(y_train_sm)
print('After', counter)

Before Counter({np.int64(1): 69, np.int64(0): 63, np.int64(5): 26, np.int64(2): 15, np.int64(3): 12, np.int64(4): 8})
After Counter({np.int64(0): 69, np.int64(1): 69, np.int64(2): 69, np.int64(3): 69, np.int64(4): 69, np.int64(5): 69})


In [None]:
Prediccion euclidiana con Metodo Smote aplicado

In [7]:



# Función para calcular la distancia Euclidiana
def euclidean_distance(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

# Función para clasificar un nuevo punto usando el algoritmo de distancia mínima
def classify_min_distance(new_point, centroids):
    min_distance = float('inf')
    closest_class = None
    
    for species, centroid in centroids.iterrows():
        distance = euclidean_distance(new_point, centroid)
        if distance < min_distance:
            min_distance = distance
            closest_class = species
    return closest_class

# --- Leave-One-Out Cross-Validation ---
loo = LeaveOneOut()
correct_predictions_loo = 0
y_pred_loo = []

for train_index, test_index in loo.split(X):
    # Dividir datos en entrenamiento y prueba
    X_train, X_test = X_train_sm[train_index], X_train_sm[test_index]
    y_train, y_test = y_train_sm[train_index], y_train_sm[test_index]
    
    # Calcular centroides con los datos de entrenamiento
    train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
    train_data['Type_of_glass'] = y_train
    centroids = train_data.groupby('Type_of_glass').mean()
    
    # Clasificar el punto de prueba
    predicted_class = classify_min_distance(X_test[0], centroids)
    y_pred_loo.append(predicted_class)
    
    # Comparar con la clase real
    if predicted_class == y_test[0]:
        correct_predictions_loo += 1

# Calcular y mostrar la precisión de Leave-One-Out
accuracy_loo = correct_predictions_loo / len(X)
print(f"Precisión de Leave-One-Out Cross-Validation: {accuracy_loo:.2f}")

# Imprimir la matriz de confusión para Leave-One-Out
print("Matriz de Confusión Leave-One-Out:")
print(confusion_matrix(y, y_pred_loo))

# --- Hold-Out Validation (70-30 split) ---
# Dividir en entrenamiento y prueba con un 70-30
X_train, X_test, y_train, y_test = train_test_split(X_train_sm, y_train_sm, test_size=0.3, random_state=42)

# Calcular centroides con el conjunto de entrenamiento
train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
train_data['Type_of_glass'] = y_train
centroids = train_data.groupby('Type_of_glass').mean()

# Validar en el conjunto de prueba
correct_predictions_holdout = 0
y_pred_holdout = []

for i, test_point in enumerate(X_test):
    predicted_class = classify_min_distance(test_point, centroids)
    y_pred_holdout.append(predicted_class)
    if predicted_class == y_test[i]:
        correct_predictions_holdout += 1

# Calcular y mostrar la precisión de Hold-Out
accuracy_holdout = correct_predictions_holdout / len(X_test)
print(f"Precisión de Hold-Out Validation (70-30): {accuracy_holdout:.2f}")

# Imprimir la matriz de confusión para Hold-Out
print("Matriz de Confusión Hold-Out:")
print(confusion_matrix(y_test, y_pred_holdout))

# --- 10-Fold Cross-Validation Estratificado ---
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
correct_predictions_kfold = 0
total_test_samples = 0
y_pred_kfold = []

for train_index, test_index in skf.split(X, y):
    # Dividir datos en entrenamiento y prueba para el fold actual
    X_train, X_test = X_train_sm[train_index], X_train_sm[test_index]
    y_train, y_test = y_train_sm[train_index], y_train_sm[test_index]
    
    # Calcular centroides con los datos de entrenamiento del fold actual
    train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
    train_data['Type_of_glass'] = y_train
    centroids = train_data.groupby('Type_of_glass').mean()
    
    # Validar en el conjunto de prueba
    for i, test_point in enumerate(X_test):
        predicted_class = classify_min_distance(test_point, centroids)
        y_pred_kfold.append(predicted_class)
        if predicted_class == y_test[i]:
            correct_predictions_kfold += 1
    
    # Actualizar el conteo de muestras de prueba
    total_test_samples += len(X_test)

# Calcular y mostrar la precisión de 10-Fold Cross-Validation
accuracy_kfold = correct_predictions_kfold / total_test_samples
print(f"Precisión de 10-Fold Cross-Validation Estratificado: {accuracy_kfold:.2f}")

# Imprimir la matriz de confusión para 10-Fold Cross-Validation
print("Matriz de Confusión 10-Fold Cross-Validation:")
print(confusion_matrix(y, y_pred_kfold))

Precisión de Leave-One-Out Cross-Validation: 0.90
Matriz de Confusión Leave-One-Out:
[[64  6  0  0  0  0]
 [ 0 49 27  0  0  0]
 [ 0  0  0 12  5  0]
 [ 0  0  0  0  9  4]
 [ 0  0  0  0  0  9]
 [ 6  0 15  0  0  8]]
Precisión de Hold-Out Validation (70-30): 0.94
Matriz de Confusión Hold-Out:
[[25  0  0  0  0  0]
 [ 1 22  3  0  0  0]
 [ 0  0 16  0  0  0]
 [ 0  0  0 20  0  0]
 [ 0  0  0  0 18  0]
 [ 0  0  0  0  3 17]]
Precisión de 10-Fold Cross-Validation Estratificado: 0.90
Matriz de Confusión 10-Fold Cross-Validation:
[[24 19 13  4  5  5]
 [24 21 15  3  6  7]
 [ 7  4  4  0  0  2]
 [ 4  0  4  1  1  3]
 [ 2  7  0  0  0  0]
 [ 9  4  6  4  2  4]]
