## Proyecto 1 - Aprendizaje Automático
### Integrantes:
- A. Badilla Olivas B80874
- Enrique Vilchez Lizano C18477
- Brandon Mora Umaña 
- Joseph Valverde Kong C18100

In [None]:
# Import necessary modules
import numpy as np

# Models
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Data handling
import pandas as pd

# Preprocessing tools
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

# Model selection tools
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold

# Plots
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import seaborn as sns

# Utils
from typing import Tuple, List
import warnings

In [None]:
def make_grid_search(
    model: object, search_space: dict, X: pd.DataFrame, Y: pd.Series, verbose: int = 1
) -> tuple:
    """
    Run an exhaustive cross-validated hyperparameter search for a model.

    Every combination in ``search_space`` is evaluated with 5-fold cross
    validation. Accuracy, precision, recall and ROC AUC are recorded for each
    candidate; the winner is the candidate with the best ROC AUC.

    Args:
        model (object): Scikit-learn estimator to tune.
        search_space (dict): Hyperparameter names mapped to the values to try.
        X (pd.DataFrame): The input features.
        Y (pd.Series): The target variable.
        verbose (int, optional): Verbosity level forwarded to the search. Default is 1.

    Returns:
        tuple: ``(best_score, best_params, results)`` where ``best_score`` is
        the best mean cross-validated ROC AUC, ``best_params`` is the matching
        hyperparameter dict and ``results`` is a DataFrame built from the
        search's ``cv_results_``.

    """
    tracked_scorers = ["accuracy", "precision", "recall", "roc_auc"]

    searcher = GridSearchCV(
        model,
        search_space,
        scoring=tracked_scorers,
        refit="roc_auc",  # the metric used to rank candidates
        cv=5,
        verbose=verbose,
    )
    searcher.fit(X, Y)

    per_candidate = pd.DataFrame(searcher.cv_results_)
    return searcher.best_score_, searcher.best_params_, per_candidate

In [None]:
def fit_and_get_metrics(model: object, X: pd.DataFrame, Y: pd.Series, n_folds: int) -> Tuple[pd.DataFrame, List[List[float]]]:
    """
    Cross-validate a binary classifier and report the mean of each metric.

    The model is re-fitted on every stratified fold. Accuracy, precision and
    recall are computed from the predicted labels. ROC AUC is computed from
    the model's continuous scores (``predict_proba`` or ``decision_function``)
    when available, and from the predicted labels otherwise.

    Args:
        model (object): Scikit-learn classifier to evaluate. It is fitted in place.
        X (pd.DataFrame): The input features.
        Y (pd.Series): The binary target variable.
        n_folds (int): Number of folds for cross validation.

    Returns:
        Tuple[pd.DataFrame, List[List[float]]]: A data frame with the columns
        'Metric' and 'Value' holding the mean accuracy, precision, recall and
        ROC AUC over all folds, and the confusion matrix of the FIRST fold only.

    """
    # Get cross validation indices
    kf = StratifiedKFold(n_splits=n_folds)

    # split() yields positional indices, so the labels must be sliced by
    # position as well. Plain Y[...] on a Series is label-based and breaks when
    # the index is not 0..n-1 (e.g. after rows were dropped).
    labels = Y.iloc if hasattr(Y, "iloc") else Y

    # Per-fold metrics
    accuracies = []
    precisions = []
    recalls = []
    rocs = []
    confusion = []

    # Cross validate
    for iteration, (train_index, test_index) in enumerate(kf.split(X, Y)):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = labels[train_index], labels[test_index]

        model.fit(X_train, Y_train)

        # Label-based metrics
        y_predicted = model.predict(X_test)
        accuracies.append(accuracy_score(Y_test, y_predicted))
        precisions.append(precision_score(Y_test, y_predicted))
        recalls.append(recall_score(Y_test, y_predicted))

        # ROC AUC is a ranking metric: feed it continuous scores so it matches
        # what the "roc_auc" scorer measures during the grid search.
        if hasattr(model, "predict_proba"):
            # Column 1 is the probability of the class with the greater label
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test)
        else:
            y_score = y_predicted
        rocs.append(roc_auc_score(Y_test, y_score))

        # Only the first fold's confusion matrix is kept
        if iteration == 0:
            confusion = confusion_matrix(Y_test, y_predicted)

    columns = ['Accuracy', 'Precision', 'Recall', 'Roc & Auc']
    metrics = [np.mean(accuracies), np.mean(precisions), np.mean(recalls), np.mean(rocs)]

    dataframe = {'Metric': columns, 'Value': metrics}
    return pd.DataFrame(dataframe), confusion

In [None]:
def plot_metrics(metrics: pd.DataFrame, title: str = None, color: object = None):
    """
    Plot metrics as a horizontal bar plot and show it.

    Args:
        metrics (pd.DataFrame): Data frame with a 'Metric' column (bar labels)
            and a 'Value' column (bar lengths).
        title (str, optional): Title for the plot. Defaults to None, in which
            case 'Model Evaluation Metrics' is used.
        color (object, optional): Callable mapping values in [0, 1] to colors,
            e.g. a matplotlib colormap. Each bar is shaded according to its
            min-max rescaled value. Defaults to None (matplotlib's default color).
    """
    values = metrics['Value']

    # Create the bar plot
    plt.figure(figsize=(8, 4))
    if color is not None:
        # Min-max rescale the values to [0, 1] to pick a shade per bar
        lowest, highest = np.min(values), np.max(values)
        span = highest - lowest
        if span > 0:
            shades = (values - lowest) / span
        else:
            # All values are equal: avoid 0/0 (NaN colors) and use the mid shade
            shades = np.full(len(values), 0.5)
        bar_plot = plt.barh(metrics['Metric'], values, color=color(shades))
    else:
        bar_plot = plt.barh(metrics['Metric'], values)

    # Add a border
    for bar in bar_plot:
        bar.set_edgecolor("black")
        bar.set_linewidth(0.5)

    x_offset = 0.01
    # Add exact values on the bars
    for index, value in enumerate(values):
        plt.text(value + x_offset, index, f'{value:.4f}', va='center', ha='left', fontsize=8)

    # Add labels and title
    plt.xlabel('Value')
    plt.ylabel('Metric')
    if title is not None:
        plt.title(title)
    else:
        plt.title('Model Evaluation Metrics')

    # Increase limits so the value labels fit to the right of the longest bar
    plt.xlim(0, max(values) * 1.15)

    # Invert y-axis to have the bars ordered top to bottom
    plt.gca().invert_yaxis()
    plt.show()

In [None]:
def plot_confusion_matrix(confusion_matrix: List[List[float]], labels: List[str] = None, title: str = None, colors: object = None):
    """
    Draw a confusion matrix as a heat map and show it.

    Args:
        confusion_matrix (List[List[float]]): Confusion matrix to draw.
        labels (List[str], optional): Display labels for the classes. Defaults to None (number from 0 to n_classes are chosen).
        title (str, optional): Title placed above the plot. Defaults to None (no title).
        colors (object, optional): Colormap used for the heat map. Defaults to None.
    """
    matrix_view = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix,
        display_labels=labels,
    )
    # plot() returns the display itself, whose ax_ holds the drawn axes
    axes = matrix_view.plot(cmap=colors).ax_

    # The title is optional
    if title is not None:
        axes.set_title(title)

    plt.show()

# Wine Quality Dataset

## Dataset Preparation

In [None]:
# Load the red wine quality dataset from the local datasets folder
wine_data = pd.read_csv("datasets/winequality_red.csv")

In [None]:
# Preview the first rows of the wine dataset
wine_data.head()

In [None]:
# Summary statistics of the wine dataset's columns
wine_data.describe()

In [None]:
# Pairwise correlation between the wine dataset's columns
wine_data.corr()

In [None]:
# Remove rows with empty data on the label
wine_data.dropna(axis=0, subset=["quality"], inplace=True)
# Dropping rows leaves gaps in the index. Renumber it so the labels share the
# same 0..n-1 index as the normalized features, which are rebuilt from a bare
# array below and therefore get a fresh index.
wine_data.reset_index(drop=True, inplace=True)

# Separate data
y_wine = wine_data["quality"]
X_wine = wine_data.drop(columns=["quality"])
# NOTE: only the last expression of a notebook cell is displayed, so this call
# (and value_counts below) shows nothing unless moved to its own cell.
X_wine.describe()

# Modify quality labels to 1 (quality above 6) or 0 (otherwise)
y_wine = y_wine.apply(lambda x: 1 if x > 6 else 0)
y_wine.value_counts()

# Scale the data
normalizer = StandardScaler()
X_wine_normalized = pd.DataFrame(normalizer.fit_transform(X_wine))

# After we transform the data, we should put back the column names
X_wine_normalized.columns = X_wine.columns.astype(str)

## Logistic Regression

In [None]:
# Suppress all warnings. The filter must be installed BEFORE the search runs,
# otherwise the warnings raised during the search are still printed.
# (Presumably these are convergence warnings from the very small max_iter values.)
warnings.filterwarnings("ignore")

# Parameters selection
search_space_log = {
    "fit_intercept": [True, False],  # Add bias constant or not
    "solver": ["lbfgs", "liblinear", "newton-cholesky", "newton-cg", "sag", "saga"],
    "max_iter": [1, 5, 10, 30, 50, 100],  # Maximum number of solver iterations
}

log_classifier = LogisticRegression(random_state=1)
best_log_score, best_log_params, results_log = make_grid_search(
    model=log_classifier, search_space=search_space_log, X=X_wine_normalized, Y=y_wine
)

In [None]:
# Report the best logistic regression hyperparameters and their
# mean cross-validated ROC AUC (the metric the grid search refits on)
print("Tuned hyperparameters (best parameters): ", best_log_params)
print("Best score:", best_log_score)

In [None]:
# Rebuild the classifier with the tuned hyperparameters
log_classifier = LogisticRegression(random_state=1, **best_log_params)
# Mean metrics over 5 stratified folds, plus the first fold's confusion matrix
metrics, confusion_mat = fit_and_get_metrics(
    model=log_classifier, X=X_wine_normalized, Y=y_wine, n_folds=5
)
# Bar plot of the mean metrics
plot_metrics(
    metrics=metrics,
    title='Wine Dataset Logistic Regression Metrics',
    color=plt.cm.Blues
)

In [None]:
# Confusion matrix of the first cross-validation fold (label 0 = bad, 1 = good)
plot_confusion_matrix(
    confusion_matrix=confusion_mat,
    labels=['Bad Wine', 'Good Wine'],
    title='Wine Dataset Logistic Regression Confusion Matrix (First Iteration)',
    colors=plt.cm.Blues
)

## Decision Trees

In [None]:
# Parameters selection
search_space_tree = {
    "criterion": ["gini", "entropy", "log_loss"],  # Loss criteria
    "splitter": ["best", "random"],  # Type of split for the nodes
    "max_depth": [1, 5, 10, 30, 50, 100],  # Maximum tree depth
    # Minimum number of samples needed to split a node.
    # An integer value must be at least 2, so 1 is not a valid candidate.
    "min_samples_split": [2, 5, 10],
}

tree_classifier = DecisionTreeClassifier(random_state=1)
best_tree_score, best_tree_params, tree_results = make_grid_search(
    model=tree_classifier, search_space=search_space_tree, X=X_wine_normalized, Y=y_wine
)

In [None]:
# Report the best decision tree hyperparameters and their
# mean cross-validated ROC AUC (the metric the grid search refits on)
print("Tuned hyperparameters (best parameters): ", best_tree_params)
print("Best score :", best_tree_score)

In [None]:
# Rebuild the classifier with the tuned hyperparameters
tree_classifier = DecisionTreeClassifier(random_state=1, **best_tree_params)
# Mean metrics over 5 stratified folds, plus the first fold's confusion matrix
metrics, confusion_mat = fit_and_get_metrics(
    model=tree_classifier, X=X_wine_normalized, Y=y_wine, n_folds=5
)
# Bar plot of the mean metrics
plot_metrics(
    metrics=metrics,
    title='Wine Dataset Decision Tree Metrics',
    color=plt.cm.Reds
)

In [None]:
# Confusion matrix of the first cross-validation fold (label 0 = bad, 1 = good)
plot_confusion_matrix(
    confusion_matrix=confusion_mat,
    labels=['Bad Wine', 'Good Wine'],
    title='Wine Dataset Decision Tree Confusion Matrix (First Iteration)',
    colors=plt.cm.Reds
)

## KNN

In [None]:
# Parameters selection
knn_search_space = {
    "n_neighbors": [3, 5, 7, 9, 11],  # Number of neighbors that vote
    "weights": ["uniform", "distance"],  # Equal votes or votes weighted by distance
    "algorithm": ["ball_tree", "kd_tree", "brute"],  # Neighbor search strategy
    "p": [1, 2],  # Power of the Minkowski distance
}

knn_classifier = KNeighborsClassifier()

# Exhaustive search over the space above on the normalized wine data
best_knn_score, best_knn_params, results_knn = make_grid_search(
    model=knn_classifier, search_space=knn_search_space, X=X_wine_normalized, Y=y_wine
)

In [None]:
# Report the best KNN hyperparameters and their
# mean cross-validated ROC AUC (the metric the grid search refits on)
print("Tuned hyperparameters (best parameters): ", best_knn_params)
print("Best score :", best_knn_score)

In [None]:
# Rebuild the classifier with the tuned hyperparameters
knn_classifier = KNeighborsClassifier(**best_knn_params)
# Mean metrics over 5 stratified folds, plus the first fold's confusion matrix
metrics, confusion_mat = fit_and_get_metrics(
    model=knn_classifier, X=X_wine_normalized, Y=y_wine, n_folds=5
)
# Bar plot of the mean metrics
plot_metrics(
    metrics=metrics,
    title='Wine Dataset KNN Metrics',
    color=plt.cm.Greens
)

In [None]:
# Confusion matrix of the first cross-validation fold (label 0 = bad, 1 = good)
plot_confusion_matrix(
    confusion_matrix=confusion_mat,
    labels=['Bad Wine', 'Good Wine'],
    title='Wine Dataset KNN Confusion Matrix (First Iteration)',
    colors=plt.cm.Greens
)

## Neural Network

In [None]:
# Parameters selection
search_space_mlp = {
    "activation": ["identity", "logistic", "tanh", "relu"],  # Hidden layer activation
    # Weight optimizer. The valid name is "adam"; "adams" is rejected by MLPClassifier.
    "solver": ["lbfgs", "sgd", "adam"],
    "learning_rate": ["constant", "invscaling", "adaptive"],  # Learning rate schedule
    "warm_start": [True, False],
    "early_stopping": [True, False],
}

mlp_classifier = MLPClassifier(random_state=1)
best_mlp_score, best_mlp_params, mlp_results = make_grid_search(
    model=mlp_classifier, search_space=search_space_mlp, X=X_wine_normalized, Y=y_wine
)

In [None]:
# Report the best neural network hyperparameters and their
# mean cross-validated ROC AUC (the metric the grid search refits on)
print("Tuned hyperparameters (best parameters): ", best_mlp_params)
print("Best score :", best_mlp_score)

In [None]:
# Rebuild the classifier with the tuned hyperparameters
mlp_classifier = MLPClassifier(random_state=1, **best_mlp_params)
# Mean metrics over 5 stratified folds, plus the first fold's confusion matrix
metrics, confusion_mat = fit_and_get_metrics(
    model=mlp_classifier, X=X_wine_normalized, Y=y_wine, n_folds=5
)
# Bar plot of the mean metrics
plot_metrics(
    metrics=metrics,
    title='Wine Dataset Neural Network Metrics',
    color=plt.cm.Purples
)

In [None]:
# Confusion matrix of the first cross-validation fold (label 0 = bad, 1 = good)
plot_confusion_matrix(
    confusion_matrix=confusion_mat,
    labels=['Bad Wine', 'Good Wine'],
    title='Wine Dataset Neural Network Confusion Matrix (First Iteration)',
    colors=plt.cm.Purples
)

# Credit Card Dataset

## Dataset Preparation

In [None]:
# Load the credit card approvals dataset from the local datasets folder
credit_card_data = pd.read_csv("datasets/credit_card_approvals.csv")

In [None]:
# Preview the first rows of the credit card dataset
credit_card_data.head()

In [None]:
# Summary statistics of the credit card dataset's columns
credit_card_data.describe()

In [None]:
# Remove rows with empty data on the label
credit_card_data.dropna(axis=0, subset=["Approved"], inplace=True)
# Dropping rows leaves gaps in the index. Renumber it so the labels share the
# same 0..n-1 index as the transformed features, which are rebuilt from a bare
# array below and therefore get a fresh index.
credit_card_data.reset_index(drop=True, inplace=True)

# Separate data
y_credit = credit_card_data["Approved"]
X_credit = credit_card_data.drop(columns=["Approved"])

# Get categorical and numerical columns
cat_cols = X_credit.select_dtypes(include="object").columns.tolist()
num_cols = X_credit.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Normalize numerical values and transform categorical ones into ordinal codes.
# Columns in neither list are not passed to any transformer.
pipeline = ColumnTransformer(
    [
        ("numerical", StandardScaler(), num_cols),
        ("categorical", OrdinalEncoder(), cat_cols),
    ]
)

# The transformer outputs the numerical block first, then the categorical one,
# so the column names are restored in that same order.
X_credit_normalized = pd.DataFrame(pipeline.fit_transform(X_credit))
X_credit_normalized.columns = num_cols + cat_cols

## Logistic Regression

## Decision Trees

## KNN

## Neural Network