## Proyecto 1 - Aprendizaje Automático
### Integrantes:
- A. Badilla Olivas B80874
- Enrique Vilchez Lizano C18477
- Brandon Mora Umaña 
- Joseph Valverde Kong C18100

In [None]:
# Import necessary modules
import numpy as np

# Models
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Data handling
import pandas as pd

# Preprocessing tools
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# Model selection tools
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
def make_grid_search(
    model: object,
    search_space: dict,
    X: pd.DataFrame,
    Y: pd.Series,
    verbose: int = 1,
    *,
    scoring=("accuracy", "precision", "recall", "roc_auc"),
    refit="roc_auc",
    cv=5,
    n_jobs=None,
) -> tuple:
    """
    Run an exhaustive, cross-validated grid search for a scikit-learn model.

    Parameters:
    model (object): The scikit-learn estimator to tune.
    search_space (dict): The hyperparameter search space, forwarded as ``param_grid``.
    X (pd.DataFrame): The input features.
    Y (pd.Series): The target variable (forwarded unchanged to ``GridSearchCV.fit``).
    verbose (int, optional): Verbosity level. Default is 1.
    scoring (optional): Metric(s) evaluated on every fold. Default is accuracy,
        precision, recall and ROC AUC.
    refit (optional): Metric used to rank candidates and refit the best one.
        Default is "roc_auc". It must stay enabled, because the function reads
        ``best_score_`` and ``best_params_`` afterwards.
    cv (optional): Cross-validation strategy or number of folds. Default is 5.
    n_jobs (optional): Number of parallel jobs forwarded to ``GridSearchCV``.
        Default is None (the scikit-learn default).

    Returns:
    tuple: ``(best_score, best_params, results)`` where ``best_score`` is the
        best cross-validated score for the ``refit`` metric, ``best_params`` is
        the matching hyperparameter combination, and ``results`` is a DataFrame
        built from ``cv_results_``.

    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=search_space,
        scoring=scoring,
        refit=refit,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
    )

    grid_search.fit(X, Y)

    # Expose the complete per-candidate results so callers can inspect every metric
    results = pd.DataFrame(grid_search.cv_results_)
    return grid_search.best_score_, grid_search.best_params_, results

# Wine Quality Dataset

## Dataset Preparation

In [None]:
# Load the red wine quality dataset from the local "datasets" folder
# (path is relative to the working directory of the notebook)
wine_data = pd.read_csv("datasets/winequality_red.csv")

In [None]:
# Preview the first rows of the dataset
wine_data.head()

In [None]:
# Descriptive statistics for the dataset's columns
wine_data.describe()

In [None]:
# Pairwise correlation between the dataset's columns
wine_data.corr()

In [None]:
# Remove rows whose label is missing; they cannot be used for supervised training
wine_data = wine_data.dropna(axis=0, subset=["quality"])

# Separate the features from the label
X_wine = wine_data.drop(columns=["quality"])

# Binarize the label: quality above 6 becomes 1, everything else 0
y_wine = (wine_data["quality"] > 6).astype("int64").to_numpy()

# Standardize the features and restore the column names lost by the transform.
# NOTE: the scaler is fitted on the whole dataset before cross-validation, so
# every fold shares the same scaling statistics.
normalizer = StandardScaler()
X_wine_normalized = pd.DataFrame(
    normalizer.fit_transform(X_wine), columns=X_wine.columns.astype(str)
)

## Logistic Regression

In [None]:
# Hyperparameter grid explored for logistic regression
search_space_log = dict(
    fit_intercept=[True, False],  # Add bias constant or not
    solver=["lbfgs", "liblinear", "newton-cholesky", "newton-cg", "sag", "saga"],
    max_iter=[1, 5, 10, 30, 50, 100],
)

# Fixed seed so the stochastic solvers give repeatable results
log_classifier = LogisticRegression(random_state=1)

best_log_score, best_log_params, results_log = make_grid_search(
    log_classifier, search_space_log, X_wine_normalized, y_wine
)

In [None]:
# Report the best hyperparameter combination and the best cross-validated
# score returned by make_grid_search for logistic regression
print("Tuned hyperparameters (best parameters): ", best_log_params)
print("Best score:", best_log_score)

## Decision Trees

In [None]:
# Parameters selection
search_space_tree = {
    "criterion": ["gini", "entropy", "log_loss"],  # Loss criteria
    "splitter": ["best", "random"],  # Type of split for the nodes
    "max_depth": [1, 5, 10, 30, 50, 100],  # Maximum tree depth
    # Minimum number of samples needed to split a node. scikit-learn requires
    # an integer >= 2 (or a float fraction), so 1 is not a valid candidate:
    # every fit using it fails and is scored as NaN.
    "min_samples_split": [2, 5, 10],
}

tree_classifier = DecisionTreeClassifier(random_state=1)
best_tree_score, best_tree_params, tree_results = make_grid_search(
    model=tree_classifier, search_space=search_space_tree, X=X_wine_normalized, Y=y_wine
)

In [None]:
# Report the best hyperparameter combination and the best cross-validated
# score returned by make_grid_search for the decision tree
print("Tuned hyperparameters (best parameters): ", best_tree_params)
print("Best score :", best_tree_score)

## kNN

In [None]:
# Hyperparameter grid explored for k-nearest neighbours
knn_search_space = dict(
    n_neighbors=[3, 5, 7, 9, 11],  # Number of neighbours that vote
    weights=["uniform", "distance"],  # How neighbour votes are weighted
    algorithm=["ball_tree", "kd_tree", "brute"],  # Neighbour search structure
    p=[1, 2],  # Power parameter of the Minkowski metric
)

knn_classifier = KNeighborsClassifier()

best_knn_score, best_knn_params, results_knn = make_grid_search(
    knn_classifier, knn_search_space, X_wine_normalized, y_wine
)

In [None]:
# Report the best hyperparameter combination and the best cross-validated
# score returned by make_grid_search for kNN
print("Tuned hyperparameters (best parameters): ", best_knn_params)
print("Best score :", best_knn_score)

## Neural Network

In [None]:
# Parameters selection
search_space_mlp = {
    "activation": ["identity", "logistic", "tanh", "relu"],
    # Valid MLPClassifier solvers are "lbfgs", "sgd" and "adam"; the previous
    # "adams" entry made every fit that used it fail.
    "solver": ["lbfgs", "sgd", "adam"],
    # NOTE: per the scikit-learn docs, learning_rate is only used with "sgd",
    # and early_stopping only with "sgd" or "adam"; for other solvers these
    # entries repeat equivalent candidates.
    "learning_rate": ["constant", "invscaling", "adaptive"],
    "warm_start": [True, False],
    "early_stopping": [True, False],
}

mlp_classifier = MLPClassifier(random_state=1)
best_mlp_score, best_mlp_params, mlp_results = make_grid_search(
    model=mlp_classifier, search_space=search_space_mlp, X=X_wine_normalized, Y=y_wine
)

In [None]:
# Report the best hyperparameter combination and the best cross-validated
# score returned by make_grid_search for the neural network
print("Tuned hyperparameters (best parameters): ", best_mlp_params)
print("Best score :", best_mlp_score)

# Credit Card Dataset

## Dataset Preparation

In [None]:
# Read dataset
credit_card_data = pd.read_csv("datasets/credit_card_approvals.csv")

# Remove rows whose label is missing
credit_card_data = credit_card_data.dropna(axis=0, subset=["Approved"])

# Separate the label from the features
y_credit = credit_card_data["Approved"]
X_credit = credit_card_data.drop(columns=["Approved"])

# Get categorical and numerical columns.
# NOTE: columns whose dtype is neither object, int64 nor float64 end up in
# neither list and are dropped by ColumnTransformer (remainder="drop" default).
cat_cols = X_credit.select_dtypes(include="object").columns.tolist()
num_cols = X_credit.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Standardize numerical values and ordinal-encode categorical ones
pipeline = ColumnTransformer(
    [
        ("numerical", StandardScaler(), num_cols),
        ("categorical", OrdinalEncoder(), cat_cols),
    ]
)

# The transformer emits columns in the order its steps are listed, so the
# names are restored as numerical columns followed by categorical columns.
X_credit_normalized = pd.DataFrame(
    pipeline.fit_transform(X_credit), columns=num_cols + cat_cols
)

## Logistic Regression

## Decision Trees

## kNN

## Neural Network