## Proyecto 1 - Aprendizaje Automático
### Integrantes:
- A. Badilla Olivas B80874
- Enrique Vilchez Lizano C18477
- Brandon Mora Umaña 
- Joseph Valverde Kong C18100

In [None]:
# Import necessary modules
import numpy as np
# Models
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Utils
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

from sklearn.model_selection import GridSearchCV, train_test_split

# Wine Quality Dataset

## Dataset Preparation

In [None]:
# Load the red wine quality data and take a first look at it
wine_data = pd.read_csv("datasets/winequality_red.csv")
wine_data.head()
wine_data.corr()

# Keep only the samples that actually carry a quality label
wine_data = wine_data.dropna(axis=0, subset=["quality"])

# Split the frame into the feature matrix and the target column
X_wine = wine_data.drop(columns=["quality"])
y_wine = wine_data["quality"]
X_wine.describe()

# Binarize the target: a quality strictly above 6 becomes 1, anything else 0,
# and hand the labels over as a plain NumPy array
y_wine = (y_wine > 6).astype("int64").to_numpy()

# Standardize every feature with StandardScaler and rebuild a DataFrame that
# keeps the original feature names (as strings)
normalizer = StandardScaler()
X_wine_normalized = pd.DataFrame(
    normalizer.fit_transform(X_wine),
    columns=X_wine.columns.astype(str),
)
X_wine_normalized


## Logistic Regression

In [None]:
# Hyperparameter grid explored for the logistic regression model
search_space_log = dict(
    fit_intercept=[True, False],  # with or without a bias term
    solver=["lbfgs", "liblinear", "newton-cholesky", "newton-cg", "sag", "saga"],
    max_iter=[1, 5, 10, 30, 50, 100],  # cap on solver iterations
)

# Exhaustive search with 5-fold cross-validation. Four metrics are recorded
# for every candidate, and the one with the best ROC AUC is refit on all data.
log_classifier = LogisticRegression(random_state=1)
grid_search_log = GridSearchCV(
    log_classifier,
    search_space_log,
    scoring=["accuracy", "precision", "recall", "roc_auc"],
    refit="roc_auc",
    cv=5,
    verbose=1,
)

# fit() returns the fitted search object, so the per-candidate results can be
# tabulated straight away
result = pd.DataFrame(
    grid_search_log.fit(X_wine_normalized, y_wine).cv_results_
)

In [None]:
# Report the winning logistic regression configuration and its mean
# cross-validated score for the refit metric
for label, value in (
    ("Tuned hyperparameters (best parameters): ", grid_search_log.best_params_),
    ("Best score:", grid_search_log.best_score_),
):
    print(label, value)

## Decision Trees

In [None]:
# Hyperparameter grid explored for the decision tree model
search_space_tree = {
    "criterion": ["gini", "entropy", "log_loss"],  # Split-quality criterion
    "splitter": ["best", "random"],  # Strategy used to choose the split at each node
    "max_depth": [1, 5, 10, 30, 50, 100],  # Maximum tree depth
    # Minimum number of samples needed to split a node. An integer value must
    # be >= 2: scikit-learn rejects 1, so every candidate using it would fail
    # to fit and be scored as NaN by GridSearchCV.
    "min_samples_split": [2, 5, 10],
}

# Exhaustive search with 5-fold cross-validation. Four metrics are recorded
# for every candidate, and the one with the best ROC AUC is refit on all data.
tree_classifier = DecisionTreeClassifier(random_state=1)
grid_search_tree = GridSearchCV(
    estimator=tree_classifier,
    param_grid=search_space_tree,
    scoring=["accuracy", "precision", "recall", "roc_auc"],
    refit="roc_auc",
    cv=5,
    verbose=1,
)

grid_search_tree.fit(X_wine_normalized, y_wine)
# Per-candidate cross-validation results as a table
results_tree = pd.DataFrame(grid_search_tree.cv_results_)

In [None]:
# Report the winning decision tree configuration and its mean
# cross-validated score for the refit metric
for label, value in (
    ("Tuned hyperparameters (best parameters): ", grid_search_tree.best_params_),
    ("Best score :", grid_search_tree.best_score_),
):
    print(label, value)

## kNN

In [None]:
# Hyperparameter grid explored for the k-nearest-neighbours model
knn_search_space = dict(
    n_neighbors=[3, 5, 7, 9, 11],  # neighbourhood size
    weights=["uniform", "distance"],  # how neighbour votes are weighted
    algorithm=["ball_tree", "kd_tree", "brute"],  # neighbour search structure
    p=[1, 2],  # Minkowski power: 1 = Manhattan, 2 = Euclidean
)

# Exhaustive search with 5-fold cross-validation. Four metrics are recorded
# for every candidate, and the one with the best ROC AUC is refit on all data.
knn_classifier = KNeighborsClassifier()
knn_search_grid = GridSearchCV(
    knn_classifier,
    knn_search_space,
    scoring=["accuracy", "precision", "recall", "roc_auc"],
    refit="roc_auc",
    cv=5,
    verbose=1,
)

# fit() returns the fitted search object, so the per-candidate results can be
# tabulated straight away
results_knn = pd.DataFrame(
    knn_search_grid.fit(X_wine_normalized, y_wine).cv_results_
)

In [None]:
# Report the winning kNN configuration and its mean cross-validated score
# for the refit metric
for label, value in (
    ("Tuned hyperparameters (best parameters): ", knn_search_grid.best_params_),
    ("Best score :", knn_search_grid.best_score_),
):
    print(label, value)

## Neural Network

In [None]:
# Hyperparameter grid explored for the multi-layer perceptron
search_space_mlp = {
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    # Valid MLPClassifier solvers are 'lbfgs', 'sgd' and 'adam'. The previous
    # value 'adams' is rejected by scikit-learn, so those candidates failed to
    # fit and the Adam optimizer was never actually evaluated.
    'solver': ['lbfgs', 'sgd', 'adam'],
    # Per the scikit-learn docs the learning-rate schedule is only used by the
    # 'sgd' solver; for the other solvers these candidates are duplicates.
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'warm_start': [True, False],
    # Per the scikit-learn docs early stopping only takes effect with the
    # 'sgd' and 'adam' solvers.
    'early_stopping': [True, False],
}
# TODO: consider tuning regularization as well. MLPClassifier's `alpha` is an
# L2 penalty, which shrinks the weights toward 0 (an L1 penalty would instead
# drive some weights to exactly 0).

# Exhaustive search with 5-fold cross-validation, run on 6 parallel workers.
# Four metrics are recorded for every candidate, and the one with the best
# ROC AUC is refit on all data.
mlp_classifier = MLPClassifier(random_state=1)
grid_search_mlp = GridSearchCV(
    estimator=mlp_classifier,
    param_grid=search_space_mlp,
    scoring=["accuracy", "precision", "recall", "roc_auc"],
    refit="roc_auc",
    cv=5,
    n_jobs=6,
    verbose=1,
)

grid_search_mlp.fit(X_wine_normalized, y_wine)
# Per-candidate cross-validation results as a table
result_mlp = pd.DataFrame(grid_search_mlp.cv_results_)

In [None]:
# Report the winning neural network configuration and its mean
# cross-validated score for the refit metric
for label, value in (
    ("Tuned hyperparameters (best parameters): ", grid_search_mlp.best_params_),
    ("Best score :", grid_search_mlp.best_score_),
):
    print(label, value)

# Credit Card Dataset

## Dataset Preparation

In [None]:
# Load the credit card approvals data and take a first look at it
credit_card_data = pd.read_csv("datasets/credit_card_approvals.csv")
credit_card_data.head()

# Keep only the samples that actually carry an approval label
credit_card_data = credit_card_data.dropna(axis=0, subset=["Approved"])

# Split the frame into the feature matrix and the target column
X_credit = credit_card_data.drop(columns=["Approved"])
y_credit = credit_card_data["Approved"]

# Partition the feature names by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical
num_cols = X_credit.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X_credit.select_dtypes(include="object").columns.tolist()

# Standardize the numerical columns and ordinal-encode the categorical ones
transformers = [
    ("numerical", StandardScaler(), num_cols),
    ("categorical", OrdinalEncoder(), cat_cols),
]
pipeline = ColumnTransformer(transformers)

# The transformer emits the numerical block first and the categorical block
# second, so the column labels are restored in that same order
X_credit_normalized = pd.DataFrame(
    pipeline.fit_transform(X_credit),
    columns=num_cols + cat_cols,
)
X_credit_normalized

## Logistic Regression

## Decision Trees

## kNN

## Neural Network