<a href="https://colab.research.google.com/github/DiogoFerreiraAlves02/MSC/blob/main/MSC2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Exercício 2 - Realizado por Diogo Alves 748 & Joana Ferreira 749 & Adriana Sousa 11006 & João Soares 11011 & Pedro Soares 11004**

**a. / b.**

In [12]:
from itertools import product

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
url = "https://raw.githubusercontent.com/zfpro/msc22-23/main/Introduction%20to%20scikit-learn/datasets/adult-census.csv"
dataset = pd.read_csv(url)

# One-hot encode the categorical variables; the 'class' column is the target
X = pd.get_dummies(dataset.drop('class', axis=1))
y = dataset['class']

# Split the dataset into training (70%) and test (30%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create the decision tree classifier with max_depth = 5.
# random_state is pinned so repeated runs build the same tree: scikit-learn
# permutes the candidate features at each split, so an unseeded tree can
# differ between runs even though the train/test split above is seeded.
tree = DecisionTreeClassifier(max_depth=5, random_state=42)

# Train the decision tree
tree.fit(X_train, y_train)

# Predict on the test set
y_pred = tree.predict(X_test)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8579813007575241


**c.**

In [13]:
def display_confusion_matrix(model, X_test, y_test):
    """Plot the confusion matrix of a fitted classifier and print its accuracy.

    Args:
        model: Fitted classifier exposing ``predict`` and ``classes_``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        None. The matrix is drawn with ``ConfusionMatrixDisplay.plot`` and the
        accuracy is printed to stdout.
    """
    y_pred = model.predict(X_test)
    # Pin the row/column order to model.classes_ so the matrix always has one
    # row and column per class the model knows, in the same order as the tick
    # labels below -- even when a class is missing from y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    disp.plot()
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)


**d.**

In [14]:
def plot_roc_curve(model, X_test, y_test):
    """Plot the ROC curve of a fitted binary classifier and print its AUC.

    The positive class is taken to be ``model.classes_[1]``, i.e. the class
    whose probability is read from column 1 of ``predict_proba``.

    Args:
        model: Fitted binary classifier exposing ``predict_proba`` and
            ``classes_``.
        X_test: Feature matrix to score.
        y_test: True labels for ``X_test``.

    Returns:
        None. The figure is shown with ``plt.show()`` and the AUC is printed
        to stdout.
    """
    positive_label = model.classes_[1]
    y_scores = model.predict_proba(X_test)[:, 1]
    # pos_label must be given explicitly: without it roc_curve only accepts
    # labels in {0, 1} or {-1, 1} and raises for string targets such as the
    # 'class' column used in this notebook.
    fpr, tpr, _ = roc_curve(y_test, y_scores, pos_label=positive_label)
    auc = roc_auc_score(y_test, y_scores)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, label='ROC curve (AUC = %0.2f)' % auc)
    # Diagonal reference line from (0, 0) to (1, 1)
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic')
    ax.legend(loc='lower right')
    plt.show()
    print("AUC:", auc)


**e.**

In [15]:
def holdout_estimation(model, X, y, test_size, random_state=42):
    """Estimate a model's accuracy with a single train/test (holdout) split.

    Args:
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place on the training part of the split.
        X: Feature matrix.
        y: Target labels.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Seed for the split. Defaults to 42, the value that was
            previously hard-coded, so existing calls give the same split.

    Returns:
        The accuracy of ``model`` on the held-out part of the split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

**f.**

In [16]:
def cv_estimation(model, X, y, k):
    """Summarise a model's cross-validation scores.

    Args:
        model: Estimator handed to ``cross_val_score``.
        X: Feature matrix.
        y: Target labels.
        k: Passed to ``cross_val_score`` as ``cv``.

    Returns:
        A ``(mean, std)`` tuple computed over the scores returned by
        ``cross_val_score``.
    """
    fold_scores = cross_val_score(model, X, y, cv=k)
    return fold_scores.mean(), fold_scores.std()

**g.**

In [17]:
# Candidate values for each decision-tree hyperparameter explored below
hyperparameters = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3],
    'min_impurity_decrease': [0.0, 0.1, 0.2]
}

# Evaluate every combination of the candidate values. product() walks the grid
# in the same order as four nested loops over the dict's keys (last key varies
# fastest), so the report order is unchanged.
for max_depth, min_samples_split, min_samples_leaf, min_impurity_decrease in product(*hyperparameters.values()):
    # random_state is pinned so each configuration gives the same tree on
    # every run and the printed scores are reproducible.
    # NOTE(review): this rebinds the notebook-level name `tree`, replacing the
    # model fitted in the first cell -- confirm no later cell expects that one.
    tree = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_impurity_decrease=min_impurity_decrease,
        random_state=42,
    )
    accuracy_holdout = holdout_estimation(tree, X, y, 0.3)
    mean_accuracy_cv, std_accuracy_cv = cv_estimation(tree, X, y, 5)
    print("max_depth:", max_depth, "min_samples_split:", min_samples_split, "min_samples_leaf:", min_samples_leaf, "min_impurity_decrease:", min_impurity_decrease)
    print("Accuracy (Holdout):", accuracy_holdout)
    print("Accuracy (CV):", mean_accuracy_cv, "+/-", std_accuracy_cv)
    print()

max_depth: 3 min_samples_split: 2 min_samples_leaf: 1 min_impurity_decrease: 0.0
Accuracy (Holdout): 0.8506107964239404
Accuracy (CV): 0.8431882819921513 +/- 0.0019447484696599726

max_depth: 3 min_samples_split: 2 min_samples_leaf: 1 min_impurity_decrease: 0.1
Accuracy (Holdout): 0.766600696103187
Accuracy (CV): 0.7607182362198229 +/- 3.815022657841085e-05

max_depth: 3 min_samples_split: 2 min_samples_leaf: 1 min_impurity_decrease: 0.2
Accuracy (Holdout): 0.766600696103187
Accuracy (CV): 0.7607182362198229 +/- 3.815022657841085e-05

max_depth: 3 min_samples_split: 2 min_samples_leaf: 2 min_impurity_decrease: 0.0
Accuracy (Holdout): 0.8506107964239404
Accuracy (CV): 0.8431882819921513 +/- 0.0019447484696599726

max_depth: 3 min_samples_split: 2 min_samples_leaf: 2 min_impurity_decrease: 0.1
Accuracy (Holdout): 0.766600696103187
Accuracy (CV): 0.7607182362198229 +/- 3.815022657841085e-05

max_depth: 3 min_samples_split: 2 min_samples_leaf: 2 min_impurity_decrease: 0.2
Accuracy (Holdout