<a href="https://colab.research.google.com/github/AlexiaCordeiro/Inteligencia-Computacional/blob/master/PRATICA_03_IC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive only when the notebook is actually running on Colab.
# Outside Colab the `google.colab` package does not exist, and an unguarded
# import would abort the notebook on its very first cell.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: keep the name defined so later references do not raise
    # NameError, and carry on without a mounted Drive.
    drive = None
    print("Google Colab não detectado; montagem do Drive ignorada.")
else:
    drive.mount('/content/drive/')


In [None]:
import sys, subprocess, os, shutil, zipfile, json, warnings

def pip_install(pkg, import_name=None):
    """Install ``pkg`` with pip unless its module is already importable.

    Args:
        pkg: Requirement string handed to ``pip install`` (for example
            ``"pandas"`` or ``"pandas==2.2.0"``).
        import_name: Optional module name to probe when it differs from the
            distribution name. When omitted, a small table of known aliases
            is consulted and otherwise dashes are replaced by underscores.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    import importlib.util

    # Distributions whose importable module is named differently. Without
    # this, "scikit-learn" would be probed as "scikit_learn", never be found,
    # and be reinstalled on every run.
    known_import_names = {"scikit-learn": "sklearn"}

    # Strip a version pin and any extras to recover the bare distribution name.
    dist_name = pkg.split("==")[0].split("[")[0].strip()
    module_name = import_name or known_import_names.get(
        dist_name.lower(), dist_name.replace("-", "_")
    )

    # find_spec only locates the module; it does not execute the package's
    # import-time code the way __import__ does.
    try:
        already_installed = importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        already_installed = False

    if not already_installed:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])

# Run every third-party dependency of this notebook through pip_install.
for pkg in ("pandas", "numpy", "scikit-learn", "joblib", "tqdm", "kaggle"):
    pip_install(pkg)

In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate # Adicionei o cross validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle as sk_shuffle
from sklearn.model_selection import ParameterGrid
from sklearn.decomposition import PCA
from joblib import dump

# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation and convergence warnings — confirm that is intended.
warnings.filterwarnings("ignore")
# Single seed reused by the splitters and estimators in the cells below.
RANDOM_STATE = 42
# Seed NumPy's global random generator with the same value.
np.random.seed(RANDOM_STATE)

In [None]:
def exists_all(paths):
    """Return True only when every path in ``paths`` exists on disk.

    An empty iterable yields True; checking stops at the first missing path.
    """
    for candidate in paths:
        if not os.path.exists(candidate):
            return False
    return True

def try_kaggle_download(comp="santander-customer-transaction-prediction"):
    """Download a Kaggle competition archive and extract it into the current directory.

    When neither ``~/.kaggle/kaggle.json`` nor the ``KAGGLE_USERNAME``
    environment variable is present, the function first tries (best effort)
    to obtain the credentials file through the Colab upload widget.

    Args:
        comp: Competition slug passed to ``kaggle competitions download -c``.

    Raises:
        Exception: Whatever the download or extraction step raised, re-raised
            after troubleshooting hints have been printed.
    """
    kaggle_dir = os.path.join(os.path.expanduser("~"), ".kaggle")
    os.makedirs(kaggle_dir, exist_ok=True)
    kj_path = os.path.join(kaggle_dir, "kaggle.json")

    if not os.path.exists(kj_path) and "KAGGLE_USERNAME" not in os.environ:
        try:
            from google.colab import files
        except ImportError:
            # Only a missing Colab runtime means "upload not available".
            files = None
            print("Upload não disponível (talvez não esteja no Colab).")

        if files is not None:
            try:
                print("Envie seu kaggle.json (Kaggle > Account > Create New API Token)...")
                uploaded = files.upload()
                if uploaded:
                    up_name = next(iter(uploaded))
                    shutil.move(up_name, kj_path)
                    # Owner-only permissions on the credentials file.
                    os.chmod(kj_path, 0o600)
                    print("kaggle.json salvo em ~/.kaggle/")
            except Exception as upload_err:
                # Still best effort, but report what actually went wrong
                # instead of claiming the upload widget is unavailable.
                print(f"Falha ao salvar o kaggle.json enviado: {type(upload_err).__name__}: {upload_err}")

    zip_name = f"{comp}.zip"
    try:
        print("Baixando dados via Kaggle API...")
        # NOTE(review): relies on the installed kaggle package being runnable
        # as `python -m kaggle` — confirm for the version in use.
        subprocess.check_call([sys.executable, "-m", "kaggle", "competitions", "download", "-c", comp])
        if os.path.exists(zip_name):
            with zipfile.ZipFile(zip_name, "r") as zf:
                zf.extractall(".")
            print("Arquivos extraídos com sucesso.")
        else:
            print(f"Aviso: {zip_name} não encontrado após o download.")
    except Exception as e:
        print("\n[AVISO] Falha no download automático pelo Kaggle.")
        # Surface the underlying error so the hints below can be matched to it.
        print(f"Erro: {type(e).__name__}: {e}")
        print("Possíveis causas: (1) Regras não aceitas; (2) Credenciais ausentes/incorretas.")
        print("Alternativas: faça upload manual de train.csv/test.csv/sample_submission.csv para o diretório atual.")
        raise

In [None]:
def auc_acc(y_true, y_prob, y_pred=None):
    """Return ``(ROC AUC, accuracy)`` for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_prob: Scores used for the AUC.
        y_pred: Hard labels used for the accuracy. When omitted they are
            derived by thresholding ``y_prob`` at 0.5.
    """
    labels = y_pred if y_pred is not None else (y_prob >= 0.5).astype(int)
    return roc_auc_score(y_true, y_prob), accuracy_score(y_true, labels)

def evaluate_model(clf, X_tr, y_tr, X_va, y_va):
    """Fit ``clf`` on the training split and score it on the validation split.

    Scores for the AUC come from ``predict_proba`` (second column) when the
    estimator has it, otherwise from ``decision_function``, otherwise from
    the hard predictions cast to float.

    Returns:
        Tuple ``(auc, accuracy, clf)`` where ``clf`` is the fitted estimator
        that was passed in.
    """
    clf.fit(X_tr, y_tr)

    if hasattr(clf, "predict_proba"):
        scores = clf.predict_proba(X_va)[:, 1]
    elif hasattr(clf, "decision_function"):
        scores = clf.decision_function(X_va)
    else:
        scores = clf.predict(X_va).astype(float)

    hard_labels = clf.predict(X_va)
    auc_value, acc_value = auc_acc(y_va, scores, hard_labels)
    return auc_value, acc_value, clf

def cv_score(clf, X, y, cv=3, scoring="roc_auc"):
    """Return the mean ROC AUC of ``clf`` over a stratified, shuffled K-fold split.

    Args:
        clf: Estimator exposing ``fit`` and one of ``predict_proba``,
            ``decision_function`` or ``predict``. It is refit in place on
            every fold.
        X: Feature matrix; pandas objects are sliced with ``.iloc``, any
            other array-like is converted with ``np.asarray``.
        y: Target vector, handled the same way as ``X``.
        cv: Number of folds.
        scoring: Must be ``"roc_auc"``, the only metric implemented here.

    Returns:
        The mean AUC across folds as a Python float.

    Raises:
        ValueError: If ``scoring`` is anything other than ``"roc_auc"``.
            Previously the argument was ignored and AUC was returned under
            whatever metric name the caller asked for.
    """
    if scoring != "roc_auc":
        raise ValueError(
            f"cv_score only implements scoring='roc_auc' (got {scoring!r})."
        )

    def _take(data, idx):
        # Positional row selection for both pandas and plain array-likes.
        if hasattr(data, "iloc"):
            return data.iloc[idx]
        return np.asarray(data)[idx]

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    aucs = []
    for tr_idx, va_idx in skf.split(X, y):
        X_tr, X_va = _take(X, tr_idx), _take(X, va_idx)
        y_tr, y_va = _take(y, tr_idx), _take(y, va_idx)
        clf.fit(X_tr, y_tr)
        if hasattr(clf, "predict_proba"):
            y_prob = clf.predict_proba(X_va)[:, 1]
        elif hasattr(clf, "decision_function"):
            y_prob = clf.decision_function(X_va)
        else:
            y_prob = clf.predict(X_va).astype(float)
        aucs.append(roc_auc_score(y_va, y_prob))
    return float(np.mean(aucs))


In [26]:
def grid_search_with_tqdm(estimator_builder, param_grid, X, y, cv=3, desc="GridSearch"):
    """Exhaustive grid search scored by cross-validated AUC, with a progress bar.

    Args:
        estimator_builder: Callable that receives one parameter dict and
            returns a fresh, unfitted estimator.
        param_grid: Grid specification expanded through ``ParameterGrid``.
        X, y: Data used both for cross-validation and for the final refit.
        cv: Number of folds forwarded to ``cv_score``.
        desc: Label shown on the progress bar.

    Returns:
        Tuple ``(best_estimator, best_params, best_score)``; the estimator is
        rebuilt from the winning parameters and fitted on all of ``X, y``.
        Ties keep the first combination encountered.
    """
    candidates = list(ParameterGrid(param_grid))
    best_score = -np.inf
    best_params = None

    progress = tqdm(candidates, desc=desc, total=len(candidates))
    for candidate in progress:
        candidate_score = cv_score(
            estimator_builder(candidate), X, y, cv=cv, scoring="roc_auc"
        )
        # Only a strictly better score replaces the incumbent.
        if not candidate_score > best_score:
            continue
        best_score = candidate_score
        best_params = candidate

    winner = estimator_builder(best_params)
    winner.fit(X, y)
    return winner, best_params, best_score

# ---------------------------------------------------------------------------
# Data acquisition: use the local CSVs when present, otherwise try Kaggle.
# ---------------------------------------------------------------------------
DATA_FILES = ["train.csv", "test.csv", "sample_submission.csv"]
if not exists_all(DATA_FILES):
    print("Arquivos não encontrados em ./ — tentando baixar do Kaggle.")
    try_kaggle_download()

assert exists_all(DATA_FILES), "Necessário train.csv, test.csv e sample_submission.csv no diretório atual."

# Files are read in the order they are listed in DATA_FILES.
df_train, df_test, sample_sub = (pd.read_csv(csv_name) for csv_name in DATA_FILES)

assert "target" in df_train.columns, "Coluna 'target' ausente em train.csv"
assert all("ID_code" in frame.columns for frame in (df_train, df_test)), "Coluna 'ID_code' ausente."

# Separate identifiers, features and target.
id_test = df_test["ID_code"]
X_test = df_test.drop(columns=["ID_code"])
y_full = df_train["target"].astype(int)
X_full = df_train.drop(columns=["ID_code", "target"])

print("Shapes -> X_full:", X_full.shape, "| y_full:", y_full.shape, "| X_test:", X_test.shape)
print("Proporção classe positiva (train):", y_full.mean().round(4))

# Stratified 80/20 hold-out used for every local evaluation below.
X_tr, X_va, y_tr, y_va = train_test_split(
    X_full,
    y_full,
    stratify=y_full,
    test_size=0.2,
    random_state=RANDOM_STATE,
)
print("Split local -> Train:", X_tr.shape, "| Val:", X_va.shape)

# ---------------------------------------------------------------------------
# Baselines: one untuned model per algorithm, scored on the hold-out.
# ---------------------------------------------------------------------------
tree_base = DecisionTreeClassifier(max_depth=3, random_state=RANDOM_STATE)
nb_base = GaussianNB()
knn_base = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=60, random_state=RANDOM_STATE)),
    ("knn", KNeighborsClassifier(n_neighbors=5, weights="uniform", p=2, n_jobs=-1)),
])

results_baseline = {}
for label, estimator in (
    ("DecisionTree", tree_base),
    ("NaiveBayes", nb_base),
    ("KNN", knn_base),
):
    AUC, ACC, _ = evaluate_model(estimator, X_tr, y_tr, X_va, y_va)
    results_baseline[label] = {"auc": AUC, "acc": ACC, "model": estimator}

print("\n== Baselines (holdout local) ==")
for k, v in results_baseline.items():
    print(f"{k:15s} | AUC: {v['auc']:.6f} | ACC: {v['acc']:.6f}")

# Filled in by the tuning cell.
results_tuned = {}

Shapes -> X_full: (200000, 200) | y_full: (200000,) | X_test: (200000, 200)
Proporção classe positiva (train): 0.1005
Split local -> Train: (160000, 200) | Val: (40000, 200)


KeyboardInterrupt: 

In [None]:
def build_tree(params):
    """Create an unfitted decision tree seeded with RANDOM_STATE.

    ``params`` is a dict of extra constructor keyword arguments; ``None`` or
    an empty dict means library defaults.
    """
    extra_kwargs = params if params else {}
    return DecisionTreeClassifier(random_state=RANDOM_STATE, **extra_kwargs)

def build_nb(params):
    """Create an unfitted Gaussian Naive Bayes classifier.

    ``params`` is a dict of constructor keyword arguments; ``None`` or an
    empty dict means library defaults.
    """
    extra_kwargs = params if params else {}
    return GaussianNB(**extra_kwargs)

def build_knn(params):
    """Create an unfitted scaler -> PCA(60) -> kNN pipeline.

    ``params`` holds keyword arguments for ``KNeighborsClassifier`` only and
    overrides the defaults ``n_jobs=-1, weights="uniform", p=2``. ``None`` or
    an empty dict keeps those defaults.
    """
    overrides = params if params else {}
    knn_kwargs = {"n_jobs": -1, "weights": "uniform", "p": 2, **overrides}

    scaler_step = ("scaler", StandardScaler())
    pca_step = ("pca", PCA(n_components=60, random_state=RANDOM_STATE))
    knn_step = ("knn", KNeighborsClassifier(**knn_kwargs))
    return Pipeline([scaler_step, pca_step, knn_step])

# ---------------------------------------------------------------------------
# Hyper-parameter grids, one per algorithm.
# ---------------------------------------------------------------------------
tree_grid = dict(
    max_depth=[3, 4, 5, 6, None],
    min_samples_split=[2, 10, 20, 50],
    min_samples_leaf=[1, 5, 10, 20],
)
nb_grid = dict(var_smoothing=np.logspace(-12, -7, 6))
knn_grid = dict(n_neighbors=[3, 5, 9, 15])

# ---------------------------------------------------------------------------
# Grid search on the training split only; the hold-out stays untouched.
# ---------------------------------------------------------------------------
print("\n== Tuning (com barras de progresso) ==")
tree_best, tree_params, tree_cvauc = grid_search_with_tqdm(
    build_tree, tree_grid, X_tr, y_tr,
    cv=3, desc="Grid: DecisionTree",
)
nb_best, nb_params, nb_cvauc = grid_search_with_tqdm(
    build_nb, nb_grid, X_tr, y_tr,
    cv=3, desc="Grid: NaiveBayes",
)
# kNN uses 2 folds instead of 3.
knn_best, knn_params, knn_cvauc = grid_search_with_tqdm(
    build_knn, knn_grid, X_tr, y_tr,
    cv=2, desc="Grid: KNN (fast)",
)

# Score each winning configuration on the local hold-out.
for name, best_model, params in (
    ("DecisionTree", tree_best, tree_params),
    ("NaiveBayes", nb_best, nb_params),
    ("KNN", knn_best, knn_params),
):
    AUC, ACC, _ = evaluate_model(best_model, X_tr, y_tr, X_va, y_va)
    results_tuned[name] = dict(auc=AUC, acc=ACC, model=best_model, best_params=params)

print("\n== Ajustados (holdout local) ==")
for k, v in results_tuned.items():
    print(f"{k:15s} | AUC: {v['auc']:.6f} | ACC: {v['acc']:.6f} | best_params={v['best_params']}")

# ---------------------------------------------------------------------------
# Comparison table; the Kaggle columns are left blank to be filled by hand.
# ---------------------------------------------------------------------------
rows = []
for name in ["DecisionTree", "NaiveBayes", "KNN"]:
    base = results_baseline[name]
    tun = results_tuned[name]
    local_before = round(base["auc"], 6)
    local_after = round(tun["auc"], 6)
    rows.append({
        "Algoritmo": name,
        "Avaliacao Local (antes) [AUC]": local_before,
        "Kaggle (antes) [AUC]": "",
        "Avaliacao Local (ajustado) [AUC]": local_after,
        "Kaggle (ajustado) [AUC]": "",
    })

scores_df = pd.DataFrame(rows)
scores_df.to_csv("tabela_comparativa_local_vs_kaggle.csv", index=False)
print("\nTabela salva em: tabela_comparativa_local_vs_kaggle.csv")
display(scores_df)




In [None]:
def make_submission(model, X_train_full, y_train_full, X_test, id_series, fname):
    """Refit ``model`` on the full training data and write a Kaggle submission CSV.

    The ``target`` column holds a continuous score whenever the model can
    provide one: ``predict_proba`` (second column) first, then
    ``decision_function``, and only as a last resort the hard predictions
    cast to float. This is the same fallback order used by the local
    evaluation helpers, so the submitted scores match what was validated.

    Args:
        model: Estimator to fit; it is fitted in place.
        X_train_full: Training features.
        y_train_full: Training target.
        X_test: Features to score.
        id_series: Values written to the ``ID_code`` column, one per test row.
        fname: Path of the CSV file to create.

    Returns:
        ``fname``, unchanged.
    """
    model.fit(X_train_full, y_train_full)
    if hasattr(model, "predict_proba"):
        preds = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        # A ranking score suits an AUC-scored submission better than 0/1 labels.
        preds = model.decision_function(X_test)
    else:
        preds = model.predict(X_test).astype(float)
    sub = pd.DataFrame({"ID_code": id_series, "target": preds})
    sub.to_csv(fname, index=False)
    return fname

# ---------------------------------------------------------------------------
# Submissions: three baselines plus the three tuned models, each refit on the
# full training set by make_submission.
# ---------------------------------------------------------------------------
print("\n== Gerando submissões (com tqdm) ==")
submission_jobs = [
    (DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=3),
     "sub_decisiontree_baseline.csv"),
    (GaussianNB(),
     "sub_naivebayes_baseline.csv"),
    (Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=60, random_state=RANDOM_STATE)),
        ("knn", KNeighborsClassifier(n_neighbors=5, n_jobs=-1, weights="uniform", p=2))
    ]),
     "sub_knn_baseline.csv"),
    (results_tuned["DecisionTree"]["model"], "sub_decisiontree_tuned.csv"),
    (results_tuned["NaiveBayes"]["model"],   "sub_naivebayes_tuned.csv"),
    (results_tuned["KNN"]["model"],          "sub_knn_tuned.csv"),
]
# The banner above announces a progress bar, so actually show one.
subs_info = [
    make_submission(job_model, X_full, y_full, X_test, id_test, job_fname)
    for job_model, job_fname in tqdm(submission_jobs, desc="Submissões")
]

for f in subs_info:
    print("Gerado:", f)

# The tuned estimators are the same objects make_submission just refit on the
# full training set, so that is the state persisted here.
dump(results_tuned["DecisionTree"]["model"], "best_decision_tree.joblib")
dump(results_tuned["NaiveBayes"]["model"],   "best_naive_bayes.joblib")
dump(results_tuned["KNN"]["model"],          "best_knn.joblib")
print("\nModelos salvos: best_decision_tree.joblib, best_naive_bayes.joblib, best_knn.joblib")

# ---------------------------------------------------------------------------
# Written commentary, derived from the measured hold-out AUCs rather than
# asserting an improvement that may not have happened.
# ---------------------------------------------------------------------------
best_name = max(results_tuned, key=lambda k: results_tuned[k]["auc"])
improved = [
    algo for algo, tuned in results_tuned.items()
    if algo in results_baseline and tuned["auc"] > results_baseline[algo]["auc"]
]
not_improved = [algo for algo in results_tuned if algo not in improved]

if improved and not not_improved:
    tuning_sentence = (
        "O ajuste de hiperparâmetros elevou o AUC em relação aos baselines "
        "em todos os algoritmos. "
    )
elif improved:
    tuning_sentence = (
        "O ajuste de hiperparâmetros elevou o AUC em relação aos baselines "
        f"para {', '.join(improved)}, mas não para {', '.join(not_improved)}. "
    )
else:
    tuning_sentence = (
        "O ajuste de hiperparâmetros não elevou o AUC em relação aos baselines. "
    )

comment = (
    f"No holdout local, o melhor AUC foi do {best_name} (ajustado). "
    + tuning_sentence
    # Only the hyper-parameters present in the search grids are mentioned.
    + "Os hiperparâmetros explorados foram profundidade e tamanhos mínimos de nó (árvore), "
    "número de vizinhos (kNN) e var_smoothing (Naive Bayes). "
    "Se o score no Kaggle divergir, causas prováveis são diferença de distribuição "
    "entre o holdout local e o teste público/privado ou overfitting aos hiperparâmetros."
)
with open("comentario_resultados.txt", "w", encoding="utf-8") as f:
    f.write(comment)
print("\ncomentario_resultados.txt:\n", comment)

print("\nPronto. Submeta os 6 CSVs ao Kaggle e preencha as colunas 'Kaggle (antes/ajustado)' na tabela gerada.")