In [5]:
import numpy as np
import pandas as pd

In [101]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [7]:
# Load the weather dataset from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='dados_tempo.csv')

In [None]:
# Quick overview of the dataset, rendered in this order:
#   1. first rows
#   2. (rows, columns)
#   3. dtype of every column
#   4. number of missing values per column
display(
    df.head(),
    df.shape,
    df.dtypes,
    df.isna().sum(),
)

In [None]:
# A notebook cell only renders its LAST bare expression, so the describe()
# table used to be computed and thrown away. Both results are now passed to
# display() explicitly so each one is shown.
display(df.describe())  # descriptive statistics of the numeric columns
display(df['chuva_amanha'].value_counts())  # how many rows fall in each target class

In [None]:
# Remove duplicated rows. drop_duplicates() returns a NEW frame and leaves the
# original untouched, so the result has to be assigned back; otherwise the
# duplicates are still present in every later cell.
df = df.drop_duplicates()
df.shape  # (rows, columns) left after de-duplication

In [65]:
# Column holding the label the models have to predict.
target_col = "chuva_amanha"

# Features are every column except the label; the label is kept as its own Series.
X = df.drop(columns=[target_col])
y = df[target_col]

# Group the feature columns by dtype so each group can get its own preprocessing.
numericas = list(X.select_dtypes(include=['int64', 'float64']).columns)
categoricas = list(X.select_dtypes(include=['object', 'category']).columns)

In [66]:
# codificação
def pandas_onehot_encoder(X):
    """One-hot encode ``X`` with ``pandas.get_dummies`` and return a NumPy array.

    Every category level keeps its own indicator column (``drop_first=False``).
    """
    return pd.get_dummies(X, drop_first=False).to_numpy()

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' is set on the encoder so that a
# category not seen during fit does not raise at transform time.
etapa_numerica = ('num', StandardScaler(), numericas)
etapa_categorica = ('cat', OneHotEncoder(handle_unknown='ignore'), categoricas)

preprocessor = ColumnTransformer(transformers=[etapa_numerica, etapa_categorica])

In [67]:
# Train/test split: 20% of the rows are held out for testing, the split is
# stratified on the label, and the seed is fixed so the split is repeatable.
opcoes_split = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **opcoes_split)

In [None]:
# Baseline candidates, keyed by the label used in the printed report.
modelos = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
])

# Fit each candidate behind the shared preprocessor and report its accuracy
# and per-class metrics on the held-out test set.
for nome, model in modelos.items():
    etapas = [('preprocessor', preprocessor), ('classifier', model)]
    pipeline = Pipeline(etapas)

    # fit() returns the fitted pipeline, so predict() can be chained directly.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    acuracia = accuracy_score(y_test, y_pred)
    relatorio = classification_report(y_test, y_pred)

    print(f"\n=== {nome} ===")
    print("Acurácia:", acuracia)
    print("Relatório de Classificação:")
    print(relatorio)

In [103]:
# Hyperparameter optimisation for the random forest.

# Base estimator that the grid search tunes. It was previously created but
# never used, while the pipeline built a second identical instance; the
# pipeline now reuses this one.
modelo_base = RandomForestClassifier(random_state=42)

# The pipeline is assembled inline and reuses the shared `preprocessor`.
pipe_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', modelo_base)
])

# The `model__` prefix routes each parameter to the pipeline step named 'model'.
param_grid = {
    "model__n_estimators": [50, 100, 200],
    "model__max_depth": [None, 5, 10, 20],
    "model__min_samples_split": [2, 5, 10],
}

# 3-fold cross-validated search over every combination in the grid, scored by
# accuracy and run on all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring="accuracy"
)

grid_search.fit(X_train, y_train)

In [104]:
# Evaluate the best pipeline found by the grid search on the held-out test set.
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test)
best_acc = accuracy_score(y_true=y_test, y_pred=y_pred_best)

print(f"\nAcurácia do melhor modelo (RandomForest + GridSearch): {best_acc:.4f}")

print("Matriz de confusão (melhor modelo):")
matriz_best = confusion_matrix(y_test, y_pred_best)
print(matriz_best)

print("Classification report (melhor modelo):")
relatorio_best = classification_report(y_test, y_pred_best)
print(relatorio_best)

# ROC-AUC needs class scores, so it is only computed when the final estimator
# exposes predict_proba; column 1 of the probability matrix is used as the score.
classificador_final = best_model.named_steps["model"]
if hasattr(classificador_final, "predict_proba"):
    probabilidades = best_model.predict_proba(X_test)
    y_proba_best = probabilidades[:, 1]
    best_roc = roc_auc_score(y_test, y_proba_best)
    print(f"ROC-AUC (melhor modelo): {best_roc:.4f}")



Acurácia do melhor modelo (RandomForest + GridSearch): 0.9375
Matriz de confusão (melhor modelo):
[[9 0]
 [1 6]]
Classification report (melhor modelo):
              precision    recall  f1-score   support

           0       0.90      1.00      0.95         9
           1       1.00      0.86      0.92         7

    accuracy                           0.94        16
   macro avg       0.95      0.93      0.94        16
weighted avg       0.94      0.94      0.94        16

ROC-AUC (melhor modelo): 1.0000


In [95]:
# Acceptance check: the tuned model passes when its test accuracy is at least 0.92.
atingiu_meta = best_acc >= 0.92
print(
    "✅ Passou no teste (acurácia >= 0.92)"
    if atingiu_meta
    else "❌ Não passou no teste (acurácia < 0.92)"
)

✅ Passou no teste (acurácia >= 0.92)


In [None]:
# ROC curve of the tuned model on the test set.
y_proba = grid_search.predict_proba(X_test)[:, 1]  # take the class-1 probability column

# roc_curve returns (false-positive rate, true-positive rate, thresholds), in that order.
fp, vp, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fp, vp)

plt.figure(figsize=(8,6))
# x = false-positive rate, y = true-positive rate. The arguments were
# previously swapped, which contradicted the axis labels and drew the curve
# mirrored below the diagonal.
plt.plot(fp, vp, color='blue', label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line, labelled as a random guess in the legend.
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # small head-room so a curve touching y = 1.0 stays visible
plt.xlabel('Falso positivo (FP)')
plt.ylabel('Verdadeiro Positivo (VP)')
plt.title('Curva ROC')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
# Confusion matrix of the tuned model on the test set,
# used to check whether the model is biased towards one class.
y_pred = grid_search.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Same tick labels on both axes.
# NOTE(review): assumes the classes are ordered "no" then "yes" - confirm against the label encoding.
rotulos = ['Não', 'Sim']

plt.figure(figsize=(6,5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=rotulos,
    yticklabels=rotulos,
)
plt.xlabel('Predito')
plt.ylabel('Real')
plt.title('Matriz de Confusão')
plt.show()