[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FM11pp3/VC_0312/blob/main/Untitled0.ipynb)



# VC_0312 - Notebook arrumado
Notebook dividido em: Configuracao -> Parte A (analise exploratoria + augmentations) -> Parte B (pesos pre-treinados para validar/testar) -> Anexos (treino + push GitHub).

**Como correr**
- Ajusta `DATA_DIR` se nao estiveres em Colab.
- Executa as celulas por ordem: Configuracao -> Parte A -> Parte B.
- As celulas de Anexos sao opcionais para treino de raiz e push.

## Configuracao

In [None]:
from pathlib import Path
import random
import zipfile
import urllib.request
import json

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
from torchvision.transforms import functional as TF
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the "seaborn-v0_8" matplotlib style sheet to every figure in the notebook.
plt.style.use("seaborn-v0_8")
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Default seed shared by every random number generator used in the notebook.
SEED = 42

def seed_everything(seed: int = SEED) -> None:
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Value handed to each generator; defaults to the module-level SEED.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed every generator once with the default SEED when this cell runs.
seed_everything()
print(f"Device: {DEVICE} | Seed: {SEED}")


In [None]:
# Main paths and dataset location
REPO_ROOT = Path.cwd().resolve()

# Prefer the Colab scratch area when it exists; otherwise keep the data next to the repo.
_COLAB_ROOT = Path("/content")
if _COLAB_ROOT.exists():
    DATA_DIR = _COLAB_ROOT / "InfraredSolarModules"
else:
    DATA_DIR = REPO_ROOT / "InfraredSolarModules"

DATA_URL = "https://github.com/RaptorMaps/InfraredSolarModules/raw/master/2020-02-14_InfraredSolarModules.zip"
BASE_IMAGE_DIR = DATA_DIR / "images"

# Artefacts and CSV file lists are resolved relative to the working directory.
MODELS_DIR = REPO_ROOT / "models"
METRICS_DIR = REPO_ROOT / "metrics"
TRAIN_CSV = REPO_ROOT / "full_train_data_list.csv"
TEST_CSV = REPO_ROOT / "final_test_data_list.csv"

def ensure_dataset() -> None:
    """Download and extract the dataset unless the image folder already exists.

    The archive is saved next to DATA_DIR (same name with a ``.zip`` suffix)
    and extracted into DATA_DIR's parent directory.

    Raises:
        FileNotFoundError: If extraction finished but BASE_IMAGE_DIR still
            does not exist (unexpected archive layout).
    """
    if BASE_IMAGE_DIR.exists():
        print(f"?? Dataset pronto em {BASE_IMAGE_DIR}")
        return
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    zip_path = DATA_DIR.with_suffix(".zip")
    print("?? A descarregar InfraredSolarModules (pode demorar)...")
    urllib.request.urlretrieve(DATA_URL, zip_path)
    with zipfile.ZipFile(zip_path, "r") as zf:
        # Extract into the parent; presumably the archive carries its own
        # top-level "InfraredSolarModules" folder -- verified just below.
        zf.extractall(DATA_DIR.parent)
    # Fail here with a clear message instead of reporting success and letting
    # a later cell crash when it tries to open the first image.
    if not BASE_IMAGE_DIR.exists():
        raise FileNotFoundError(
            f"Extracao concluida mas {BASE_IMAGE_DIR} nao existe. "
            f"Verifica a estrutura do arquivo {zip_path}."
        )
    print(f"?? Dataset extraido para {DATA_DIR}")

def load_dataframes(image_dir: Path = BASE_IMAGE_DIR):
    """Read the train/test CSV lists and re-root every image path under `image_dir`.

    Each frame gains a ``filename`` column (base name of the original ``path``)
    and its ``path`` column is replaced by ``image_dir / filename``.

    Args:
        image_dir: Directory that holds the image files.

    Returns:
        Tuple ``(train_df, test_df, classes_map, idx_to_class)`` where
        ``classes_map`` maps class name -> integer label (taken from the train
        CSV, ordered by label) and ``idx_to_class`` is its inverse.
    """
    def basename(raw_path):
        return Path(raw_path).name

    def under_image_dir(file_name):
        return image_dir / file_name

    frames = [pd.read_csv(csv_file) for csv_file in (TRAIN_CSV, TEST_CSV)]
    for frame in frames:
        frame["filename"] = frame["path"].apply(basename)
        frame["path"] = frame["filename"].apply(under_image_dir)
    train_df, test_df = frames

    # One row per distinct (class_name, label) pair, ordered by label.
    label_table = train_df[["class_name", "label"]].drop_duplicates().sort_values("label")
    classes_map = {
        name: int(label)
        for name, label in zip(label_table["class_name"], label_table["label"])
    }
    idx_to_class = {label: name for name, label in classes_map.items()}
    return train_df, test_df, classes_map, idx_to_class

# Make sure the images are on disk, then load the train/test tables and class maps.
ensure_dataset()
train_df, test_df, classes_map, idx_to_class = load_dataframes()
print(f"Train imgs: {len(train_df):,} | Test imgs: {len(test_df):,}")
# NOTE(review): `display` is not imported in this file; presumably the IPython/Jupyter builtin.
display(train_df.head())


## Parte A - Analise exploratoria

In [None]:
# Class distribution (the original dataset is imbalanced)
order = train_df["class_name"].value_counts().index
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_df, y="class_name", order=order, palette="viridis", ax=ax)
ax.set_title("Distribuicao de classes (train)")
# Label every bar container, not just the first one: depending on the seaborn
# version, passing `palette` may yield one container per bar (TODO confirm on
# the installed version), in which case containers[0] would label a single bar.
for container in ax.containers:
    ax.bar_label(container, fontsize=8)
plt.tight_layout()
plt.show()


In [None]:
# Show one sampled image (fixed by SEED) next to a few basic augmentations
sample = train_df.sample(1, random_state=SEED).iloc[0]
img_path = Path(sample["path"])
if not img_path.exists():
    raise FileNotFoundError(f"Imagem nao encontrada: {img_path}. Confirma a celula de download do dataset.")

# Close the file handle deterministically; convert() returns a separate image
# object, which is what the rest of the cell works with.
with Image.open(img_path) as raw_img:
    img = raw_img.convert("L")

# Name -> callable taking a PIL image and returning the augmented image.
augs = {
    "original": lambda im: im,
    "flip_h": TF.hflip,
    "flip_v": TF.vflip,
    "rotate_20": lambda im: TF.rotate(im, angle=20),
    # Keep the central 80% of the image in each dimension.
    "center_crop": lambda im: TF.center_crop(im, output_size=(int(im.height * 0.8), int(im.width * 0.8))),
}

fig, axes = plt.subplots(1, len(augs), figsize=(15, 4))
for ax, (name, fn) in zip(axes, augs.items()):
    ax.imshow(fn(img), cmap="gray")
    ax.set_title(name)
    ax.axis("off")
plt.suptitle(f"Classe: {sample['class_name']} | ficheiro: {img_path.name}")
plt.tight_layout()
plt.show()


In [None]:
# Default transforms used by the DataLoaders
IMAGE_SIZE = (64, 64)


def _compose_pipeline(*augmentations):
    """Build resize -> grayscale -> (augmentations) -> tensor -> normalize."""
    return T.Compose([
        T.Resize(IMAGE_SIZE),
        T.Grayscale(),
        *augmentations,
        T.ToTensor(),
        T.Normalize(mean=[0.5], std=[0.5]),
    ])


# Training adds random flips and a rotation of up to 20 degrees.
train_transform = _compose_pipeline(
    T.RandomHorizontalFlip(),
    T.RandomVerticalFlip(),
    T.RandomRotation(20),
)

# Evaluation uses the same pipeline without any random augmentation.
test_transform = _compose_pipeline()
print("Transforms definidos (train/test).")


## Parte B - Pesos pre-treinados: validar/testar

In [None]:
# Make sure the model weights are available: file name -> download URL
_WEIGHTS_BASE_URL = "https://raw.githubusercontent.com/FM11pp3/VC_0312/main/models"
WEIGHT_URLS = {
    weight_file: f"{_WEIGHTS_BASE_URL}/{weight_file}"
    for weight_file in (f"model_{variant}_final.pth" for variant in "ABC")
}
MODELS_DIR.mkdir(exist_ok=True)

def ensure_weights():
    """Download every file in WEIGHT_URLS that is missing from MODELS_DIR.

    Each file is first written to ``<name>.part`` and only renamed to its final
    name after the download completes, so an interrupted download never leaves
    a truncated ``.pth`` that a later run would mistake for a valid file.
    """
    for fname, url in WEIGHT_URLS.items():
        dest = MODELS_DIR / fname
        if dest.exists():
            print(f"?? {fname} ja existe")
            continue
        print(f"?? A descarregar {fname}...")
        partial = dest.with_name(dest.name + ".part")
        try:
            urllib.request.urlretrieve(url, partial)
            partial.replace(dest)
        finally:
            # No-op after a successful rename; removes the leftover on failure.
            partial.unlink(missing_ok=True)
    print("Pronto.")

ensure_weights()


In [None]:
# Dataset, modelo e helpers para avaliacao
class SolarDataset(Dataset):
    """Dataset that serves (image, label) pairs described by a DataFrame.

    The frame must provide a ``path`` column (image location) and a ``label``
    column (converted to ``int`` on access). Images are opened with PIL and
    converted to single-channel mode "L" before `transform` is applied.
    """

    def __init__(self, df: pd.DataFrame, transform):
        # Positional access via iloc below relies on a clean 0..n-1 index.
        self.transform = transform
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        grayscale = Image.open(record["path"]).convert("L")
        # A falsy transform (e.g. None) returns the PIL image untouched.
        item = self.transform(grayscale) if self.transform else grayscale
        return item, int(record["label"])

def make_loader(df, transform, batch_size=256, shuffle=False, num_workers=2):
    """Wrap `df` in a SolarDataset and return a DataLoader over it.

    Args:
        df: DataFrame passed to SolarDataset.
        transform: Transform applied by SolarDataset to each image.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the data every epoch.
        num_workers: Number of loader worker processes (default 2, as before).
            Pass 0 to load in the main process -- possibly required on
            platforms where notebook-defined datasets cannot be sent to
            worker processes (TODO confirm for your environment).

    Returns:
        A torch DataLoader; memory pinning is enabled only when CUDA is available.
    """
    return DataLoader(
        SolarDataset(df, transform),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=torch.cuda.is_available(),
    )

class NetworkCNN(nn.Module):
    """Small CNN: two 5x5 conv + 2x2 max-pool stages, then three linear layers.

    Expects single-channel input of shape (batch, 1, H, W) where (H, W) matches
    `input_size`; the first linear layer is sized from that resolution.

    Args:
        num_classes: Number of output logits.
        input_size: (height, width) of the input images. Defaults to (64, 64),
            the size previously hard-coded here.
    """

    def __init__(self, num_classes: int, input_size=(64, 64)):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # Probe the conv stack once to learn the flattened feature size.
        # torch.randn is kept (instead of zeros) so that, for the default size,
        # the RNG is consumed exactly as before and seeded initialisation of
        # the linear layers below is unchanged.
        dummy_input = torch.randn(1, 1, *input_size)
        with torch.no_grad():
            flattened_size = self._extract_features(dummy_input).shape[1]

        self.fc1 = nn.Linear(flattened_size, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def _extract_features(self, x):
        """Run both conv/pool stages and flatten everything but the batch dim."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        return torch.flatten(x, 1)

    def forward(self, x):
        """Return raw class logits of shape (batch, num_classes)."""
        x = self._extract_features(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def evaluate_model(model: nn.Module, loader: DataLoader) -> float:
    """Return the top-1 accuracy of `model` over every batch in `loader`.

    Batches are moved to the device the model's parameters live on, so the
    function works even when the model is not on the global DEVICE; DEVICE is
    only used as a fallback for a model without parameters.

    Args:
        model: Network returning per-class scores of shape (batch, classes).
        loader: Iterable yielding ``(images, labels)`` tensor pairs.

    Returns:
        Fraction of correctly classified samples, or 0.0 for an empty loader.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE

    model.eval()
    correct = total = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(1)
            correct += (preds == labels).sum().item()
            total += labels.numel()
    return correct / total if total else 0.0

# Build the test dataframe and settings for each of the 3 models
anomaly_classes = sorted(name for name in classes_map if name != "No-Anomaly")
classes_map_B = {name: position for position, name in enumerate(anomaly_classes)}


def _model_entry(num_classes, frame, weights_file):
    """Bundle the settings the evaluation loop reads for one model."""
    return {
        "num_classes": num_classes,
        "df": frame,
        "weights": MODELS_DIR / weights_file,
    }


def _binary_label(class_name):
    """0 for "No-Anomaly", 1 for every other class."""
    return 0 if class_name == "No-Anomaly" else 1


_anomalies_only = test_df[test_df["class_name"] != "No-Anomaly"]

model_frames = {
    # A: binary labels (No-Anomaly vs. anything else) over the full test set.
    "A": _model_entry(
        2,
        test_df.assign(label=test_df["class_name"].apply(_binary_label)),
        "model_A_final.pth",
    ),
    # B: anomalous rows only, labelled by position in the sorted anomaly list.
    "B": _model_entry(
        len(anomaly_classes),
        _anomalies_only.assign(label=_anomalies_only["class_name"].map(classes_map_B)),
        "model_B_final.pth",
    ),
    # C: every class, labelled through the original classes_map.
    "C": _model_entry(
        len(classes_map),
        test_df.assign(label=test_df["class_name"].map(classes_map)),
        "model_C_final.pth",
    ),
}

# Evaluate each pre-trained model on its own test frame and collect the accuracies.
results = []
for key, cfg in model_frames.items():
    loader = make_loader(cfg["df"], test_transform, batch_size=256)
    model = NetworkCNN(cfg["num_classes"]).to(DEVICE)
    # NOTE(review): these weight files may have been downloaded over the network
    # and torch.load can unpickle arbitrary objects; consider `weights_only=True`
    # if the installed torch version supports it -- confirm before changing.
    state = torch.load(cfg["weights"], map_location=DEVICE)
    model.load_state_dict(state)
    acc = evaluate_model(model, loader)
    results.append({"Model": f"Model {key}", "test_accuracy": acc})
    print(f"Model {key}: test accuracy = {acc:.3f}")

# One row per model: "Model" name and "test_accuracy".
results_df = pd.DataFrame(results)
display(results_df)

# If a metrics CSV exists under METRICS_DIR, show it alongside for comparison.
metrics_path = METRICS_DIR / "final_test_metrics.csv"
if metrics_path.exists():
    print("Metricas exportadas no treino original:")
    display(pd.read_csv(metrics_path))


## Anexos - Treino de raiz e push para GitHub

In [None]:
# Treino rapido (exemplo) - usa o modelo C (12 classes) como base
from sklearn.model_selection import train_test_split

def train_model(model_name: str, base_df: pd.DataFrame, num_classes: int, epochs: int = 3, lr: float = 1e-3):
    """Train a fresh NetworkCNN on `base_df` and save its state dict.

    `base_df` is split 80/20 into train/validation, stratified on its
    ``label`` column with SEED as the random state. Training uses Adam and
    cross-entropy loss; after each epoch the mean training loss and the
    validation accuracy are printed.

    Args:
        model_name: File stem; weights go to ``MODELS_DIR / f"{model_name}.pth"``.
        base_df: Frame with a ``label`` column plus whatever make_loader needs.
        num_classes: Number of output classes of the network.
        epochs: Number of passes over the training split.
        lr: Learning rate for Adam.

    Returns:
        The trained model (on DEVICE).
    """
    train_split, val_split = train_test_split(base_df, test_size=0.2, stratify=base_df["label"], random_state=SEED)
    train_loader = make_loader(train_split, train_transform, batch_size=128, shuffle=True)
    val_loader = make_loader(val_split, test_transform, batch_size=256)

    model = NetworkCNN(num_classes).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        # evaluate_model leaves the network in eval mode, so re-enable train mode each epoch.
        model.train()
        running_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by batch size so the epoch figure is a per-sample mean.
            running_loss += loss.item() * labels.size(0)
        val_acc = evaluate_model(model, val_loader)
        print(f"Epoch {epoch + 1}/{epochs} | loss={running_loss / len(train_loader.dataset):.4f} | val_acc={val_acc:.3f}")

    # This cell may be run without the weights-download cell that normally
    # creates the folder, so make sure it exists before saving.
    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    out_path = MODELS_DIR / f"{model_name}.pth"
    torch.save(model.state_dict(), out_path)
    print(f"Modelo guardado em {out_path}")
    return model

# Exemplo (descomenta a linha abaixo para treinar no notebook; usa train_df para nao treinar com o conjunto de teste):
# trained_model = train_model("model_C_scratch", train_df.assign(label=lambda d: d["class_name"].map(classes_map)), num_classes=len(classes_map), epochs=5)


In [None]:
# Push rapido dos artefactos (usa HTTPS). Configura antes: git config user.email/name e um token de acesso, se necessario.
# Descomenta as linhas abaixo quando estiveres autenticado.
# !git status
# !git add models/*.pth metrics/*.csv
# !git commit -m "Add modelos e metricas atualizadas"
# !git push origin main
