Setup Libraries

In [18]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, random_split,ConcatDataset, Dataset
from sklearn.metrics import f1_score
import pandas as pd
from collections import Counter
from PIL import Image

Dataset Paths

In [20]:
# Root folder that contains the `train` and `evaluation_set` directories.
# Set the COVID_DATA_ROOT environment variable to run the notebook on another
# machine; when it is unset the original author's local path is used.
base_dir = os.environ.get(
    'COVID_DATA_ROOT',
    'C:/Users/CaioGabrielAdernedeM/OneDrive/IPB/ipb_sistemas_inteligentes/projeto_final',
).rstrip('/\\')
data_dir = f'{base_dir}/train'           # labelled images, loaded below with ImageFolder
eval_dir = f'{base_dir}/evaluation_set'  # unlabelled images classified at the end of the notebook

Load and Split Train Dataset

In [21]:
# Preprocessing applied to every image: fixed 224x224 size, tensor conversion and
# per-channel normalisation (the mean/std values commonly used with torchvision
# ImageNet models).
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize all images to 224x224
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load dataset (ImageFolder takes each sample's label from its sub-folder)
dataset = datasets.ImageFolder(root=data_dir, transform=transform)

# Calculate split sizes: 80% train, 10% validation, the remainder is the test set
dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
val_size = int(0.1 * dataset_size)
test_size = dataset_size - train_size - val_size

# Split the dataset exactly once. The seeded generator makes the partition
# reproducible across kernel restarts, so reported scores refer to the same
# train/validation/test samples on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

# Create DataLoaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Print dataset sizes
print(f"Total dataset size: {dataset_size}")
print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")

# Function to count labels in a dataset
def count_labels(subset):
    """Count how many samples of each class a split subset contains.

    Args:
        subset: Object exposing `.indices` (positions into the underlying
            dataset) and `.dataset`, where the underlying dataset provides
            `.targets` (class index per sample) and `.classes` (class names
            indexed by class index). The subsets returned by `random_split`
            have this shape.

    Returns:
        dict: class name -> number of samples of that class in `subset`,
        keyed in order of first occurrence.
    """
    # Read labels from the subset's own parent dataset instead of a notebook
    # global, so the function works for any subset regardless of cell order.
    source = subset.dataset
    label_counts = Counter(source.targets[idx] for idx in subset.indices)
    return {source.classes[label]: count for label, count in label_counts.items()}

# Print label counts for each subset to check that the classes stay reasonably balanced
for split_name, split_subset in (
    ("Training", train_dataset),
    ("Validation", val_dataset),
    ("Test", test_dataset),
):
    print(f"{split_name} set label counts:", count_labels(split_subset))

# train_loader / val_loader / test_loader were already built right after the
# split with the same arguments, so they are not re-created here.


Total dataset size: 251
Training set size: 200
Validation set size: 25
Test set size: 26
Training set label counts: {'covidNegative': 113, 'covidPositive': 87}
Validation set label counts: {'covidPositive': 13, 'covidNegative': 12}
Test set label counts: {'covidPositive': 11, 'covidNegative': 15}


In [22]:
# Gather the image path behind every index of each split
# (the first element of each dataset.samples entry is used as the file path).
train_files = {dataset.samples[i][0] for i in train_dataset.indices}
val_files = {dataset.samples[i][0] for i in val_dataset.indices}
test_files = {dataset.samples[i][0] for i in test_dataset.indices}

# Pairwise intersections: a non-empty set means the same file sits in two splits
overlap_train_val = train_files & val_files
overlap_train_test = train_files & test_files
overlap_val_test = val_files & test_files

# Report how many files each pair of splits shares
print(f"Number of overlapping samples between train and val: {len(overlap_train_val)}")
print(f"Number of overlapping samples between train and test: {len(overlap_train_test)}")
print(f"Number of overlapping samples between val and test: {len(overlap_val_test)}")

# List the shared files, if any, so they can be investigated
for header, shared_files in (
    ("Overlapping files between train and val:", overlap_train_val),
    ("Overlapping files between train and test:", overlap_train_test),
    ("Overlapping files between val and test:", overlap_val_test),
):
    if shared_files:
        print(header)
        print(shared_files)

Number of overlapping samples between train and val: 0
Number of overlapping samples between train and test: 0
Number of overlapping samples between val and test: 0


Model Definition and Its Hyperparameters

In [23]:
# Define a simple feedforward neural network
class SimpleNN(nn.Module):
    """Fully connected classifier with one hidden layer, applied to flattened inputs.

    Args:
        input_size: Number of features per sample after flattening.
        hidden_size: Width of the hidden layer.
        output_size: Number of output units (one logit per class).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        # Deliberately NOT applied in forward(): the notebook trains with
        # nn.CrossEntropyLoss, which expects raw logits. The attribute is kept
        # so callers can still obtain probabilities via model.softmax(model(x)).
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return raw class logits of shape (batch, output_size).

        Every dimension after the first is flattened into one feature axis.
        """
        # reshape (rather than view) also accepts non-contiguous tensors,
        # e.g. permuted or sliced image batches.
        x = x.reshape(x.size(0), -1)  # Flatten the input
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x
    
# Hyperparameter grid: every learning rate combined with every hidden-layer width.
# The learning rate varies slowest, so configurations 1-3 use 0.001,
# 4-6 use 0.0005 and 7-9 use 0.0001.
models_config = [
    {"hidden_size": hidden_size, "learning_rate": learning_rate}
    for learning_rate in (0.001, 0.0005, 0.0001)
    for hidden_size in (64, 128, 256)
]

Train, validate and test with an 80-10-10 split

In [24]:
# Function to train and validate a model
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs):
    """Train `model` for `epochs` epochs, validating after each one.

    The model is moved (in place) to CUDA when available, otherwise it stays on
    the CPU. After every epoch one line with the mean training loss, the mean
    validation loss and the binary validation F1-score is printed.

    Args:
        model: Module mapping an image batch to per-class scores.
        train_loader: Iterable of `(images, labels)` batches used for optimisation.
        val_loader: Iterable of `(images, labels)` batches used for validation only.
        criterion: Loss called as `criterion(outputs, labels)`.
        optimizer: Optimizer already bound to `model`'s parameters.
        epochs: Number of passes over `train_loader`.

    Returns:
        list of dict: one entry per epoch with the keys "epoch" (1-based),
        "train_loss", "val_loss" and "val_f1", so callers can compare or select
        models on validation metrics instead of parsing the printed log.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    history = []
    for epoch in range(epochs):
        # Training pass
        model.train()
        running_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Validation step (no gradient tracking, no parameter updates)
        model.eval()
        val_loss = 0.0
        all_labels = []
        all_predictions = []
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                # Predicted class = index of the largest output score
                _, predicted = torch.max(outputs, 1)
                all_labels.extend(labels.cpu().numpy())
                all_predictions.extend(predicted.cpu().numpy())

        # Losses are averaged per batch, not per sample
        mean_train_loss = running_loss / len(train_loader)
        mean_val_loss = val_loss / len(val_loader)

        # Calculate F1-score for validation
        val_f1 = f1_score(all_labels, all_predictions, average="binary")
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {mean_train_loss:.4f}, "
              f"Val Loss: {mean_val_loss:.4f}, Val F1-Score: {val_f1:.4f}")

        history.append({
            "epoch": epoch + 1,
            "train_loss": mean_train_loss,
            "val_loss": mean_val_loss,
            "val_f1": float(val_f1),
        })

    return history

def _f1_on_loader(model, loader, device):
    """Return the binary F1-score of `model` over all batches of `loader`."""
    model.eval()
    all_labels = []
    all_predictions = []
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())
    return f1_score(all_labels, all_predictions, average="binary")


# One record per (configuration, epoch budget); turned into a DataFrame at the end
results = []

# Train and evaluate each model for 5, 10, 15, and 20 epochs
input_size = 224 * 224 * 3  # flattened 224x224 RGB image
output_size = len(dataset.classes)
epoch_list = [5, 10, 15, 20]

# Resolved once instead of twice per test batch; same rule train_model() uses
# when it moves the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for i, config in enumerate(models_config):
    for epochs in epoch_list:
        print(f"\nTraining Model {i+1} with Hidden Size: {config['hidden_size']}, Learning Rate: {config['learning_rate']}, Epochs: {epochs}")

        # Initialize model, loss function, and optimizer (a fresh model per run,
        # so every epoch budget is trained from scratch)
        model = SimpleNN(input_size=input_size, hidden_size=config['hidden_size'], output_size=output_size)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])

        # Train the model
        train_model(model, train_loader, val_loader, criterion, optimizer, epochs)

        # Score the trained model on both held-out splits. The validation score
        # is recorded so configurations can be compared without looking at the
        # test set; the test score should only be used for final reporting.
        val_f1 = _f1_on_loader(model, val_loader, device)
        test_f1 = _f1_on_loader(model, test_loader, device)
        print(f"Model {i+1}, Epochs {epochs}, Test F1-Score: {test_f1:.4f}")

        # Append results to list
        results.append({
            "Model Number": i+1,
            "Hidden Size": config['hidden_size'],
            "Learning Rate": config['learning_rate'],
            "Epochs": epochs,
            "Val F1-Score": val_f1,
            "Test F1-Score": test_f1
        })

# Create DataFrame and save results
results_df = pd.DataFrame(results)
print("\nFinal Results:")
print(results_df)

# Save results to CSV
results_df.to_csv("model_results.csv", index=False)


Training Model 1 with Hidden Size: 64, Learning Rate: 0.001, Epochs: 5
Epoch 1/5, Train Loss: 9.3786, Val Loss: 3.1125, Val F1-Score: 0.8966
Epoch 2/5, Train Loss: 2.4653, Val Loss: 0.8121, Val F1-Score: 0.9231
Epoch 3/5, Train Loss: 1.1136, Val Loss: 3.9687, Val F1-Score: 0.9630
Epoch 4/5, Train Loss: 1.0633, Val Loss: 1.7621, Val F1-Score: 0.9630
Epoch 5/5, Train Loss: 1.2626, Val Loss: 0.8925, Val F1-Score: 0.9630
Model 1, Epochs 5, Test F1-Score: 0.9000

Training Model 1 with Hidden Size: 64, Learning Rate: 0.001, Epochs: 10
Epoch 1/10, Train Loss: 6.7214, Val Loss: 1.9334, Val F1-Score: 0.9630
Epoch 2/10, Train Loss: 1.6921, Val Loss: 6.4150, Val F1-Score: 0.9630
Epoch 3/10, Train Loss: 1.4775, Val Loss: 1.6902, Val F1-Score: 0.9600
Epoch 4/10, Train Loss: 2.4911, Val Loss: 6.1380, Val F1-Score: 0.9286
Epoch 5/10, Train Loss: 0.4021, Val Loss: 4.2675, Val F1-Score: 0.9630
Epoch 6/10, Train Loss: 0.6163, Val Loss: 1.4092, Val F1-Score: 0.9630
Epoch 7/10, Train Loss: 0.4586, Val Lo

Combine the 80% training split with the 10% validation split to retrain the model

In [28]:
# Settings of the final model
params = dict(hidden_size=128, learning_rate=0.0001, epochs=5)
input_size = 3 * 224 * 224
output_size = len(dataset.classes)

# Run on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Merge the training and validation subsets into one training set; the test
# subset stays separate.
combined_train_dataset = ConcatDataset([train_dataset, val_dataset])
combined_train_loader = DataLoader(combined_train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Fresh network, loss and optimizer for the final fit
model = SimpleNN(
    input_size=input_size,
    hidden_size=params["hidden_size"],
    output_size=output_size,
)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"])

# Training the model
def train_model(model, train_loader, criterion, optimizer, epochs):
    """Train `model` for `epochs` epochs without a validation pass.

    Prints the mean per-batch training loss after every epoch and returns
    nothing. Note that this definition replaces the earlier six-argument
    `train_model` (the one that also takes `val_loader`) in the notebook
    namespace; re-run that cell before using the validating variant again.

    Args:
        model: Module to optimise; batches are moved to the device its
            parameters live on.
        train_loader: Iterable of `(images, labels)` batches.
        criterion: Loss called as `criterion(outputs, labels)`.
        optimizer: Optimizer already bound to `model`'s parameters.
        epochs: Number of passes over `train_loader`.
    """
    # Take the device from the model itself instead of a notebook-level
    # `device` variable, so the function does not depend on cell execution order.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}")

# Fit the final model on the merged training + validation data
print("Training the model with the combined training and validation datasets...")
train_model(model, combined_train_loader, criterion, optimizer, params["epochs"])

# Testing the model
def test_model(model, test_loader):
    model.eval()
    all_labels = []
    all_predictions = []
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            all_labels.extend(labels.cpu().numpy()) 
            all_predictions.extend(predicted.cpu().numpy())  
    
    # Calculate F1-score
    f1 = f1_score(all_labels, all_predictions, average="binary") 
    print(f"Test F1-Score: {f1:.4f}")
    return f1

# Score the final model on the held-out test split
print("Testing the model with the test dataset...")
test_f1 = test_model(model, test_loader)

# Single-row summary table: the chosen hyperparameters next to the test score
results = pd.DataFrame({
    "Hidden Size": [params["hidden_size"]],
    "Learning Rate": [params["learning_rate"]],
    "Epochs": [params["epochs"]],
    "Test F1-Score": [test_f1],
})

print("\nFinal Results:")
print(results)

# Persist the summary next to the notebook
results.to_csv("final_model_results.csv", index=False)

Training the model with the combined training and validation datasets...
Epoch 1/5, Loss: 1.0936
Epoch 2/5, Loss: 0.4651
Epoch 3/5, Loss: 0.3514
Epoch 4/5, Loss: 0.2112
Epoch 5/5, Loss: 0.0919
Testing the model with the test dataset...
Test F1-Score: 0.9000

Final Results:
   Hidden Size  Learning Rate  Epochs  Test F1-Score
0          128         0.0001       5            0.9


Evaluate the dataset provided

In [30]:
# Create a custom dataset for the evaluation set
class EvaluationDataset(Dataset):
    """Unlabelled image dataset read from a single flat directory.

    Only files whose name ends in .jpg, .jpeg or .png (case-insensitive) are
    included. Each item is `(image, name)`, where `name` is the file name
    without its extension.

    Args:
        image_dir: Directory that contains the image files.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        # os.listdir returns entries in arbitrary order; sorting makes item
        # indices (and therefore the order of the written results) reproducible.
        self.image_names = sorted(
            img for img in os.listdir(image_dir)
            if img.lower().endswith(('.jpg', '.jpeg', '.png'))
        )
        self.transform = transform

    def __len__(self):
        """Return the number of image files found in the directory."""
        return len(self.image_names)

    def __getitem__(self, idx):
        """Load image `idx` as RGB, apply the transform, and return it with its name."""
        img_path = os.path.join(self.image_dir, self.image_names[idx])
        # The context manager closes the underlying file as soon as the pixel
        # data has been converted, instead of leaving it to garbage collection.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, os.path.splitext(self.image_names[idx])[0]

# Wrap the evaluation folder in a dataset and serve it one image at a time, unshuffled
eval_dataset = EvaluationDataset(eval_dir, transform)
eval_loader = DataLoader(eval_dataset, batch_size=1, shuffle=False)

# Put the trained model in inference mode on the available device
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Collect (image name, predicted class index) pairs
results = []
with torch.no_grad():
    for batch, names in eval_loader:
        logits = model(batch.to(device))
        predicted_class = torch.max(logits, 1)[1]  # index of the highest score
        results.append((names[0], predicted_class.item()))

# Write one "<name> <label>" line per image
result_file = 'result.txt'
with open(result_file, 'w') as out_file:
    out_file.writelines(f"{img_name} {label}\n" for img_name, label in results)

print(f"Classification results saved to {result_file}")

Classification results saved to result.txt
