In [1]:
import os
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
from PIL import Image
import pandas as pd
from tqdm.auto import tqdm
from torch import amp
import numpy as np
import matplotlib.pyplot as plt
from torchvision.utils import make_grid
from sklearn.metrics import f1_score
import torch.nn.functional as F
from itertools import cycle
import gc

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=1.0, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-bce_loss)
        focal_loss = self.alpha * (1 - pt)**self.gamma * bce_loss
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

In [3]:
##############################################################################
# 1. DATA PREPARATION
##############################################################################
class ImageDataset(Dataset):
    def __init__(self, data, root_dir, transform=None):
        """
        Args:
            data (DataFrame): Contains 'downloadUrl' and 'is_conifer'
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
        """
        self._data = []
        for i, row in data.dropna().iterrows():
            url = row['downloadUrl']
            filename = url.split('/')[-1]
            is_conifer = row.get('is_conifer')
            self._data.append((filename, is_conifer))
        self._root_dir = root_dir
        self._transform = transform

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        filename, label = self._data[idx]
        img_name = os.path.join(self._root_dir, filename)
        image = Image.open(img_name).convert("RGB")
        if self._transform:
            image = self._transform(image)
        label = float(label)
        return image, label

class ImageDatasetWithConfidence(Dataset):
    def __init__(self, data, root_dir, transform=None):
        self._data = []
        for i, row in data.dropna().iterrows():
            url = row['downloadUrl']
            filename = url.split('/')[-1]
            label = row.get('is_conifer')
            confidence = row.get('confidence_normalized', 1.0)  # 1.0 if not available
            self._data.append((filename, label, confidence))
        self._root_dir = root_dir
        self._transform = transform

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        filename, label, confidence = self._data[idx]
        img_name = os.path.join(self._root_dir, filename)
        image = Image.open(img_name).convert("RGB")

        if self._transform:
            image = self._transform(image)

        return image, label, confidence

class FixMatchUnlabeledDataset(Dataset):
    def __init__(self, data, root_dir, weak_transform=None, strong_transform=None):
        self._data = data.reset_index(drop=True)
        self._root_dir = root_dir
        self.weak_transform = weak_transform
        self.strong_transform = strong_transform

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        record = self._data.iloc[idx]
        filename = record['downloadUrl'].split('/')[-1]
        img_name = os.path.join(self._root_dir, filename)
        image = Image.open(img_name).convert("RGB")

        if self.weak_transform:
            weak_image = self.weak_transform(image)
        else:
            weak_image = transforms.ToTensor()(image)

        if self.strong_transform:
            strong_image = self.strong_transform(image)
        else:
            strong_image = transforms.ToTensor()(image)

        return weak_image, strong_image

class FixMatchLabeledDataset(Dataset):
    def __init__(self, data, root_dir, transform=None):
        super().__init__()
        self.transform = transform
        self.root_dir = root_dir
        self.records = data.reset_index(drop=True)

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        row = self.records.iloc[idx]
        filename = row['downloadUrl'].split('/')[-1]
        label = float(row['is_conifer'])
        conf = row.get('confidence_normalized', 1.0) ** 2
        img_path = os.path.join(self.root_dir, filename)
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, label, conf

In [4]:
def calculate_mean_std(loader):
    """Calculate mean and standard deviation for a dataset."""
    total_sum = torch.zeros(3)
    total_squared_sum = torch.zeros(3)
    total_count = 0

    for images, *_ in loader:
        # images: (batch_size, 3, H, W)
        images = images.view(images.size(0), images.size(1), -1)
        total_sum += images.sum(dim=(0, 2))
        total_squared_sum += (images ** 2).sum(dim=(0, 2))
        total_count += images.size(0) * images.size(2)

    mean = total_sum / total_count
    std = (total_squared_sum / total_count - mean ** 2).sqrt()
    return mean, std


labeled = pd.read_csv('hw_3_markup_data.txt', sep='\t')  # 200 images
unlabeled = pd.read_csv('hw_3_no_markup_data.txt', sep='\t', dtype=str)
crowdsourced_df = pd.read_csv('train_full.tsv', sep='\t', skiprows=1,
                              names=['downloadUrl', 'is_conifer', 'confidence'], usecols=range(3))
crowdsourced_df = crowdsourced_df[~crowdsourced_df['downloadUrl'].isin(labeled['downloadUrl'])]

unlabeled = unlabeled[~unlabeled['downloadUrl'].isin(labeled['downloadUrl'])]
unlabeled = unlabeled[~unlabeled['downloadUrl'].isin(crowdsourced_df['downloadUrl'])]

temp_dataset = ImageDataset(
    crowdsourced_df[crowdsourced_df['confidence'] == '100.00%'], # calculate mean without validation
    'unlabeled',
    transforms.Compose([
        transforms.Resize((224, 224)), 
        transforms.ToTensor()
    ])
)

temp_loader = DataLoader(temp_dataset, batch_size=64, shuffle=True)
train_mean, train_std = calculate_mean_std(temp_loader)
del temp_dataset, temp_loader
gc.collect()
torch.cuda.empty_cache()

print("Calculated mean:", train_mean)
print("Calculated std:", train_std)

##############################################################################
# 2. TRANSFORMS FOR FIXMATCH
##############################################################################
#   - labeled_transform
#   - weak_transform
#   - strong_transform

from torchvision.transforms import RandAugment

transform = transforms.Compose([
    transforms.Resize(size=(224, 224)), #224
    transforms.ToTensor(),
    transforms.Normalize(train_mean, train_std)
])

# For Labeled Data
labeled_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(train_mean, train_std),
])

# For Unlabeled Data
weak_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(train_mean, train_std),
])

strong_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    RandAugment(num_ops=2, magnitude=10),
    transforms.ToTensor(),
    transforms.Normalize(train_mean, train_std),
])

##############################################################################
# 3. BUILD FIXMATCH DATALOADERS
##############################################################################

# FILTER CROWD IMAGES BASED ON CONFIDENCE
#    - training set: 0.9 <= conf < 1
#    - val set: conf = 1
#    - unlabeled set: conf < 0.9

crowdsourced_df['confidence'] = crowdsourced_df['confidence'].apply(lambda x: float(x[:-1]))
max_conf = crowdsourced_df['confidence'].max()
crowdsourced_df['confidence_normalized'] = crowdsourced_df['confidence'] / max_conf

cs_threshold = 0.9

cs_val = crowdsourced_df[crowdsourced_df['confidence_normalized'] == 1.0]
cs_train = crowdsourced_df[(crowdsourced_df['confidence_normalized'] >= cs_threshold) &
                           (crowdsourced_df['confidence_normalized'] < 1.0)]
cs_unlabeled = crowdsourced_df[crowdsourced_df['confidence_normalized'] < cs_threshold]

# cs_labeled = crowdsourced_df[crowdsourced_df['confidence_normalized'] >= cs_threshold]
# cs_unlabeled = crowdsourced_df[crowdsourced_df['confidence_normalized'] < cs_threshold]

combined_labeled_df = pd.concat([cs_train], ignore_index=True)

unlabeled_big_df = pd.concat([unlabeled, cs_unlabeled], ignore_index=True)

fixmatch_labeled_dataset = FixMatchLabeledDataset(
    data=combined_labeled_df,
    root_dir='unlabeled',
    transform=labeled_transform
)

fixmatch_unlabeled_dataset = FixMatchUnlabeledDataset(
    data=unlabeled_big_df,
    root_dir='unlabeled', 
    weak_transform=weak_transform,
    strong_transform=strong_transform
)

# Build Dataloaders
# Typically, we want a ratio of unlabeled to labeled like 4:1 or 5:1 in each batch

labeled_batch_size = 48
unlabeled_batch_size = 190

labeled_loader = DataLoader(fixmatch_labeled_dataset, batch_size=labeled_batch_size, shuffle=True, num_workers=8)
unlabeled_loader = DataLoader(fixmatch_unlabeled_dataset, batch_size=unlabeled_batch_size, shuffle=True, num_workers=8)

val_df = pd.concat([labeled, cs_val], ignore_index=True)
val_dataset = ImageDataset(val_df, 'labeled', transform)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, num_workers=8)

##############################################################################
# 4. MODEL DEFINITION
##############################################################################
model = models.efficientnet_v2_s(weights="EfficientNet_V2_S_Weights.IMAGENET1K_V1")
num_in_features_for_class_head = model.classifier[1].in_features

model.classifier = nn.Sequential(
    nn.Linear(num_in_features_for_class_head, 512), 
    nn.GELU(),
    nn.Dropout(0.1),
    nn.Linear(512, 1)
)

for features in model.features[:-3]:
    for param in features.parameters():
        param.requires_grad = False

model = model.to(device)

num_of_trained_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
num_of_all_params = sum(p.numel() for p in model.parameters())
print(f"The total number of params: {num_of_all_params}")
print(f"The number of trainable params: {num_of_trained_params}")
print(f"The % of trainable params of total params: {(num_of_trained_params / num_of_all_params * 100):.2f}%")

##############################################################################
# 5. LOSS, OPTIMIZER, and LR SCHEDULER
##############################################################################
unsup_criterion = nn.BCEWithLogitsLoss()
focal_criterion = FocalLoss(alpha=0.31, gamma=2.0, reduction='mean')
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
scaler = amp.GradScaler(enabled=(device.type=='cuda'))

num_epochs = 100
accumulation_steps = 2

warmup_epochs = int(0.2 * num_epochs)
# scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_lr_lambda)

def warmup_lr_lambda(epoch):
    if epoch < warmup_epochs:
        return float(epoch + 1) / float(warmup_epochs)
    else:
        return 1.0

scheduler_warmup = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_lr_lambda)

scheduler_plateau = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10)

##############################################################################
# 6. EARLY STOPPING CLASS
##############################################################################
class EarlyStopping:
    def __init__(self, patience=10, delta=0, verbose=False, path='model_path.pt', mode='max'):
        """
        Args:
            patience (int): How many epochs to wait after last improvement.
            delta (float): Minimum change to qualify as improvement.
            verbose (bool): If True, prints a message for each improvement.
            path (str): Path to save the best model.
            mode (str): 'min' or 'max'.
        """
        self.patience = patience
        self.delta = delta
        self.verbose = verbose
        self.path = path
        self.mode = mode
        self.counter = 0
        self.best_score = None
        self.early_stop = False

        if self.mode == 'min':
            self.best_score = np.Inf
            self.monitor_op = lambda current, best: current < best - self.delta
        elif self.mode == 'max':
            self.best_score = -np.Inf
            self.monitor_op = lambda current, best: current > best + self.delta
        else:
            raise ValueError("mode must be 'min' or 'max'")

    def __call__(self, metric, model):
        score = metric

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(model)
        elif self.monitor_op(score, self.best_score):
            self.best_score = score
            self.save_checkpoint(model)
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, model):
        torch.save(model.state_dict(), self.path)
        if self.verbose:
            print(f'Validation metric improved. Saving model to {self.path}')

early_stopping = EarlyStopping(patience=50, verbose=True, path='best_fixmatch_model.pt', mode='max')

Calculated mean: tensor([0.2654, 0.3301, 0.2542])
Calculated std: tensor([0.1729, 0.1556, 0.1324])
The total number of params: 20833873
The number of trainable params: 19012297
The % of trainable params of total params: 91.26%


In [5]:
##############################################################################
# 7. FIXMATCH TRAINING LOOP
##############################################################################
threshold = 0.95     # confidence threshold to accept pseudo-label
lambda_u = 1.0       # weight for unlabeled loss

def train_one_epoch_fixmatch(
    epoch, 
    model, 
    labeled_loader, 
    unlabeled_loader, 
    focal_criterion,
    unsup_criterion,
    optimizer, 
    scaler, 
    accumulation_steps
):
    model.train()
    running_loss = 0.0

    # NOTE: size(label) == size(unlab)
    # max_batches = max(len(labeled_loader), len(unlabeled_loader)) No enough memory :(
    # max_batches = 60
    max_batches = min(len(labeled_loader), len(unlabeled_loader))
    labeled_iter = iter(cycle(labeled_loader))
    unlabeled_iter = iter(cycle(unlabeled_loader))

    for batch_idx in range(max_batches):
        # (x_labeled, y_labeled) = next(labeled_iter)
        (x_labeled, y_labeled, conf_labeled) = next(labeled_iter)
        

        x_labeled = x_labeled.to(device)
        y_labeled = y_labeled.to(device)
        conf_labeled = conf_labeled.to(device)

        (x_weak, x_strong) = next(unlabeled_iter)
        x_weak = x_weak.to(device)
        x_strong = x_strong.to(device)

        # ---------------------------
        # 1. SUPERVISED LOSS
        # ---------------------------
        with amp.autocast('cuda'):
            labeled_logits = model(x_labeled).squeeze(1)
            raw_loss = focal_criterion(labeled_logits, y_labeled)
            sup_loss = (raw_loss * conf_labeled).mean()

        # ---------------------------
        # 2. UNSUPERVISED LOSS
        # ---------------------------
        with torch.no_grad():
            weak_logits = model(x_weak).squeeze(1)
            weak_probs = torch.sigmoid(weak_logits)
            pseudo_label = (weak_probs > 0.5).float()
            mask = (weak_probs > threshold).float()

        with amp.autocast('cuda'):
            strong_logits = model(x_strong).squeeze(1)
            unsup_loss = unsup_criterion(strong_logits, pseudo_label)
            unsup_loss = (unsup_loss * mask).mean()

            total_loss = sup_loss + lambda_u * unsup_loss

        scaler.scale(total_loss / accumulation_steps).backward()

        if (batch_idx + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        running_loss += total_loss.item()

        if (batch_idx + 1) % 10 == 0:
            print(f"[Epoch {epoch+1}, Batch {batch_idx+1}] SupLoss: {sup_loss.item():.4f}, UnsupLoss: {unsup_loss.item():.4f}, TotalLoss: {total_loss.item():.4f}")


    if max_batches % accumulation_steps != 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    # scheduler.step()
    avg_loss = running_loss / max_batches
    print(f">>> [Epoch {epoch+1}] FixMatch Train Loss: {avg_loss:.4f}, LR: {optimizer.param_groups[0]['lr']:.6f}")


def validate_fixmatch(model, val_loader):
    model.eval()
    total = 0
    correct = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val = x_val.to(device)
            y_val = y_val.to(device)
            logits = model(x_val).squeeze(1)
            probs = torch.sigmoid(logits)

            predicted = (probs > 0.5).float()
            total += y_val.size(0)
            correct += (predicted == y_val).sum().item()

            all_labels.extend(y_val.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())

    acc = 100.0 * correct / total
    f1 = f1_score(all_labels, all_preds, pos_label=1)
    print(f"Validation Accuracy: {acc:.2f}%, F1-Score: {f1:.4f}")
    return f1

In [6]:
##############################################################################
# 8. RUN THE TRAINING
##############################################################################
best_f1 = -np.inf

for epoch in tqdm(range(num_epochs)):
    train_one_epoch_fixmatch(
        epoch, 
        model, 
        labeled_loader, 
        unlabeled_loader, 
        focal_criterion,
        unsup_criterion,
        optimizer, 
        scaler, 
        accumulation_steps
    )

    val_f1 = validate_fixmatch(model, val_loader)

    if epoch < warmup_epochs + 10:
        scheduler_warmup.step()
    else:
        scheduler_plateau.step(val_f1)

    early_stopping(val_f1, model)
    if early_stopping.early_stop:
        print("Early stopping triggered!")
        break

  0%|          | 0/100 [00:00<?, ?it/s]

[Epoch 1, Batch 10] SupLoss: 0.0511, UnsupLoss: 0.0000, TotalLoss: 0.0511
[Epoch 1, Batch 20] SupLoss: 0.0506, UnsupLoss: 0.0000, TotalLoss: 0.0506
>>> [Epoch 1] FixMatch Train Loss: 0.0503, LR: 0.000005
Validation Accuracy: 75.47%, F1-Score: 0.1333
Validation metric improved. Saving model to best_fixmatch_model.pt
[Epoch 2, Batch 10] SupLoss: 0.0506, UnsupLoss: 0.0000, TotalLoss: 0.0506
[Epoch 2, Batch 20] SupLoss: 0.0478, UnsupLoss: 0.0000, TotalLoss: 0.0478
>>> [Epoch 2] FixMatch Train Loss: 0.0486, LR: 0.000010
Validation Accuracy: 77.36%, F1-Score: 0.1429
Validation metric improved. Saving model to best_fixmatch_model.pt
[Epoch 3, Batch 10] SupLoss: 0.0434, UnsupLoss: 0.0000, TotalLoss: 0.0434
[Epoch 3, Batch 20] SupLoss: 0.0433, UnsupLoss: 0.0000, TotalLoss: 0.0433
>>> [Epoch 3] FixMatch Train Loss: 0.0469, LR: 0.000015
Validation Accuracy: 77.36%, F1-Score: 0.1429
EarlyStopping counter: 1 out of 50
[Epoch 4, Batch 10] SupLoss: 0.0461, UnsupLoss: 0.0000, TotalLoss: 0.0461
[Epoch 

In [25]:
model.load_state_dict(torch.load('best_fixmatch_model_0_90.pt'))
print("Loaded the best model from early stopping.")

  model.load_state_dict(torch.load('best_fixmatch_model_0_90.pt'))


Loaded the best model from early stopping.


In [18]:
def load_test_data(path):
    data = pd.read_excel(path)
    return data

test_df = load_test_data('test_set_for_labeling.xlsx')
test_df.head(10)

Unnamed: 0,downloadUrl
0,https://new-projects-team-public.s3.yandex.net...
1,https://new-projects-team-public.s3.yandex.net...
2,https://new-projects-team-public.s3.yandex.net...
3,https://new-projects-team-public.s3.yandex.net...
4,https://new-projects-team-public.s3.yandex.net...
5,https://new-projects-team-public.s3.yandex.net...
6,https://new-projects-team-public.s3.yandex.net...
7,https://new-projects-team-public.s3.yandex.net...
8,https://new-projects-team-public.s3.yandex.net...
9,https://new-projects-team-public.s3.yandex.net...


In [19]:
# download_images_from_csv("test_set_for_labeling.xlsx", "downloadUrl", "test_data")

In [20]:
class TreeTestDataset(Dataset):
    def __init__(self, dataframe, data_dir, transform=None):
        self.dataframe = dataframe
        self.transform = transform
        self.data_dir = data_dir

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.data_dir, self.dataframe.iloc[idx, 0].split('/')[-1])
        image = Image.open(img_name).convert("RGB")
        # print(image.shape, image.dtype)
        # label = self.dataframe.iloc[idx, 0]
        # print(type(label))
        # print(label, img_path)
        # image = sr_model(image.to(device))[0]
        if self.transform:
            image = self.transform(image)

        return image, self.dataframe.iloc[idx, 0]

In [26]:
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)), #224
    transforms.ToTensor(),
    transforms.Normalize(train_mean, train_std)
])

In [27]:
val_dataset = ImageDataset(val_df, 'labeled', transform)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, num_workers=8)
validate_fixmatch(model, val_loader)

Validation Accuracy: 94.34%, F1-Score: 0.8571


0.8571428571428571

In [31]:
val_dataset = ImageDataset(labeled, 'labeled', transform)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, num_workers=8)
validate_fixmatch(model, val_loader)

Validation Accuracy: 83.50%, F1-Score: 0.8325


0.8324873096446701

In [28]:
test_dataset = TreeTestDataset(test_df, data_dir="test_data", transform=transform)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, pin_memory=True, pin_memory_device='cuda')
predictions = []
img_paths = []
for inputs, paths in tqdm(test_loader):
    inputs = inputs.to(device)
    with torch.set_grad_enabled(False):
        outputs = model(inputs)
        preds = torch.argmax(torch.sigmoid(outputs), dim=-1).cpu().numpy()
        predictions.extend(preds)
        img_paths.extend(paths)

  0%|          | 0/88 [00:00<?, ?it/s]

In [29]:
header = "downloadUrl,is_conifer\n"
text_data = header + '\n'.join([f"{url},{str(bool(label)).upper()}" for url, label in zip(img_paths, predictions)])
# text_data
with open("test_result_labels.txt", 'w') as f:
    f.write(text_data)

__Analyze__

In [None]:
import requests
from torchvision.io import read_image
def download_image(url, save_path):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(save_path, 'wb') as out_file:
            out_file.write(response.content)
    except Exception as e:
        print(f"Error downloading {url}: {e}")


def download_images_from_csv(csv_file, image_column, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    if csv_file.endswith(".xlsx"):
        df = pd.read_excel(csv_file)
    else:
        df = pd.read_csv(csv_file, sep="\t")

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        url = row[image_column]
        filename = os.path.join(output_dir, url.split('/')[-1])
        download_image(url, filename)

In [12]:
crowdsourced_df = pd.read_csv('train_full.tsv', sep='\t', skiprows=1,
                              names=['downloadUrl', 'is_conifer', 'confidence'], usecols=range(3))
crowdsourced_df = crowdsourced_df[~crowdsourced_df['downloadUrl'].isin(labeled['downloadUrl'])]
crowdsourced_df['confidence'] = crowdsourced_df['confidence'].apply(lambda x: float(x[:-1]))
max_conf = crowdsourced_df['confidence'].max()
crowdsourced_df['confidence_normalized'] = crowdsourced_df['confidence'] / max_conf

In [13]:
crowdsourced_df['confidence'].min()

50.09

In [14]:
crowdsourced_df[crowdsourced_df['confidence_normalized'] >= 0.9].shape

(1337, 4)

In [20]:
# crowdsourced_df[crowdsourced_df['confidence_normalized'] == 1].downloadUrl.to_csv('test.csv')

In [4]:
import shutil

image_names = pd.read_csv('test.csv', header=None)[0]

source_folder = 'unlabeled'
destination_folder = 'labeled'

transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor()
])

for image_name in image_names:
    source_path = os.path.join(source_folder, image_name)
    destination_path = os.path.join(destination_folder, image_name)

    try:
        # shutil.copy2(source_path, destination_path)
        
        image = Image.open(destination_path)
        tensor_image = transform(image)

        # plt.imshow(tensor_image.permute(1, 2, 0))
        # plt.title(f"Image: {image_name}")
        # plt.axis('off')
        # plt.show()

    except FileNotFoundError:
        print(f"File not found: {image_name}")

In [9]:
combined_labeled_df.is_conifer.value_counts()

is_conifer
False    886
True     398
Name: count, dtype: int64