0. Set up notebook to start from repo root

In [1]:
import os
# Switch the kernel's working directory to the repository root; the relative
# data/... paths used later in the notebook are resolved from here.
# NOTE(review): this is an absolute, machine-specific path — it must be edited
# before the notebook will run on any other machine or checkout location.
%cd C:\Users\benjo\Documents\Projects\wildlife-camtrap-classification

C:\Users\benjo\Documents\Projects\wildlife-camtrap-classification


1. Data loading and making of manifest
Extracts the relevant fields from the annotations JSON and saves them as a CSV manifest for later use.

In [2]:
# Show the working directory first so a wrong starting folder is obvious
# before any of the relative paths below are used.
print("cwd:", os.getcwd())

# Inputs: image folder and annotation file.
images = "data/cct/images"
annotations = "data/cct/annotations/caltech_images_20210113.json"

# Outputs: manifest, per-class counts, and the list of missing files.
manifest_out = "data/cct/manifest.csv"
counts_out = "data/cct/class_counts.csv"
missing_out = "data/cct/missing_files.txt"

cwd: C:\Users\benjo\Documents\Projects\wildlife-camtrap-classification


In [3]:
from src.data.make_manifest import load_coco, make_manifest_rows, write_manifest, write_counts

# Load the annotations and derive the manifest rows. `missing` is a
# collection of strings (presumably images that could not be found on disk —
# confirm against make_manifest_rows).
coco = load_coco(annotations)
rows, missing = make_manifest_rows(images, coco)

# Persist the manifest and the class counts.
write_manifest(rows, manifest_out)
write_counts(rows, counts_out)

# The missing-files report is only written when there is something to report.
has_missing = bool(missing)
if has_missing:
    with open(missing_out, "w", encoding="utf-8") as fh:
        fh.write("\n".join(missing))

# Summary of what was produced.
print(f'Manifest: {manifest_out}, Rows:{len(rows)}')
print(f'Class Counts : {counts_out}')
if has_missing:
    print(f'Missing Files: {missing_out}')


Manifest: data/cct/manifest.csv, Rows:243100
Class Counts : data/cct/class_counts.csv


2. Data Preparation

In [4]:
from src.data.split_dataset import load_manifest,split_df,summarise_split, save_splits

# Read the manifest back from disk.
df = load_manifest(manifest_out)
# NOTE(review): this preview is never rendered — Jupyter only displays the
# last expression of a cell. Wrap it in display(...) if the preview is wanted.
df.head()

# Percentage of missing (NaN) values per column.
missing_per = df.isna().sum().div(len(df)).mul(100)
print(missing_per)

# Count labels that are blank or whitespace-only once cast to str.
empty = df['label'].astype(str).str.strip().eq('')
print(empty.sum())

rel_path    0.0
label       0.0
location    0.0
dtype: float64
0


The checks above show no missing values in any manifest column (`rel_path`, `label`, `location`) and no blank labels, so nothing here would impede model training and no initial cleaning is necessary.

In [13]:
import pandas as pd
from src.data.split_dataset import make_subset

# A reduced subset is used for training, so the smallest classes are removed
# up front to avoid errors later on.
excluded_labels = ['insect', 'bat', 'pig']
df = df[~df['label'].isin(excluded_labels)]

# Split the data (0.8 is presumably the training fraction — confirm against
# split_df) and take an independent copy of each part.
train, val, test = (part.copy() for part in split_df(df, 0.8))

# One summary row per split, in Train / Test / Val order.
splits_summary = pd.DataFrame([
    summarise_split(frame, name)
    for frame, name in ((train, 'Train'), (test, 'Test'), (val, 'Val'))
])

splits_summary

Unnamed: 0,split,rows,cols,n_classes,n_groups,empty_count,empty_pct
0,Train,193082,3,19,109,107079,55.46
1,Test,22944,3,15,13,9160,39.92
2,Val,27063,3,17,18,9506,35.13


A subset of each split is used so that training is feasible on a local machine. The smallest classes were also removed to avoid errors when a split would contain only one example of a given class.

In [14]:
# Reduce every split with make_subset(..., 0.15) (presumably a 15% sample —
# confirm against make_subset), processed in train, val, test order.
# NOTE(review): the split variables are overwritten, so re-running this cell
# applies make_subset to the already-reduced splits; re-run the split cell
# first to start again from the full splits.
train, val, test = (make_subset(split, 0.15) for split in (train, val, test))

# Recompute the per-split summary for the reduced splits.
splits_summary = pd.DataFrame([
    summarise_split(frame, name)
    for frame, name in ((train, 'Train'), (test, 'Test'), (val, 'Val'))
])

# Persist the reduced splits, then display the summary table.
save_splits(train, val, test)
splits_summary


Unnamed: 0,split,rows,cols,n_classes,n_groups,empty_count,empty_pct
0,Train,28962,3,19,109,16062,55.46
1,Test,3441,3,15,13,1374,39.93
2,Val,4059,3,16,18,1426,35.13


Build the label map from class name (str) to integer id (int)

In [15]:
from src.data.labels import counts_to_ids

# Build the class-name -> integer-id label map from the class-counts CSV.
# `counts_out` is the path the manifest step wrote the counts to; reusing it
# (instead of repeating the literal) keeps this cell in sync if that path is
# ever changed. The map is also saved to `out_json` (presumably by
# counts_to_ids — confirm), which the model-creation cell reads back.
label_map = counts_to_ids(
    counts_csv=counts_out,
    out_json="data/cct/labels.json",
)

Model initialisation and dataset creation

In [16]:
from src.models.model import create_model
from src.data.dataset import create_dataloader, CCTImageDataset, transform_images

# Single source of truth for the input resolution, so the training and
# evaluation transforms cannot drift apart (evaluation previously used a
# different size from training, which looks like a typo).
IMG_SIZE = 224

# Separate transforms for training (train=True) and evaluation (train=False),
# both at the same resolution.
train_transform = transform_images(train=True, size=IMG_SIZE)
test_val_t = transform_images(train=False, size=IMG_SIZE)
img_dir = 'data/cct/images'

# One dataset per split; validation and test share the evaluation transform.
train_dataset = CCTImageDataset(train, img_dir, label_map, transform=train_transform)
test_dataset = CCTImageDataset(test, img_dir, label_map, transform=test_val_t)
val_dataset = CCTImageDataset(val, img_dir, label_map, transform=test_val_t)

# Only the training loader is shuffled; evaluation order stays fixed.
BATCH_size = 32
train_loader = create_dataloader(dataset=train_dataset, batch_size=BATCH_size, shuffle=True)
val_loader = create_dataloader(dataset=val_dataset, batch_size=BATCH_size, shuffle=False)
test_loader = create_dataloader(dataset=test_dataset, batch_size=BATCH_size, shuffle=False)

# The model is created from the saved label file.
model = create_model(labels_path="data/cct/labels.json")

In [17]:
# Sanity check: pull one batch from the training loader and confirm that the
# shapes, dtypes and value range look approximately correct.
# The batch is bound to `batch_images` / `batch_labels` rather than `images`,
# because `images` already holds the image-directory path used by the
# manifest cell; rebinding it to a tensor would break a re-run of that cell.
batch_images, batch_labels = next(iter(train_loader))
print(f"Images batch shape: {batch_images.shape}")
print(f"Labels batch shape: {batch_labels.shape}")
print(f"Image tensor dtype: {batch_images.dtype}")
print(f"Label tensor dtype: {batch_labels.dtype}")

print(f"Image tensor min value: {batch_images.min()}")
print(f"Image tensor max value: {batch_images.max()}")

print(f"First 5 labels: {batch_labels[:5]}")


Images batch shape: torch.Size([32, 3, 224, 224])
Labels batch shape: torch.Size([32])
Image tensor dtype: torch.float32
Label tensor dtype: torch.int64
Image tensor min value: -2.1179039478302
Image tensor max value: 2.640000104904175
First 5 labels: tensor([0, 9, 8, 0, 0])


3. Running the model: training and validation

In [None]:
from src.train import run,setup_loss_log
# The same log file is prepared by setup_loss_log and then passed to run,
# so the path is bound once and reused.
log_path = 'reports/performance_log.csv'
setup_loss_log(log_path)

# Train on train_loader, validating on val_loader.
run(
    train_loader,
    val_loader,
    model,
    epochs=6,
    lr=1e-3,
    patience=3,
    log_path=log_path,
)