In [None]:
# Preprocessing the data into datasets! I have labelled all images and put them into separate folders (gasfles or 'other')

In [1]:
!pip install transformers "datasets>=1.17.0" tensorboard --upgrade
!pip install transformers[torch]
!pip3 install torch torchvision
!pip install scikit-learn
!pip install accelerate -U
!pip install seaborn
!pip install pandas

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/62/c0/810e741a6244c0f004be40ccb96486d072f042eabbd4d7e8aa02b81ca1eb/transformers-4.44.0-py3-none-any.whl.metadata
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting datasets>=1.17.0
  Obtaining dependency information for datasets>=1.17.0 from https://files.pythonhosted.org/packages/72/b3/33c4ad44fa020e3757e9b2fad8a5de53d9079b501e6bbc45bdd18f82f893/datasets-2.21.0-py3-none-any.whl.metadata
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting tensorboard
  Obtaining dependency information for tensorboard from https://files.pythonhosted.org/packages/d4/41/dccba8c5f955bc35b6110ff78574e4e5c8226ad62f08e732096c3861309b/tensorboard-2.17.1-py3-none-any.whl.metadata
  Downloading tensorboard-2.17.1

In [2]:
# Since there are often multiple frames of the same object, I dont want those different frames to end up in both test and trainset
import re
import os
import pandas as pd
def listdir_nohidden(path):
    return [f for f in os.listdir(path) if not f.startswith('.')]
    
    
def extract_timestamp(filename):
    # Regex to extract timestamp from the filename format
    match = re.search(r'vlcsnap-(\d+)-(\d+)-(\d+)-(\d+)h(\d+)m(\d+)s(\d+)', filename)
    if match:
        year, month, day, hour, minute, second, millisecond = map(int, match.groups())
        # Convert to total seconds (or any consistent time unit)
        timestamp = (((((year * 12 + month) * 30 + day) * 24 + hour) * 60 + minute) * 60 + second) * 1000 + millisecond
        return timestamp
    else:
        print("hey")

# Define your dataset path
dataset_path = 'c133'
classes = ['gasfles', 'other']

# Load image paths and timestamps
image_data = []

for class_name in classes:
    class_path = os.path.join(dataset_path, class_name)
    for filename in os.listdir(class_path):
        if filename.endswith('.png'):
            timestamp = extract_timestamp(filename)
            image_data.append((os.path.join(class_path, filename), timestamp, class_name))

# Create a DataFrame
df = pd.DataFrame(image_data, columns=['filepath', 'timestamp', 'class'])

# Sort by timestamp
df = df.sort_values(by='timestamp')

In [3]:
df

Unnamed: 0,filepath,timestamp,class
157,c133/gasfles/vlcsnap-2024-08-09-14h32m40s183.png,62976061960183,gasfles
462,c133/other/vlcsnap-2024-08-13-12h07m37s979.png,62976398857979,other
218,c133/gasfles/vlcsnap-2024-08-13-12h07m52s660.png,62976398872660,gasfles
225,c133/gasfles/vlcsnap-2024-08-13-12h07m53s616.png,62976398873616,gasfles
14,c133/gasfles/vlcsnap-2024-08-13-12h07m54s892.png,62976398874892,gasfles
...,...,...,...
77,c133/gasfles/vlcsnap-2024-08-16-00h20m30s382.png,62976615630382,gasfles
272,c133/gasfles/vlcsnap-2024-08-16-00h20m30s766.png,62976615630766,gasfles
106,c133/gasfles/vlcsnap-2024-08-16-00h20m31s094.png,62976615631094,gasfles
287,c133/gasfles/vlcsnap-2024-08-16-00h20m31s416.png,62976615631416,gasfles


In [17]:
def create_splits(df, interval=70000, test_ratio=0.2):
    # Calculate the number of test samples based on the ratio
    total_samples = len(df)
    num_test_samples = int(total_samples * test_ratio)
    
    train_files = []
    test_files = []
    last_test_time = -interval  
    
    for idx, row in df.iterrows():
        if len(test_files) < num_test_samples:
            if row['timestamp'] >= last_test_time + interval:
                # Assign to test set
                test_files.append((row['filepath'], row['class']))
                last_test_time = row['timestamp']
            elif row['timestamp'] < last_test_time + interval:
                # If within the interval since the last test entry, also add to test set
                test_files.append((row['filepath'], row['class']))
            else:
                # Assign to train set if not already in test
                train_files.append((row['filepath'], row['class']))
        else:
            # Assign remaining data to train set once test set size is met
            train_files.append((row['filepath'], row['class']))
            
    return train_files, test_files
train_files, test_files = create_splits(df)

In [18]:
train_files

[('c133/gasfles/vlcsnap-2024-08-13-14h49m49s336.png', 'gasfles'),
 ('c133/gasfles/vlcsnap-2024-08-13-14h49m49s726.png', 'gasfles'),
 ('c133/gasfles/vlcsnap-2024-08-13-14h49m50s095.png', 'gasfles'),
 ('c133/other/vlcsnap-2024-08-13-14h49m52s523.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-14h49m53s624.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-14h49m55s028.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-14h49m56s317.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-14h49m57s562.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-14h49m58s394.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-14h49m59s016.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-14h49m59s863.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-15h01m33s795.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-15h01m34s447.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-15h01m34s858.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-15h01m35s406.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-15h01m36s051.png', 'other

In [19]:
test_files

[('c133/gasfles/vlcsnap-2024-08-09-14h32m40s183.png', 'gasfles'),
 ('c133/other/vlcsnap-2024-08-13-12h07m37s979.png', 'other'),
 ('c133/gasfles/vlcsnap-2024-08-13-12h07m52s660.png', 'gasfles'),
 ('c133/gasfles/vlcsnap-2024-08-13-12h07m53s616.png', 'gasfles'),
 ('c133/gasfles/vlcsnap-2024-08-13-12h07m54s892.png', 'gasfles'),
 ('c133/gasfles/vlcsnap-2024-08-13-12h07m55s696.png', 'gasfles'),
 ('c133/gasfles/vlcsnap-2024-08-13-12h07m56s248.png', 'gasfles'),
 ('c133/gasfles/vlcsnap-2024-08-13-12h07m57s138.png', 'gasfles'),
 ('c133/gasfles/vlcsnap-2024-08-13-12h07m57s854.png', 'gasfles'),
 ('c133/gasfles/vlcsnap-2024-08-13-12h07m58s690.png', 'gasfles'),
 ('c133/gasfles/vlcsnap-2024-08-13-12h07m59s412.png', 'gasfles'),
 ('c133/other/vlcsnap-2024-08-13-12h10m36s796.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-12h10m37s312.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-12h10m38s049.png', 'other'),
 ('c133/other/vlcsnap-2024-08-13-12h10m48s220.png', 'other'),
 ('c133/other/vlcsnap-2024-08-

In [21]:
#move to new folders
import shutil
for file, class_name in test_files:
    path_split = file.split("/")[-1]
    shutil.copy(file, f"c133/test/{class_name}/{path_split}")

for file, class_name in train_files:
    path_split = file.split("/")[-1]
    shutil.copy(file, f"c133/train/{class_name}/{path_split}")