In [7]:
import torch
import numpy as np
import pandas as pd
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from tqdm import tqdm
from google.colab import drive
from PIL import Image

drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/Image and Video recognition/dataset
root = "/content/drive/MyDrive/Colab Notebooks/Image and Video recognition/dataset"

class CustomDatasetFromCSV(Dataset):
    """In-memory dataset of images and their labels, described by a CSV file.

    The CSV is read with ``pd.read_csv`` and must provide the columns
    ``word_path`` (image file path), ``content``, ``font`` and ``author``.
    Every image is decoded eagerly in ``__init__`` and kept in memory, so
    construction cost grows with the number of rows.
    """

    def __init__(self, csv_path, transform = None):
        """Read the annotation CSV and load every referenced image.

        Args:
            csv_path: Path of the CSV file, passed straight to ``pd.read_csv``.
            transform: Optional callable applied to each image inside
                ``__getitem__``. ``None`` (the default) returns the raw array.
        """
        self.data = pd.read_csv(csv_path)
        self.transform = transform
        self.contents = np.asarray(self.data['content'])
        self.fonts = np.asarray(self.data['font'])
        self.authors = np.asarray(self.data['author'])
        self.len = len(self.contents)
        # NOTE(review): np.asarray over the per-image arrays presumably relies on
        # all images sharing one shape -- confirm against the data.
        self.images = np.asarray([ self._load_image(path) for path in tqdm(self.data['word_path']) ])

    @staticmethod
    def _load_image(path):
        """Decode one image file into a NumPy array and close the file handle."""
        # The context manager releases the file once the pixels are copied,
        # instead of leaving one open handle per image.
        with Image.open(path) as img:
            return np.array(img)

    def __getitem__(self, index):
        """Return ``(image, content, font, author)`` for the row at ``index``.

        The image is passed through ``transform`` when one was supplied.
        """
        image = self.images[index]
        if self.transform is not None:
            image = self.transform(image)
        content = self.contents[index]
        font = self.fonts[index]
        author = self.authors[index]
        return image, content, font, author

    def __len__(self):
        """Return the number of rows (samples) read from the CSV."""
        return self.len

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Image and Video recognition/dataset


In [11]:
def split_dataset(dataset, batch_size, split_size, method = "all", shuffle_dataset = True, seed = 0):
    """Split ``dataset`` into train / validation / test DataLoaders.

    Args:
        dataset: Any object supporting ``len()`` and indexing that
            ``DataLoader`` accepts.
        batch_size: Batch size used for all three loaders.
        split_size: Sequence of fractions. ``split_size[0]`` is the train
            share and ``split_size[1]`` the validation share; everything
            after those two shares becomes the test subset (a third entry,
            if present, is not read).
        method: Splitting strategy. Only ``"all"`` is implemented.
        shuffle_dataset: When true, the index list is shuffled before it is
            cut, using ``seed``.
        seed: Seed passed to ``np.random.seed`` before shuffling. Note that
            this reseeds NumPy's global random state.

    Returns:
        ``(train_loader, validation_loader, test_loader)``, each a
        ``DataLoader`` over ``dataset`` driven by a ``SubsetRandomSampler``
        restricted to its own indices.

    Raises:
        ValueError: If ``method`` is not a supported strategy.
    """
    # Fail early with a clear error; previously an unknown method only printed
    # a message and then crashed with NameError on the undefined index lists.
    if method != "all":
        raise ValueError(f"wrong method!! got {method!r}, only 'all' is supported")

    num_samples = len(dataset)
    indices = list(range(num_samples))
    if shuffle_dataset:
        np.random.seed(seed)
        np.random.shuffle(indices)

    split_1 = int(np.floor(split_size[0] * num_samples))
    split_2 = int(np.floor((split_size[0] + split_size[1]) * num_samples))
    train_indices = indices[:split_1]
    val_indices = indices[split_1:split_2]
    test_indices = indices[split_2:]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    test_sampler = SubsetRandomSampler(test_indices)
    train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    validation_loader = DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
    test_loader = DataLoader(dataset, batch_size=batch_size, sampler=test_sampler)

    return train_loader, validation_loader, test_loader

In [6]:
# Build the in-memory dataset from the annotation CSV. The file name is
# relative, so it is resolved against the current working directory.
annotated_file = "annotated_merged_deleted_x_2.csv"
dataset = CustomDatasetFromCSV(csv_path = annotated_file)

100%|██████████| 2896/2896 [12:45<00:00,  3.78it/s]


In [17]:
# hyper-parameters
batch_size = 16
split_size = [0.8, 0.1, 0.1]  # train and validation fractions; the remainder is the test share
num_epochs = 10

# Despite the *_set names, the three objects returned here are DataLoaders.
train_set, valid_set, test_set = split_dataset(
    dataset = dataset,
    batch_size = batch_size,
    split_size = split_size,
    method = "all",
    shuffle_dataset = True,
)

# Usage Example:
for epoch in range(num_epochs):
    # Train:
    for batch_index, (images, contents, fonts, authors) in enumerate(train_set):
        pass

train_indices [2309, 1323, 22, 1005, 248, 1009, 2706, 2181, 841, 1364, 436, 2119, 454, 2106, 806, 1038, 600, 2325, 311, 1242, 615, 1225, 933, 1086, 302, 352, 2235, 2019, 118, 306, 1752, 2725, 2850, 117, 2452, 2874, 2060, 2045, 643, 138, 692, 2225, 2534, 1801, 1964, 1573, 2877, 1083, 719, 547, 641, 1902, 333, 2025, 410, 496, 276, 2581, 396, 1696, 1949, 998, 918, 2286, 2771, 2894, 2718, 2112, 2842, 33, 1114, 1419, 104, 1875, 1647, 1290, 840, 1451, 543, 1540, 701, 2664, 2170, 961, 92, 227, 906, 1180, 1767, 2462, 1148, 1486, 2507, 418, 2787, 2695, 648, 1661, 2779, 1824, 1062, 2585, 217, 501, 2414, 831, 1519, 982, 1330, 2798, 2074, 1387, 1280, 599, 1418, 1779, 2677, 898, 380, 1157, 661, 182, 2649, 2686, 2252, 1074, 1256, 399, 905, 2626, 2622, 2551, 2205, 224, 672, 1768, 2744, 1063, 762, 2508, 868, 2189, 2410, 1450, 465, 1927, 195, 10, 2295, 1632, 2557, 566, 1753, 983, 2559, 2399, 951, 2600, 2542, 1628, 1961, 70, 253, 621, 298, 15, 820, 674, 252, 1785, 2450, 2161, 1237, 565, 2249, 923, 2749,