In [14]:
import pandas as pd
import joblib
import os
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelBinarizer

In [15]:
datasetPath = 'data'

In [16]:
# get all the image folder paths
all_paths = os.listdir(datasetPath)
folder_paths = [path for path in all_paths if os.path.isdir(datasetPath + '/' + path)]
print(f"Folder paths: {folder_paths}")
print(f"Number of folders: {len(folder_paths)}")

Folder paths: ['.ipynb_checkpoints', 'badminton', 'baseball', 'basketball', 'boxing', 'chess', 'cricket', 'fencing', 'football', 'formula1', 'gymnastics', 'hockey', 'ice_hockey', 'kabaddi', 'models', 'motogp', 'shooting', 'swimming', 'table_tennis', 'tennis', 'volleyball', 'weight_lifting', 'wrestling', 'wwe']
Number of folders: 24


In [17]:
# we will create the data for the following labels,
# add more to list to use those for creating the data as well
create_labels = ['basketball', 'boxing', 'chess']
# create a DataFrame
data = pd.DataFrame()

In [20]:
image_formats = ['jpg', 'JPG', 'PNG', 'png'] # we only want images that are in this format
labels = []
counter = 0
for i, folder_path in tqdm(enumerate(folder_paths), total=len(folder_paths)):
    if folder_path not in create_labels:
        continue
    image_paths = os.listdir(datasetPath + '/' + folder_path)
    label = folder_path
    # save image paths in the DataFrame
    for image_path in image_paths:
        if image_path.split('.')[-1] in image_formats:
            data.loc[counter, 'image_path'] = f"{datasetPath}/{folder_path}/{image_path}"
            labels.append(label)
            counter += 1

100%|██████████| 24/24 [00:00<00:00, 27.65it/s]


In [21]:
labels = np.array(labels)
# one-hot encode the labels
lb = LabelBinarizer()
labels = lb.fit_transform(labels)

In [25]:
if len(labels[0]) == 1:
    for i in range(len(labels)):
        index = labels[i]
        data.loc[i, 'target'] = int(index)
elif len(labels[0]) > 1:
    for i in range(len(labels)):
        index = np.argmax(labels[i])
        data.loc[i, 'target'] = int(index)

In [27]:
# shuffle the dataset
data = data.sample(frac=1).reset_index(drop=True)
print(f"Number of labels or classes: {len(lb.classes_)}")
print(f"The first one hot encoded labels: {labels[0]}")
print(f"Mapping the first one hot encoded label to its category: {lb.classes_[0]}")
print(f"Total instances: {len(data)}")

# save as CSV file
data.to_csv(datasetPath + '.csv', index=False)

# pickle the binarized labels
print('Saving the binarized labels as pickled file')
joblib.dump(lb, 'outputs/lb.pkl')

print(data.head(5))

Number of labels or classes: 3
The first one hot encoded labels: [1 0 0]
Mapping the first one hot encoded label to its category: basketball
Total instances: 1592
Saving the binarized labels as pickled file
                 image_path  target
0  data/boxing/00000623.jpg     1.0
1   data/chess/00000394.jpg     2.0
2   data/chess/00000382.jpg     2.0
3  data/boxing/00000256.jpg     1.0
4  data/boxing/00000363.jpg     1.0


In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import joblib
# load the binarized labels file
lb = joblib.load('outputs/lb.pkl')
class CustomCNN(nn.Module):
    def __init__(self):
        super(CustomCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, 5)
        self.conv2 = nn.Conv2d(16, 32, 5)
        self.conv3 = nn.Conv2d(32, 64, 3)
        self.conv4 = nn.Conv2d(64, 128, 5)
        self.fc1 = nn.Linear(128, 256)
        self.fc2 = nn.Linear(256, len(lb.classes_))
        self.pool = nn.MaxPool2d(2, 2)
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.pool(F.relu(self.conv4(x)))
        bs, _, _, _ = x.shape
        x = F.adaptive_avg_pool2d(x, 1).reshape(bs, -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

ModuleNotFoundError: No module named 'torch'

In [None]:
import torch
import argparse
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import joblib
import albumentations
import torch.optim as optim
import os
import cnn_models
import matplotlib
import matplotlib.pyplot as plt
import time
import pandas as pd
matplotlib.style.use('ggplot')
from imutils import paths
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from PIL import Image