In [2]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import MobileNetV2
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator


# Use one shared seed value for both NumPy's and TensorFlow's global RNGs
# so the two libraries are always seeded consistently.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Labelled CSV, resolved against the directory the notebook is run from.
labeled_mapped_csv_path = Path.cwd().joinpath("labeled_mapped.csv")

# Class name -> integer id; ids follow the order in which the names are listed.
labels_to_ids = {
    name: index for index, name in enumerate(("Educational", "Entertainment"))
}

# Number of distinct classes in the mapping above.
num_labels = len(labels_to_ids)

In [4]:
# Read the labelled CSV into a dataframe.
df = pd.read_csv(labeled_mapped_csv_path)

# Report the header and the overall size of the raw data.
column_names = list(df.columns)
print("Column names:", column_names)
print("Dataframe shape:", df.shape)

# Class distribution before balancing.
label_counts = df['label'].value_counts()
print()
print("Elements per label before making them equal:")
print(label_counts)

# Undersample every label down to the size of the rarest one, then discard the
# 'title' column. groupby().head() keeps the FIRST rows of each label in file
# order (it is not a random sample).
least_number_of_elements = label_counts.min()
df = (
    df.groupby('label')
    .head(least_number_of_elements)
    .reset_index(drop=True)
    .drop(columns=['title'])
)

# Class distribution after balancing.
print()
print("Elements per label after making them equal:")
print(df['label'].value_counts())

Column names: ['label', 'title', 'path']
Dataframe shape: (2736, 3)

Elements per label before making them equal:
label
Entertainment    1825
Educational       911
Name: count, dtype: int64

Elements per label after making them equal:
label
Entertainment    911
Educational      911
Name: count, dtype: int64


In [5]:
# Show the size and the first rows of the dataframe as it currently stands.
frame_shape = df.shape
frame_preview = df.head()

print()
print("Dataframe shape:", frame_shape)
print("Dataframe head:", frame_preview, sep="\n")



Dataframe shape: (1822, 2)
Dataframe head:
           label   path
0  Entertainment  1.jpg
1  Entertainment  2.jpg
2  Entertainment  3.jpg
3  Entertainment  4.jpg
4    Educational  5.jpg


In [6]:
paths = df['path']
labels = df['label']

# Split configuration.
TEST_FRACTION = 0.2               # share of the full dataset held out for testing
VAL_FRACTION_OF_REMAINDER = 0.25  # 0.25 of the remaining 80% = 20% of the full dataset
SPLIT_SEED = 42                   # fixed so the split is the same on every run

# Hold out the test set first. `stratify` keeps the label proportions of the
# input in both resulting parts, so each split keeps the input's class balance
# instead of drifting with the random draw.
paths_temp, paths_test, labels_temp, labels_test = train_test_split(
    paths,
    labels,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=labels,
)

# Split the remainder into separate training and validation sets, again
# stratified on the label.
paths_train, paths_val, labels_train, labels_val = train_test_split(
    paths_temp,
    labels_temp,
    test_size=VAL_FRACTION_OF_REMAINDER,
    random_state=SPLIT_SEED,
    stratify=labels_temp,
)

# Sanity check: the three splits together must account for every input row.
assert len(paths_train) + len(paths_val) + len(paths_test) == len(paths)

print("Training set size: ", len(paths_train))
print("Validation set size: ", len(paths_val))
print("Test set size: ", len(paths_test))

Training set size:  1092
Validation set size:  365
Test set size:  365


In [8]:
# Settings shared by all three generators.
IMAGE_DIR = './mapped_unlabeled_data/'  # directory the filenames are resolved against
IMAGE_SIZE = (224, 224)                 # images are resized to this (height, width)
BATCH_SIZE = 4


def build_split_generator(image_datagen, filenames, class_labels, shuffle):
    """Create a batch iterator over one data split.

    Args:
        image_datagen: ImageDataGenerator that loads and rescales the images.
        filenames: image file names, resolved relative to IMAGE_DIR.
        class_labels: class label for each entry of `filenames`.
        shuffle: whether the iterator shuffles the sample order.

    Returns:
        The iterator returned by `image_datagen.flow_from_dataframe`, yielding
        batches of BATCH_SIZE images resized to IMAGE_SIZE with categorical
        (one-hot) labels.
    """
    split_frame = pd.DataFrame({'filename': filenames, 'class': class_labels})
    return image_datagen.flow_from_dataframe(
        dataframe=split_frame,
        directory=IMAGE_DIR,
        x_col='filename',
        y_col='class',
        target_size=IMAGE_SIZE,
        batch_size=BATCH_SIZE,
        class_mode='categorical',
        shuffle=shuffle,
    )


# Pixel values are scaled from [0, 255] to [0, 1].
# NOTE(review): MobileNetV2 is imported at the top of this notebook; if its
# pretrained ImageNet weights are used, the matching preprocess_input scales
# to [-1, 1] rather than [0, 1] - confirm this rescale is intended.
datagen = ImageDataGenerator(rescale=1./255.)

# Only the training data is shuffled. Validation and test batches keep the
# dataframe order so that predictions line up with `generator.classes` and
# `generator.filenames` when the model is evaluated.
train_generator = build_split_generator(datagen, paths_train, labels_train, shuffle=True)
val_generator = build_split_generator(datagen, paths_val, labels_val, shuffle=False)
test_generator = build_split_generator(datagen, paths_test, labels_test, shuffle=False)

Found 1092 validated image filenames belonging to 2 classes.
Found 365 validated image filenames belonging to 2 classes.


Found 365 validated image filenames belonging to 2 classes.


In [None]:
# Plot the per-epoch loss and accuracy curves, training vs. validation.
# NOTE(review): `model` is not defined in any cell shown above (execution
# counts jump from In [6] to In [8] and this cell is unexecuted); presumably a
# training cell must run first - confirm the notebook survives Run All.
history = model.history

# One entry per subplot:
# (training key, validation key, training label, validation label, title, y-axis label)
panels = (
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Training and Validation Loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
     'Training and Validation Accuracy', 'Accuracy'),
)

figure, axis = plt.subplots(1, 2, figsize=(16, 5))
for panel, (train_key, val_key, train_label, val_label, title, y_label) in zip(axis, panels):
    panel.plot(history.history[train_key], label=train_label)
    panel.plot(history.history[val_key], label=val_label)
    panel.set_title(title)
    panel.set_xlabel('Epochs')
    panel.set_ylabel(y_label)
    panel.legend()

plt.show()