In [1]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50, EfficientNetB7
from tensorflow.keras.models import Model
import tensorflow.keras.layers as L
from google.colab import drive
from tensorflow.keras.optimizers import Adam
from sklearn.utils.class_weight import compute_class_weight
import pathlib
import zipfile
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import classification_report
import random
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Mount Google Drive (Colab-only helper) so the dataset archives stored under
# /content/drive can be read by the cells below.
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
# Drive folder that holds the dataset archives.
zip_folder = "/content/drive/MyDrive"

# One archive per image source; 'wiki' is later mapped to label 0 and the other three to label 1.
zip_files = ["inpainting.zip", "insight.zip", "text2img.zip", "wiki.zip"]

# NOTE(review): not referenced by any cell visible in this notebook — the extraction loop
# writes to the relative path "extracted_data/" instead. Confirm whether it is still needed.
extract_dir = "/content/extracted_data"

In [6]:
# Extract every dataset archive into the relative folder "extracted_data/".
# Corrupt archives are reported and remembered so that the closing message reflects
# what actually happened instead of always claiming success.
failed_archives = []
for file_name in zip_files:
    zip_path = os.path.join(zip_folder, file_name)

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall("extracted_data/")
            print(f"Extracted: {file_name}")
    except zipfile.BadZipFile:
        print(f"Error: {file_name} is not a valid zip file.")
        failed_archives.append(file_name)

if failed_archives:
    # Downstream cells would otherwise silently train on an incomplete dataset.
    print(f"Extraction finished with errors; failed archives: {failed_archives}")
else:
    print("All files extracted successfully!")

Extracted: inpainting.zip
Extracted: insight.zip
Extracted: text2img.zip
Extracted: wiki.zip
All files extracted successfully!


In [7]:
# Collect every .jpg located exactly two directory levels below the extraction
# root (<source>/<subfolder>/<file>.jpg) as plain string paths.
data_dir = pathlib.Path("extracted_data")
image_paths = [str(jpg_path) for jpg_path in data_dir.glob('*/*/*.jpg')]

In [8]:
# Binary labelling scheme: 'wiki' maps to 0, every other source maps to 1.
class_names = ['inpainting', 'insight', 'text2img', 'wiki']
class_indices = {name: int(name != 'wiki') for name in class_names}

# The source of an image is the name of its grandparent directory.
labels = []
for image_path in image_paths:
    source_dir = pathlib.Path(image_path).parent.parent
    labels.append(class_indices[source_dir.name])

In [9]:
# Quick sanity check of what was collected before the data is shuffled and split.
for caption, value in (
    ("Class Names:", class_names),
    ("Class Indices:", class_indices),
    ("Total Image Paths:", len(image_paths)),
    ("First 5 Image Paths:", image_paths[:5]),
    ("First 5 Labels:", labels[:5]),
):
    print(caption, value)

Class Names: ['inpainting', 'insight', 'text2img', 'wiki']
Class Indices: {'inpainting': 1, 'insight': 1, 'text2img': 1, 'wiki': 0}
Total Image Paths: 120000
First 5 Image Paths: ['extracted_data/wiki/35/3501535_1936-03-24_2011.jpg', 'extracted_data/wiki/35/12109435_1957-07-21_2006.jpg', 'extracted_data/wiki/35/2585235_1935-10-29_1962.jpg', 'extracted_data/wiki/35/2173335_1983-10-21_2010.jpg', 'extracted_data/wiki/35/1021335_1964-09-24_2006.jpg']
First 5 Labels: [0, 0, 0, 0, 0]


In [10]:
# Shuffle paths and labels together so every path keeps its own label.
#
# The pairs are sorted first because directory listing order is filesystem-dependent,
# and a dedicated seeded generator is used, so the shuffle — and therefore the
# train/validation/test split derived from it — is the same on every run and does not
# change when this cell is executed more than once.
SHUFFLE_SEED = 42
combined = sorted(zip(image_paths, labels))
random.Random(SHUFFLE_SEED).shuffle(combined)

# Unzip after shuffling (comprehensions also cope with an empty dataset,
# where `zip(*combined)` would fail to unpack).
image_paths = [path for path, _ in combined]
labels = [label for _, label in combined]

In [11]:
# 70 % of the images are used for training and 10 % for validation;
# the remainder is sliced off as the test set further down.
total_size = len(image_paths)
train_size, val_size = (int(fraction * total_size) for fraction in (0.7, 0.1))

In [12]:
# Contiguous, non-overlapping windows over the shuffled lists: [train | validation | test].
train_window = slice(None, train_size)
val_window = slice(train_size, train_size + val_size)
test_window = slice(train_size + val_size, None)

train_paths, train_labels = image_paths[train_window], labels[train_window]
val_paths, val_labels = image_paths[val_window], labels[val_window]
test_paths, test_labels = image_paths[test_window], labels[test_window]

In [13]:
def load_image(img_path, label):
    """Read one JPEG file and turn it into a model-ready image tensor.

    Args:
        img_path: Path of the JPEG file to read.
        label: Label belonging to the image; passed through untouched.

    Returns:
        An ``(image, label)`` pair where ``image`` has been decoded to three
        channels, resized to 224x224 and divided by 255.
    """
    raw_bytes = tf.io.read_file(img_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, (224, 224))
    return resized / 255.0, label  # Normalize

In [14]:
# Training pipeline: shuffle the (path, label) pairs, decode the images in parallel,
# then batch and prefetch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_paths, train_labels))
    .shuffle(len(train_paths))
    .map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)


In [15]:
# Validation pipeline: same decoding as training, but without shuffling.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_paths, val_labels))
    .map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)


In [16]:
# Test pipeline: left unshuffled so predictions line up with `test_labels` by position.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_paths, test_labels))
    .map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)


In [17]:
from collections import Counter

# Tally how many samples of each class ended up in the train, validation and test splits.
train_class_counts, val_class_counts, test_class_counts = map(
    Counter, (train_labels, val_labels, test_labels)
)

for caption, counts in (
    ("Train Class Distribution:", train_class_counts),
    ("Validation Class Distribution:", val_class_counts),
    ("Test Class Distribution:", test_class_counts),
):
    print(caption, counts)


Train Class Distribution: Counter({1: 62791, 0: 21209})
Validation Class Distribution: Counter({1: 9032, 0: 2968})
Test Class Distribution: Counter({1: 18177, 0: 5823})


In [18]:
def augment_data(x, y):
    """Apply random flips and mild brightness/contrast jitter to training images.

    Args:
        x: Image tensor with pixel values scaled to [0, 1]. As wired in this
            notebook it is applied to an already-batched dataset.
        y: Labels belonging to ``x``; returned unchanged.

    Returns:
        The augmented images, clipped back into [0, 1], and the untouched labels.
    """
    x = tf.image.random_flip_left_right(x)
    x = tf.image.random_flip_up_down(x)
    x = tf.image.random_brightness(x, max_delta=0.1)
    x = tf.image.random_contrast(x, lower=0.9, upper=1.1)
    # Brightness/contrast jitter can push pixels outside the [0, 1] range that the
    # validation and test images stay in, so clamp them back.
    x = tf.clip_by_value(x, 0.0, 1.0)
    return x, y


# This map is appended after the pipeline's existing prefetch, so run it in parallel
# and prefetch again; otherwise the augmentation executes sequentially on the
# critical path of every training step.
# NOTE: re-running this cell stacks another augmentation stage onto train_dataset.
train_dataset = train_dataset.map(
    augment_data, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

In [19]:
# Halve the learning rate (down to at most 1e-6) when validation loss stalls.
#
# patience is deliberately lower than the EarlyStopping patience (3) used for training:
# with equal values both callbacks fire at the end of the same epoch, so training
# would stop before the reduced learning rate was ever used.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6, verbose=1
)

In [20]:
# "balanced" weights are inversely proportional to class frequency, which compensates
# for the roughly 3:1 imbalance between label 1 and label 0 in the training split.
class_labels = np.unique(train_labels)
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=class_labels,
    y=train_labels
)

# Key each weight by its actual class label rather than by its position in the array,
# so the mapping stays correct even if a class is missing from the training split.
# Plain int/float values keep the displayed dict and the Keras argument free of NumPy scalars.
class_weights = {
    int(cls): float(weight) for cls, weight in zip(class_labels, balanced_weights)
}
class_weights

{0: 1.980291385732472, 1: 0.6688856683282636}

In [21]:
def make_model():
    """Build and compile the binary classifier on top of an ImageNet-initialised ResNet50.

    Returns:
        A compiled Keras ``Model`` that takes 224x224 RGB images and produces a
        single sigmoid output.
    """
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    # The whole backbone is fine-tuned rather than frozen.
    backbone.trainable = True

    # Classification head: global pooling, two ReLU layers, one sigmoid unit.
    features = L.GlobalAveragePooling2D()(backbone.output)
    for units in (256, 64):
        features = L.Dense(units, activation='relu')(features)
    probability = L.Dense(1, activation='sigmoid')(features)

    classifier = Model(inputs=backbone.input, outputs=probability)
    classifier.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# Instantiate the network and print its layer-by-layer summary.
model = make_model()
model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step


In [22]:
# Stop once val_loss has gone 3 epochs without improving, and roll the model back
# to the weights of its best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

In [23]:
# Train for at most 10 epochs. The callbacks may lower the learning rate or stop early,
# and class_weight counteracts the class imbalance in the training split.
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    class_weight=class_weights,
    callbacks=[lr_scheduler, early_stopping],
)

Epoch 1/10
[1m2625/2625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m870s[0m 305ms/step - accuracy: 0.7506 - loss: 0.4630 - val_accuracy: 0.5375 - val_loss: 1.3456 - learning_rate: 1.0000e-04
Epoch 2/10
[1m2625/2625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m791s[0m 301ms/step - accuracy: 0.8698 - loss: 0.2810 - val_accuracy: 0.7368 - val_loss: 0.5758 - learning_rate: 1.0000e-04
Epoch 3/10
[1m2625/2625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m801s[0m 301ms/step - accuracy: 0.9152 - loss: 0.1899 - val_accuracy: 0.9311 - val_loss: 0.1556 - learning_rate: 1.0000e-04
Epoch 4/10
[1m2625/2625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m802s[0m 305ms/step - accuracy: 0.9595 - loss: 0.0966 - val_accuracy: 0.9643 - val_loss: 0.0817 - learning_rate: 1.0000e-04
Epoch 5/10
[1m2625/2625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m862s[0m 305ms/step - accuracy: 0.9723 - loss: 0.0668 - val_accuracy: 0.7506 - val_loss: 1.2082 - learning_rate: 1.0000e-04
Epoch 6/10
[1m2625/

<keras.src.callbacks.history.History at 0x7bc23edf20d0>

In [24]:
# Score the test split, then threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
y_pred_probs = model.predict(test_dataset)
y_pred = np.greater(y_pred_probs, 0.5).astype(int)

[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 102ms/step


In [25]:
# Per-class precision / recall / F1 of the trained model on the test split.
test_report = classification_report(
    test_labels,
    y_pred,
    target_names=["wiki (0)", "other (1)"],
)
print(test_report)

              precision    recall  f1-score   support

    wiki (0)       0.95      0.98      0.96      5823
   other (1)       0.99      0.98      0.99     18177

    accuracy                           0.98     24000
   macro avg       0.97      0.98      0.97     24000
weighted avg       0.98      0.98      0.98     24000



In [26]:
# Persist the trained model to Google Drive as a .keras file.
model_save_path = "/content/drive/MyDrive/ResNet50_Latest"
model.save(f"{model_save_path}.keras")


In [27]:
# Also keep a copy in the current working directory; the next cell reloads this file.
model.save("ResNet50_finetuned_Latest.keras")

In [28]:
# Round-trip check: reload the file that was just saved and re-score the test split
# with it, using the same 0.5 decision threshold as before.
final_model = tf.keras.models.load_model('ResNet50_finetuned_Latest.keras')
y_pred_probs = final_model.predict(test_dataset)
y_pred = np.greater(y_pred_probs, 0.5).astype(int)

[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 95ms/step


In [29]:
# Same report, now computed from the reloaded model's predictions.
reloaded_report = classification_report(
    test_labels,
    y_pred,
    target_names=["wiki (0)", "other (1)"],
)
print(reloaded_report)

              precision    recall  f1-score   support

    wiki (0)       0.95      0.98      0.96      5823
   other (1)       0.99      0.98      0.99     18177

    accuracy                           0.98     24000
   macro avg       0.97      0.98      0.97     24000
weighted avg       0.98      0.98      0.98     24000



In [30]:
# Additional export with an .h5 extension (the legacy HDF5 format in Keras).
# NOTE(review): the model was already written as .keras files in earlier cells —
# confirm that something still consumes this .h5 copy.
model.save('ResNet50_latest.h5')


