In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
import numpy as np
import pandas as pd
import logging
import warnings as warnings
import random
import pickle

import tensorflow as tf
from tensorflow.keras.applications import Xception
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import (
    EarlyStopping,
    ReduceLROnPlateau,
    TensorBoard,
    ModelCheckpoint,
    Callback
)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [2]:
# Configuration and Setup

# Suppress warnings
warnings.filterwarnings('ignore')

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,  # Set to DEBUG level for detailed logging
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Set seeds for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

# Enable memory growth for GPUs
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Enabled memory growth for {len(gpus)} GPU(s).")
    except RuntimeError as e:
        print(f"Error setting memory growth: {e}")


Enabled memory growth for 2 GPU(s).


In [3]:
test_dir = '/kaggle/input/ai-knight/dataset/test'
train_dir = '/kaggle/input/ai-knight/dataset/train'

IMG_SIZE = (224,224)
BATCH_SIZE = 32

In [4]:
# Data Loading and Preprocessing

# Load data
# Load training data
train_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    train_dir,
    validation_split=0.2,            # Reserve 20% of data for validation
    subset="training",               # Load the training subset
    seed=42,                         # Set seed for reproducibility
    labels="inferred",               # Automatically assigns labels
    label_mode="int",                # Outputs integer labels
    image_size=IMG_SIZE,             # Resize images to target size
    batch_size=BATCH_SIZE,           # Batch size
    shuffle=True                     # Shuffle the data
)

#load val data '
val_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    train_dir,
    validation_split=0.2,            # Reserve 20% of data for validation
    subset="validation",             # Load the validation subset
    seed=42,                         # Set seed for reproducibility
    labels="inferred",               # Automatically assigns labels
    label_mode="int",                # Outputs integer labels
    image_size=IMG_SIZE,             # Resize images to target size
    batch_size=BATCH_SIZE,           # Batch size
    shuffle=True                     # Shuffle the data
)
#fake - 0, real - 1

Found 100000 files belonging to 2 classes.
Using 80000 files for training.
Found 100000 files belonging to 2 classes.
Using 20000 files for validation.


In [5]:
print(f"Training batches: {len(train_dataset)}")
print(f"Validation batches: {len(val_dataset)}")
print(train_dataset.class_names)  # Output: ['fake', 'real']

Training batches: 2500
Validation batches: 625
['fake', 'real']


In [6]:
#build the model 

input_shape = IMG_SIZE + (3,)
base_model = Xception(include_top=False, weights='imagenet', input_shape=input_shape)
base_model.trainable = False  # Freeze the base model
# print("Loaded EfficientNetB3 base model with ImageNet weights.")

# Add custom layers on top of the base model
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

# Add the output layer
output = Dense(1, activation='sigmoid')(x)  # Single unit with sigmoid activation for binary classification

# Define the complete model
model = Model(inputs=base_model.input, outputs=output)
print("Model architecture created.")

# Print model summary
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m83683744/83683744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Model architecture created.


In [7]:
# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),  # Use Adam optimizer with a learning rate of 1e-3 
    loss='binary_crossentropy',                              # Binary cross-entropy loss for binary classification
    metrics=['accuracy']                                     # Track accuracy during training
)

In [8]:
# Define Callbacks

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',                # Monitor validation loss
    patience=5,                        # Stop training if no improvement for 5 epochs
    restore_best_weights=True,         # Restore model weights from the epoch with the best value
    verbose=1                          # Print logs when stopping
)

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',                # Monitor validation loss
    factor=0.5,                        # Reduce learning rate by half
    patience=2,                        # Wait for 2 epochs before reducing
    verbose=1                          # Print logs about learning rate changes
)

checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_model.keras',       # Filepath to save the best model
    monitor='val_loss',                # Monitor validation loss
    save_best_only=True,               # Save only if the model improves
    verbose=1                          # Print logs when saving
)

tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='./logs',                  # Directory to save logs
    histogram_freq=1                   # Frequency (in epochs) to compute histograms
)

class BatchLoggingCallback(tf.keras.callbacks.Callback): #can be removed 
    def __init__(self, log_every=10):
        super(BatchLoggingCallback, self).__init__()
        self.log_every = log_every

    def on_train_batch_end(self, batch, logs=None):
        if (batch + 1) % self.log_every == 0:
            log_message = f"Batch {batch + 1}: " + ", ".join([f"{k}={v:.4f}" for k, v in logs.items()])
            print(log_message)

batch_logging = BatchLoggingCallback(log_every=10)



callbacks = [early_stop, reduce_lr, checkpoint, tensorboard_callback, batch_logging]


In [9]:
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,                      # Set the maximum number of epochs
    callbacks=callbacks,             # Add callbacks to monitor and enhance training
    verbose=1
    #also give step per epoch
)

Epoch 1/10
[1m   9/2500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:29[0m 108ms/step - accuracy: 0.4491 - loss: 0.9094Batch 10: accuracy=0.4656, loss=0.8857
[1m  19/2500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:20[0m 105ms/step - accuracy: 0.4719 - loss: 0.8844Batch 20: accuracy=0.5203, loss=0.8256
[1m  29/2500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:17[0m 104ms/step - accuracy: 0.4924 - loss: 0.8589Batch 30: accuracy=0.5490, loss=0.7820
[1m  39/2500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:15[0m 104ms/step - accuracy: 0.5072 - loss: 0.8404Batch 40: accuracy=0.5578, loss=0.7839
[1m  49/2500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:14[0m 104ms/step - accuracy: 0.5194 - loss: 0.8278Batch 50: accuracy=0.5769, loss=0.7768
[1m  59/2500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:13[0m 104ms/step - accuracy: 0.5295 - loss: 0.8186Batch 60: accuracy=0.5807, loss=0.7699
[1m  69/2500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:11[0m 104ms/step - accuracy: 0.5373 - loss: 0.8112Batch 70: accuracy=0

In [11]:
#Fine tuuning model start 


# Unfreeze the base model
base_model.trainable = True

# Freeze the first 100 layers (optional, based on your use case)
for layer in base_model.layers[:100]:
    layer.trainable = False

# Recompile the model with a lower learning rate
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),  # Reduced learning rate
    loss='binary_crossentropy', 
    metrics=['accuracy']
)

# Fine-tuning training
fine_tune_epochs = 10  # Adjust based on experiment results

history_fine = model.fit(
    train_dataset,                    # Your training dataset
    validation_data=val_dataset,      # Your validation dataset
    epochs=fine_tune_epochs,          # Number of fine-tuning epochs
    callbacks=callbacks,              # Previously defined callbacks
    verbose=1                         # Displays training progress
)


Epoch 1/10


KeyboardInterrupt: 

In [13]:
# Save the final trained model
final_model_path = '/kaggle/working/Xception_model.h5'
model.save(final_model_path)
print(f"Final model saved as {final_model_path}.")

Final model saved as /kaggle/working/Xception_model.h5.


In [14]:
from tensorflow.keras.models import load_model
model = load_model(final_model_path)
print("Model loaded successfully.")

Model loaded successfully.


In [15]:
test_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    test_dir,
    labels=None,                     # Test data may not have labels
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False                    # Keep test data in order for evaluation
)


Found 500 files.


In [16]:
test_file_paths = test_dataset.file_paths  # Use file_paths to get the paths of the test images

# Predict on the test dataset
predictions = model.predict(test_dataset)
predicted_classes = (predictions > 0.5).astype(int).flatten()  # Convert probabilities to binary classes

# Create a DataFrame for the predictions
output_df = pd.DataFrame({
    'ID': [os.path.basename(f) for f in test_file_paths],  # Extract filenames only
    'ImageType': predicted_classes  # Predictions (0 or 1)
})

# Save the predictions to a CSV file
output_csv_path = '/kaggle/working/test_predictions.csv'
output_df.to_csv(output_csv_path, index=False)
print(f"Predictions saved to {output_csv_path}.")

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 569ms/step
Predictions saved to /kaggle/working/test_predictions.csv.
