In [17]:
import os
import pandas as pd
import numpy as np
import cv2
from PIL import Image, UnidentifiedImageError
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import Precision, Recall


fake_path = r"C:\Users\aryes\OneDrive\Desktop\vista-25\dataset\train\fake"
real_path = r"C:\Users\aryes\OneDrive\Desktop\vista-25\dataset\train\real"

In [18]:

# def add_prefix_to_images(folder_path, prefix):
#     if not os.path.exists(folder_path):
#         print(f"Folder not found: {folder_path}")
#         return
    
#     for filename in os.listdir(folder_path):
#         file_path = os.path.join(folder_path, filename)
        
#         if os.path.isfile(file_path):  # Ensure it's a file
#             new_filename = prefix + filename
#             new_file_path = os.path.join(folder_path, new_filename)
            
#             os.rename(file_path, new_file_path)
#             print(f"Renamed: {filename} -> {new_filename}")

# add_prefix_to_images(fake_path, "fake_")
# add_prefix_to_images(real_path, "real_"

In [19]:
import random

# def create_csv_with_labels(fake_path, real_path, output_csv):
#     data = []
    
#     for filename in os.listdir(fake_path):
#         if os.path.isfile(os.path.join(fake_path, filename)):
#             data.append([filename, "fake"])
    
#     for filename in os.listdir(real_path):
#         if os.path.isfile(os.path.join(real_path, filename)):
#             data.append([filename, "real"])
    
#     random.shuffle(data)  # Shuffle the rows
#     df = pd.DataFrame(data, columns=["filename", "label"])
#     df.to_csv(output_csv, index=False)
#     print(f"CSV file created: {output_csv}")

# output_csv = r"C:\Users\aryes\OneDrive\Desktop\vista-25\dataset\data_labels.csv"

# create_csv_with_labels(fake_path, real_path, output_csv)

In [20]:
img_dir = r"C:\Users\aryes\OneDrive\Desktop\vista-25\dataset\train"

In [None]:
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv(r"C:\Users\aryes\OneDrive\Desktop\vista-25\dataset\data_labels.csv")

encoder = LabelEncoder()
data['label'] = encoder.fit_transform(data['label'])

data.head()

In [None]:
from PIL import Image, UnidentifiedImageError
import os
import numpy as np

def load_and_preprocess_images(data, img_size=(128, 128), max_size=5e6):
    images, labels, valid_filenames = [], [], []
    
    for _, row in data.iterrows():
        img_path = row['file_path'] 
        try:
            # if os.path.getsize(img_path) > max_size:
            #     print(f"Skipping large image: {img_path}")
            #     continue
            
            # Load image
            img = Image.open(img_path)

            if img.mode == "P":
                img = img.convert("RGBA")

            img = img.convert("RGB").resize(img_size)
            img = np.array(img) / 255.0  
            images.append(img)
            labels.append(row['label'])
            valid_filenames.append(img_path)

        except (UnidentifiedImageError, OSError, ValueError) as e:
            print(f"Skipping corrupted image: {img_path} due to error: {e}")
            continue
    
    return np.array(images), np.array(labels), valid_filenames


img_dir = r"C:\Users\aryes\OneDrive\Desktop\vista-25\dataset\train"
X, y, valid_filenames = load_and_preprocess_images(data)

In [23]:
csv_path=r"C:\Users\aryes\OneDrive\Desktop\vista-25\dataset\data_labels_cleaned.csv"
data = data[data['file_path'].isin(valid_filenames)]
data.to_csv(csv_path, index=False)


In [24]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import tensorflow as tf
import pandas as pd
import cv2
import numpy as np
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

csv_path = r"C:\Users\aryes\OneDrive\Desktop\vista-25\dataset\data_labels_cleaned.csv"
df = pd.read_csv(csv_path)

df['file_path'] = df['file_path'].astype(str) 
df = df[df['file_path'].apply(os.path.exists)] 

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

def preprocess_image(image_path, label):
    image_path = image_path.decode('utf-8')  

    if not os.path.exists(image_path):  
        print(f"Skipping missing image: {image_path}")
        return np.zeros((128, 128, 3), dtype=np.float32), np.float32(0)  

    image = cv2.imread(image_path)
    if image is None:
        print(f"Skipping unreadable image: {image_path}")
        return np.zeros((128, 128, 3), dtype=np.float32), np.float32(0)  

    image = cv2.resize(image, (128, 128)) 
    image = image.astype(np.float32) / 255.0  
    return image, np.array(label, dtype=np.float32)


def create_tf_dataset(df, batch_size=32):
    image_paths = df['file_path'].values
    labels = df['label'].values.astype(np.float32)

    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))

    def process_path(path, label):
        image, label = tf.numpy_function(
            func=preprocess_image, inp=[path, label], Tout=(tf.float32, tf.float32)
        )
        image.set_shape((128, 128, 3))
        label.set_shape(())
        return image, label

    dataset = dataset.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

train_dataset = create_tf_dataset(train_df)
val_dataset = create_tf_dataset(val_df)

model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(128, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid') 
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(train_dataset, validation_data=val_dataset, epochs=50, callbacks=[early_stopping])

model.save(r"C:\Users\aryes\OneDrive\Desktop\vista-25\saved_model.h5")


In [34]:
import tensorflow as tf
import cv2
import numpy as np
import os
import pandas as pd

model = tf.keras.models.load_model(r"C:\Users\aryes\OneDrive\Desktop\vista-25\saved_model.h5")

def preprocess_image(image_path):
    image = cv2.imread(image_path)
    if image is None:
        print(f"Skipping unreadable image: {image_path}")
        return None
    
    image = cv2.resize(image, (128, 128))
    image = image.astype(np.float32) / 255.0
    return image

def test_and_save_results(test_dir, model, output_csv_path, batch_size=32):
    image_paths = [os.path.join(test_dir, fname) for fname in os.listdir(test_dir) if fname.endswith(('.jpg', '.png'))]
    
    results = []
    
    for image_path in image_paths:
        image = preprocess_image(image_path)
        if image is None:
            continue

        image = np.expand_dims(image, axis=0)
        
        prediction = model.predict(image)
        predicted_label = 1 if prediction > 0.5 else 0
        predicted_label_str = 'Fake' if predicted_label == 1 else 'Real'
        
        results.append({
            'image_path': image_path,
            'prediction': predicted_label_str,
            'probability': prediction[0][0]
        })

    results_df = pd.DataFrame(results)

    results_df.to_csv(output_csv_path, index=False)
    print(f"Test results saved to {output_csv_path}")

test_dir = r"C:\Users\aryes\OneDrive\Desktop\vista-25\dataset\test"
output_csv_path = r"C:\Users\aryes\OneDrive\Desktop\vista-25\dataset\test_results.csv"
test_and_save_results(test_dir, model, output_csv_path)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67

In [40]:
import pandas as pd
import os

# Read the original CSV file
df = pd.read_csv(r'C:\Users\aryes\OneDrive\Desktop\vista-25\dataset\test_results.csv')

# Extract image number from the full path
df['image_id'] = df['image_path'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])

# Convert prediction to binary label (0 for Fake, 1 for Real)
df['label'] = (df['prediction'] == 'Real').astype(int)

# Create the new submission DataFrame with only image_id and label
submission_df = df[['image_id', 'label']]

# Sort the DataFrame by image_id
submission_df = submission_df.sort_values('image_id')
submission_df = df[['image_id', 'label']].sort_values('image_id')
# Save the new submission CSV
submission_df.to_csv('new_submission.csv', index=False)

print("New submission CSV created successfully!")
print("\nTotal number of images:", len(submission_df))
print("\nLabel distribution:")
print(submission_df['label'].value_counts())

# Display the first few rows to verify
print("\nFirst few rows of the new submission file:")
submission_df.head()


New submission CSV created successfully!

Total number of images: 11999

Label distribution:
label
0    6470
1    5529
Name: count, dtype: int64

First few rows of the new submission file:


Unnamed: 0,image_id,label
0,0,1
1,1,0
2,10,0
3,100,1
4,1000,1


In [41]:
import pandas as pd
import os

# Load the original CSV
df = pd.read_csv(r"C:\Users\aryes\OneDrive\Desktop\vista-25\dataset\test_results.csv")

# Extract numeric image ID from the file name (e.g., "100.jpg" -> 100)
df["image_id"] = df["image_path"].apply(lambda x: int(os.path.basename(x).split(".")[0]))

# Rename "probability" to "confidence"
df.rename(columns={"probability": "confidence"}, inplace=True)

# Create a new DataFrame with the desired columns
new_df = df[["image_id", "prediction", "confidence"]]

# Sort by image_id in ascending order
new_df = new_df.sort_values("image_id")

# Save to a new CSV file
new_df.to_csv("image_prediction_confidence.csv", index=False)

print("New CSV file created: image_prediction_confidence.csv")


New CSV file created: image_prediction_confidence.csv


New CSV file created: image_prediction_confidence_mobilenetv2.csv
