In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
#Import Bibliotheques - Import Library
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from transformers import ViTForImageClassification
from PIL import Image
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
#Importation des fichiers csv et création du chemin de récupération des images
#Import CSV files and create recuperation path for images

folder_path = "/kaggle/input/ai-vs-human-generated-dataset"
train_csv_path = '/kaggle/input/ai-vs-human-generated-dataset/train.csv'
train_csv = pd.read_csv(train_csv_path)

def get_image_path(folder_path, image_name) :
    return os.path.join(folder_path, image_name)

#Ajout du chemin complet de l'image dans file_name
#Add complet path of the image in the colomns file_name
train_csv['file_name'] = train_csv['file_name'].apply(lambda name : get_image_path(folder_path, name))


print(train_csv.head())

   Unnamed: 0                                          file_name  label
0           0  /kaggle/input/ai-vs-human-generated-dataset/tr...      1
1           1  /kaggle/input/ai-vs-human-generated-dataset/tr...      0
2           2  /kaggle/input/ai-vs-human-generated-dataset/tr...      1
3           3  /kaggle/input/ai-vs-human-generated-dataset/tr...      0
4           4  /kaggle/input/ai-vs-human-generated-dataset/tr...      1


In [4]:
#Chargement d'un Dataset léger pour tester mon code
#Low weight dataset loding - in order to test my code before the big loading
train_test_csv = train_csv.head(1000)
train_data_set = train_test_csv

#Chargement du Dataset complet
#Full dataset loading
train_data_set = train_csv

df = pd.DataFrame(train_data_set)

#Dataset splité
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [5]:
# Définir des transformations
#Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Taille requise pour Vit
    transforms.RandomHorizontalFlip(),  # Rotation aléatoire jusqu'à 15°
    transforms.RandomRotation(15),
    transforms.ToTensor(),       # Convertir en tenseur PyTorch
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

In [6]:
#Création d'un DataLoader avec une classe
#Dataloader own class creation
class CustomDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['file_name']
        label = self.dataframe.iloc[idx]['label']

        # Charger l'image
        # load image
        image = Image.open(img_path).convert("RGB")

        # Appliquer les transformations si spécifiées
        # Apply transformation
        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.float32)

In [7]:
#Création d'une instance de DataSet customisé
#Dataset with transformation creation

#custom_dataset = CustomDataset(df, transform=transform)
#Dataset splité
train_dataset = CustomDataset(train_df, transform=transform)
val_dataset = CustomDataset(val_df, transform=transform)
test_dataset = CustomDataset(test_df, transform=transform)

In [8]:
#Création d'un Dataloader pour itération des données
#Dataloader for data iteration

#custom_loader = DataLoader(custom_dataset, batch_size=32, shuffle=True)
#Dataset splité
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [9]:
# Charger le modèle ConvNeXt pré-entraîné d'origine
model = models.convnext_base(weights="DEFAULT")
# Adapter la dernière couche pour une sortie binaire
num_features = model.classifier[2].in_features
model.classifier[2] = nn.Sequential(
    nn.Linear(num_features, 1),  # Une seule sortie pour binaire
    nn.Sigmoid()  # Activation sigmoïde pour probabilité
)



Downloading: "https://download.pytorch.org/models/convnext_base-6075fbad.pth" to /root/.cache/torch/hub/checkpoints/convnext_base-6075fbad.pth
100%|██████████| 338M/338M [00:01<00:00, 233MB/s]


In [10]:
#import os
#print(os.listdir("/kaggle/input/convnext_model_sys_v2_epoque4/pytorch/default/1/"))

In [11]:
# Charger le modèle sauvegardé
#import kagglehub
#convnext_model_sys_pytorch_default_1_path = kagglehub.model_download('/kaggle/input/convnext_model_sys/pytorch/default/1/convnext_model_SYS.pth')

#model.load_state_dict(torch.load("/kaggle/input/convnext_model_sys_v2_epoque4/pytorch/default/1/convnext_model_SYS_V2_Epoque4.pth"))
#model.train()  # Remettre le modèle en mode entraînement


In [12]:
# Déplacer le modèle sur le GPU si disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

#print(model)

In [13]:
#Définir la fonction de perte et l'optimiseur
#Define lost function and optimiser
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = torch.optim.AdamW(model.parameters(), lr=0.05)

In [14]:
#Entraînement du modèle
#Fit model
# Entraînement du modèle
for epoch in range(2):  # Nombre d'époques
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        labels = labels.view(-1, 1)  # Adapter les dimensions des labels

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass et optimisation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Évaluation sur l'ensemble de validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            labels = labels.view(-1, 1)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            predictions = (outputs > 0.5).float()
            correct += (predictions.view(-1) == labels.view(-1)).sum().item()
            total += labels.size(0)

    val_accuracy = 100 * correct / total
    print(f"Époque {epoch+1}, Perte entraînement : {running_loss/len(train_loader):.4f}, Perte validation : {val_loss/len(val_loader):.4f}, Exactitude validation : {val_accuracy:.2f}%")


Époque 1, Perte entraînement : 0.7744, Perte validation : 0.7126, Exactitude validation : 50.04%
Époque 2, Perte entraînement : 0.7410, Perte validation : 0.7294, Exactitude validation : 49.96%


In [15]:
# Évaluation du modèle
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        predictions = (outputs > 0.5).float()
        correct += (predictions.view(-1) == labels).sum().item()
        total += labels.size(0)

print(f"Exactitude : {100 * correct / total:.2f}%")

Exactitude : 50.43%


In [16]:
# Sauvegarder le modèle
torch.save(model.state_dict(), 'convnext_SYS_Ep2_lr05.pth')

In [17]:
# Charger le modèle pour l'inférence
#model.load_state_dict(torch.load('convnext_model_SYS.pth', map_location=device))
#model.eval()

**CLASSIFICATION DES IMAGES TEST NON ETIQUETEES - UNLABELED IMAGES CLASSIFICATION**

In [18]:
#Préparation des données de test final
#Final test data preparation
test_csv_path = '/kaggle/input/ai-vs-human-generated-dataset/test.csv'
test_csv = pd.read_csv(test_csv_path)
id_csv = test_csv['id']
test_csv['id'] = test_csv['id'].apply(lambda name : get_image_path(folder_path, name))

#Chargement d'un Dataset léger pour tester mon code
#Low weight dataset loding - in order to test my code before the big loading
test_test_csv = test_csv.head(50)
test_data_set = test_test_csv

#Chargement du Dataset complet
#Full dataset loading
test_data_set = test_csv

#Création du DataFrame de test
#Test Dataframe creation
df_test = pd.DataFrame(test_data_set)
print(id_csv.head())

0    test_data_v2/1a2d9fd3e21b4266aea1f66b30aed157.jpg
1    test_data_v2/ab5df8f441fe4fbf9dc9c6baae699dc7.jpg
2    test_data_v2/eb364dd2dfe34feda0e52466b7ce7956.jpg
3    test_data_v2/f76c2580e9644d85a741a42c6f6b39c0.jpg
4    test_data_v2/a16495c578b7494683805484ca27cf9f.jpg
Name: id, dtype: object


In [19]:
#Création d'un DataLoader_test avec une classe
#Dataloader own class creation
class CustomDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['id']

        # Charger l'image
        # load image
        image = Image.open(img_path).convert("RGB")

        # Appliquer les transformations si spécifiées
        # Apply transformation
        if self.transform:
            image = self.transform(image)

        return image, self.dataframe.iloc[idx]['id']


In [20]:
#Création d'une instance de DataSet customisé
#Dataset with transformation creation
custom_dataset_test = CustomDataset(df_test, transform=transform)

#Création d'un Dataloader pour itération des données
#Dataloader for data iteration
custom_loader_test = DataLoader(custom_dataset_test, batch_size=16, shuffle=True, num_workers=4)

In [21]:
# Liste pour stocker les résultats
results = []
id_counter = 0  # Initialisation du compteur d'ID

In [22]:
# Itérer sur le DataLoader et faire des prédictions
# Prédictions sur les données non étiquetées
model.eval()
with torch.no_grad():
    for i, (inputs, _) in enumerate(custom_loader_test):  # On ignore le label puisque les images sont non étiquetées
        inputs = inputs.to(device)
        outputs = model(inputs)
        predictions = (outputs > 0.5).int()  # Convertir les probabilités en 0/1

        # Ajouter chaque prédiction au tableau des résultats
        for pred in predictions.cpu().numpy():
            #results.append({'id': len(results) + 1, 'label': int(pred)})
            results.append({'id': id_csv[id_counter], 'label': int(pred.item() if hasattr(pred, 'item') else pred)})
            id_counter += 1  # Incrémenter le compteur


# Convertir en DataFrame
results_df = pd.DataFrame(results)

# Sauvegarder le DataFrame en fichier CSV
output_csv_path = "predictions.csv"
#results_df.to_csv(output_csv_path, index=False, sep=';')
results_df.to_csv(output_csv_path, index=False, sep=',')

print(f"Fichier CSV généré : {output_csv_path}")
print(results_df)


Fichier CSV généré : predictions.csv
                                                     id  label
0     test_data_v2/1a2d9fd3e21b4266aea1f66b30aed157.jpg      0
1     test_data_v2/ab5df8f441fe4fbf9dc9c6baae699dc7.jpg      0
2     test_data_v2/eb364dd2dfe34feda0e52466b7ce7956.jpg      0
3     test_data_v2/f76c2580e9644d85a741a42c6f6b39c0.jpg      0
4     test_data_v2/a16495c578b7494683805484ca27cf9f.jpg      0
...                                                 ...    ...
5535  test_data_v2/483412064ff74d9d9472d606b65976d9.jpg      0
5536  test_data_v2/c0b49ba4081a4197b422dac7c15aea7f.jpg      0
5537  test_data_v2/01454aaedec140c0a3ca1f48028c41cf.jpg      0
5538  test_data_v2/e9adfea8b67e4791968c4c2bdd8ec343.jpg      0
5539  test_data_v2/ba8f4198e8d74d3394fa56c56af23442.jpg      0

[5540 rows x 2 columns]


In [23]:
from IPython.display import FileLink
FileLink("predictions.csv")