In [None]:
# import os
# import shutil
# # Define the source directory and the destination directory
# source_directory = "/kaggle/input/all-data-big"  # Replace 'your-dataset-name' with the name of your dataset
# destination_directory = "/kaggle/working/all-data-big"  # Name of the destination directory under /kaggle/working/

#  # Create the destination directory if it does not exist
# os.makedirs(destination_directory, exist_ok=True)

#  # Helper that copies files and directories
# def copy_files(source_dir, destination_dir):
#     for item in os.listdir(source_dir):
#         source_path = os.path.join(source_dir, item)
#         destination_path = os.path.join(destination_dir, item)

#         if os.path.isdir(source_path):
#             shutil.copytree(source_path, destination_path)  # Copy directories
#         else:
#             shutil.copy2(source_path, destination_path)  # Copy files

#  # Run the copy
# copy_files(source_directory, destination_directory)

In [None]:
# Basic data manipulations
import pandas as pd
import numpy as np


# Handling images
from PIL import Image
import matplotlib.pyplot as plt

# Handling paths

import time

# Pytorch essentials
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import models
from torchvision.datasets import ImageFolder
! pip install torchsummary
import torchsummary

# Pytorch essentials for datasets.
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Pytorch way of data augmentation.
import torchvision
from torchvision import datasets, models, transforms, utils
from torchvision.transforms import v2

#import cv2
import os
from glob import glob
from tqdm import tqdm
import shutil
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix , accuracy_score, classification_report
import seaborn as sns
from pathlib import Path

In [None]:
# Shell escape: print the NVIDIA driver/GPU status for this session.
!nvidia-smi

In [None]:
train_path = '/kaggle/input/all-data-big'
label_list = ['not_smoking', 'smoking']

# Every sub-directory of the dataset root is treated as one category folder.
category_folders = [f for f in os.listdir(train_path) if os.path.isdir(os.path.join(train_path, f))]

# Collect one record per file and build the DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop copies the whole frame on
# every iteration, which is quadratic in the number of images.
records = []
for folder in category_folders:
    # The folder named 'other' is the negative class (id 0, 'not_smoking');
    # every other folder is labelled as the positive class (id 1, 'smoking').
    class_id = 0 if folder == 'other' else 1
    for img in glob(os.path.join(train_path, folder, '*')):
        records.append({"path": img, "label": label_list[class_id], "class_id": class_id})

# Passing `columns` keeps the expected schema even when no file was found.
all_df = pd.DataFrame(records, columns=["path", "label", "class_id"])
all_df = all_df.astype({"path": str, "label": str, "class_id": int})
all_df.head()

In [None]:
# Keep only rows whose path ends with 'jpg', then drop one specific image
# (presumably unreadable or otherwise unwanted -- TODO confirm why it is excluded).
all_df = (
    all_df
    .loc[lambda frame: frame['path'].str.endswith('jpg')]
    .loc[lambda frame: frame['path'] != '/kaggle/input/all-data-big/smoke/rome-actress-elizabeth-taylor-takes-a-break-in-the-dressing-room-of-the-cleopatra-set-at-the.jpg']
)

In [None]:
# Number of images left after filtering.
all_df.shape[0]

In [None]:
# all_df = all_df.iloc[:-8057]
# Shuffle all rows with a fixed seed and renumber them from zero.
all_df = all_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Hold out 15% of the rows, then cut that hold-out in half: one half for
# validation, the other for the final test set.
train_df, temp_df = train_test_split(all_df, test_size=0.15, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42)

In [None]:
# Report how many rows ended up in each split, one line per split.
print(
    f'train data:{len(train_df)}',
    f'val data:{len(val_df)}',
    f'test data:{len(test_df)}',
    sep='\n',
)

In [None]:
# Class balance of the validation split, with slanted tick labels.
sns.countplot(data=val_df, x="label")
plt.xticks(rotation=50)

In [None]:
# Preview a random handful of training images in a grid that is 5 columns wide.
show_imgs = 15
idx = np.random.randint(0, len(train_df), size=show_imgs)
fig, axes = plt.subplots(show_imgs // 5, 5, figsize=(15, 10))
axes = axes.flatten()
for ax, row_pos in zip(axes, idx):
    sample = train_df.iloc[row_pos]
    ax.imshow(plt.imread(sample['path']))
    ax.set_title(sample['label'])
    ax.set_axis_off()

In [None]:
# Training pipeline: geometric augmentation on the PIL image, then conversion
# to a float tensor and ImageNet normalisation.
train_transforms = v2.Compose([
    v2.Resize(256),
    v2.RandomResizedCrop(size=(224, 224), antialias=True),
    v2.RandomHorizontalFlip(p=0.5),
    v2.RandomVerticalFlip(p=0.5),
    # v2.RandomRotation(degrees=(-20, 20)),
    v2.RandomAffine(degrees=(-10, 10), translate=(0.1, 0.1), scale=(0.9, 1.1)),
    v2.RandomErasing(p=0.5, scale=(0.1,0.15)),
    v2.PILToTensor(),
    # scale=True rescales uint8 [0, 255] to float [0, 1]. ToDtype does NOT
    # scale by default, and the Normalize statistics below are expressed on
    # the [0, 1] range, so without it the network is fed values in the
    # hundreds instead of roughly [-2, 2].
    # NOTE: checkpoints trained with the previous (unscaled) pipeline must be
    # retrained before being evaluated with these transforms.
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Evaluation pipeline: deterministic resize, same tensor conversion and
# normalisation as training (including the [0, 1] scaling).
test_transforms = v2.Compose([
    v2.Resize((224,224)),
    v2.PILToTensor(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
])

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset backed by a DataFrame with 'path' and 'class_id' columns.

    Each item is the image stored at the row's 'path', opened as RGB and
    passed through ``transforms_``, paired with the row's 'class_id'.
    """

    def __init__(self, dataframe, transforms_):
        # The transforms handle augmentation and PIL -> tensor conversion.
        self.transforms_ = transforms_
        self.df = dataframe

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Positional lookup, so the DataFrame index does not need to be contiguous.
        row = self.df.iloc[index]
        rgb_image = Image.open(row['path']).convert("RGB")
        return self.transforms_(rgb_image), row['class_id']

BATCH_SIZE = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `device` is a torch.device object, so its `.type` must be compared:
# `device == 'cuda'` is always False and would always pick the CPU value.
num_workers = 2 if device.type == 'cuda' else 4 # fixed by kaggle notebook


train_dataset = MyDataset(train_df, train_transforms)
val_dataset = MyDataset(val_df, test_transforms)
# Actually hand num_workers to the loaders; it was computed but never used, so
# all image decoding and augmentation ran in the main process.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=num_workers)

In [None]:
# Display whether PyTorch can see a CUDA device in this session.
torch.cuda.is_available()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_size = len(label_list) # class_size = 2
# Swin Transformer V2, base variant (models.swin_v2_b), with the default pretrained weights.
model = models.swin_v2_b(weights='DEFAULT')

# Swap the classification head for one that outputs `class_size` logits.
model.head = nn.Linear(in_features=model.head.in_features, out_features=class_size)

# Shape sanity check on a random CPU batch. no_grad avoids building an
# autograd graph for a forward pass whose only purpose is to show the shape.
with torch.no_grad():
    sanity_logits = model(torch.randn((16,3,224,224)))

# train(), test() and the evaluation cell move every batch to `device`, so the
# model has to live there too; otherwise the first forward pass on a GPU
# session fails with a device mismatch.
model = model.to(device)

sanity_logits.shape

In [None]:
def train(dataloader, model, loss_fn, optimizer, lr_scheduler):
    """Train ``model`` for one pass over ``dataloader``.

    Every batch is moved to the notebook-level ``device``; the scheduler is
    stepped once, after the last batch.

    Args:
        dataloader: sized iterable yielding ``(inputs, targets)`` batches.
        model: network to optimise; it is switched to training mode.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: optimizer holding the model parameters.
        lr_scheduler: object whose ``step()`` is called once per epoch.

    Returns:
        ``(accuracy, loss)``: correct predictions divided by samples seen, and
        the sum of per-batch losses divided by the number of batches.
    """
    seen = 0
    n_batches = len(dataloader)
    model.train()
    loss_sum = 0.0
    hits = 0

    for inputs, labels in dataloader:
        # Targets are cast to int64 before being moved to the device.
        labels = labels.type(torch.LongTensor).to(device)
        inputs = inputs.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = loss_fn(logits, labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        predicted = logits.max(dim=1).indices
        hits += (predicted == labels).sum().item()
        seen += labels.shape[0]

    lr_scheduler.step()

    return hits / seen, loss_sum / n_batches

In [None]:
def test(dataloader, model, loss_fn):
    # size = len(dataloader.dataset) # number of samples
    size = 0
    num_batches = len(dataloader) # batches per epoch
    epoch_loss = 0.0
    epoch_correct = 0
    with torch.no_grad():

        model.eval()
        for (data_,target_) in dataloader:
            target_ = target_.type(torch.LongTensor)
            data_, target_ = data_.to(device), target_.to(device)

            # Forward propagation
            outputs = model(data_)

            # Computing loss
            loss = loss_fn(outputs,target_)
            # Computing statistics.
            epoch_loss = epoch_loss + loss.item()
            _,pred = torch.max(outputs,dim=1)
            epoch_correct = epoch_correct + torch.sum(pred == target_).item()
            size += target_.shape[0]
    return  epoch_correct/size, epoch_loss/num_batches

In [None]:
# NOTE(review): placeholder cell. Later cells read `logs` and load
# 'checkpoints/best.pth' and 'checkpoints/last.pth', none of which are produced
# anywhere in the visible notebook -- presumably the training loop lived here;
# confirm and restore it so the notebook survives Restart & Run All.
123

In [None]:
# Two side-by-side panels: loss curves on the left, accuracy curves on the right.
curves_fig = plt.figure(figsize=(15,5))

loss_ax = curves_fig.add_subplot(1,2,1)
loss_ax.plot(logs['train_loss'], label='Train_Loss')
loss_ax.plot(logs['val_loss'], label='Validation_Loss')
loss_ax.set_title('Train_Loss & Validation_Loss', fontsize=20)
loss_ax.legend()

acc_ax = curves_fig.add_subplot(1,2,2)
acc_ax.plot(logs['train_acc'], label='Train_Accuracy')
acc_ax.plot(logs['val_acc'], label='Validation_Accuracy')
acc_ax.set_title('Train_Accuracy & Validation_Accuracy', fontsize=20)
acc_ax.legend()

In [None]:
test_dataset = MyDataset(test_df, test_transforms)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
# map_location='cpu' lets a checkpoint written on a GPU session be read on a
# CPU-only one; load_state_dict then copies the values into the model's own
# parameters wherever they live.
model.load_state_dict(torch.load('checkpoints/best.pth', map_location='cpu'))
# Batches are moved to `device` below, so make sure the model is there as well.
model.to(device)
model.eval()
y_true, y_pred = [], []

# The loader is not shuffled, so position k in y_true / y_pred corresponds to
# row k of test_df.
with torch.no_grad():
    for (data_,target_) in tqdm(test_loader):
        target_ = target_.type(torch.LongTensor)
        data_, target_ = data_.to(device), target_.to(device)
        outputs = model(data_)
        _,pred = torch.max(outputs,dim=1)
        y_true.extend(target_.cpu().numpy())
        y_pred.extend(pred.cpu().numpy())
y_pred = np.array(y_pred)
y_true = np.array(y_true)

In [None]:
import numpy as np
import matplotlib.pyplot as plt




# Positions (shared by test_df, y_true and y_pred) of the misclassified test samples.
diff_indices = np.where(y_true != y_pred)[0]

show_imgs = len(diff_indices)

if show_imgs > 0:
    # Random display order; every misclassified sample is shown exactly once.
    indices_to_show = np.random.choice(diff_indices, size=show_imgs, replace=False)

    n_cols = 5
    # Ceiling division: floor division gave 0 rows (an error in plt.subplots)
    # for fewer than 5 mistakes and silently dropped the remainder otherwise.
    n_rows = -(-show_imgs // n_cols)
    # squeeze=False keeps `axes` a 2-D array even when there is a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), squeeze=False)
    axes = axes.flatten()
    # Hide every axis first so unused trailing cells of the grid stay blank.
    for ax in axes:
        ax.set_axis_off()
    for ax, sample_pos in zip(axes, indices_to_show):
        sample = test_df.iloc[sample_pos]
        ax.imshow(plt.imread(sample['path']))
        ax.set_title(f"true: {sample['label']}\npred: {label_list[y_pred[sample_pos]]}")
    plt.tight_layout()
    plt.show()
else:
    print("Нет различий между y_true и y_pred.")

In [None]:
ax = plt.subplot()
# Passing `labels` pins the matrix to one row/column per entry of label_list,
# so the tick labels always line up even if a class never occurs in
# y_true / y_pred (otherwise the matrix shrinks and the labels mismatch).
CM = confusion_matrix(y_true, y_pred, labels=list(range(len(label_list))))
sns.heatmap(CM, annot=True, fmt='g', ax=ax, cbar=False, cmap='RdBu_r',
            xticklabels=label_list, yticklabels=label_list)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# `labels` ties each class id to its name in label_list; without it the call
# raises when a class is missing from y_true / y_pred, because the number of
# observed classes no longer matches len(target_names).
clf_report = classification_report(
    y_true,
    y_pred,
    labels=list(range(len(label_list))),
    target_names=label_list,
)
print(clf_report)

In [None]:
# Re-export the best checkpoint under a descriptive name in the Kaggle output
# directory. map_location='cpu' keeps the load working on a CPU-only session
# even if the checkpoint was written from a GPU.
model.load_state_dict(torch.load('checkpoints/best.pth', map_location='cpu'))
model.eval()
torch.save(model.state_dict(),'/kaggle/working/best_swin_v2_b.pth')

In [None]:
# Re-export the last-epoch checkpoint under a descriptive name in the Kaggle
# output directory. map_location='cpu' keeps the load working on a CPU-only
# session even if the checkpoint was written from a GPU.
# Note: after this cell `model` holds the last-epoch weights, not the best ones.
model.load_state_dict(torch.load('checkpoints/last.pth', map_location='cpu'))
model.eval()
torch.save(model.state_dict(),'/kaggle/working/last_swin_v2_b.pth')