In [32]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.utils import to_categorical

In [33]:
# Шлях до основної папки, в якій знаходяться підпапки з файлами
data = pd.read_csv("dataset.csv")

In [34]:
# Розділ даних на навчальний, валідаційний та тестовий набори
train_data = data[data['image_path'].str.contains('train')]
valid_data = data[data['image_path'].str.contains('valid')]
test_data = data[data['image_path'].str.contains('test')]

In [35]:
# Параметри зображень
img_height, img_width = 128, 128
batch_size = 8


In [73]:
# Ініціалізація токенізатора для міток
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(data['label'])

# Кодування міток
train_encoded_labels = tokenizer.texts_to_sequences(train_data['label'])
valid_encoded_labels = tokenizer.texts_to_sequences(valid_data['label'])
test_encoded_labels = tokenizer.texts_to_sequences(test_data['label'])

# Визначення максимальної довжини послідовності
max_length = max(
    max(len(seq) for seq in train_encoded_labels),
    max(len(seq) for seq in valid_encoded_labels),
    max(len(seq) for seq in test_encoded_labels)
)

# Додавання фіксованої довжини послідовностей
train_encoded_labels = pad_sequences(train_encoded_labels, maxlen=max_length, padding='post')
valid_encoded_labels = pad_sequences(valid_encoded_labels, maxlen=max_length, padding='post')
test_encoded_labels = pad_sequences(test_encoded_labels, maxlen=max_length, padding='post')

# Перетворення в one-hot формат
num_classes = len(tokenizer.word_index) + 1  # Додаємо 1 для врахування нульового індексу
train_encoded_labels_one_hot = to_categorical(train_encoded_labels, num_classes=num_classes)
valid_encoded_labels_one_hot = to_categorical(valid_encoded_labels, num_classes=num_classes)
test_encoded_labels_one_hot = to_categorical(test_encoded_labels, num_classes=num_classes)

# Перевірка розмірів
print(f"Train data shape: {train_data.shape}")
print(f"Train encoded labels one-hot shape: {train_encoded_labels_one_hot.shape}")

print(f"Valid data shape: {valid_data.shape}")
print(f"Valid encoded labels one-hot shape: {valid_encoded_labels_one_hot.shape}")

print(f"Test data shape: {test_data.shape}")
print(f"Test encoded labels one-hot shape: {test_encoded_labels_one_hot.shape}")

def convert_to_one_hot(labels, num_classes):
    # Для кожного зразка перетворюємо у one-hot формат
    return [label for label in labels]

train_data['encoded_label'] = list(train_encoded_labels_one_hot)
valid_data['encoded_label'] = list(valid_encoded_labels_one_hot)
test_data['encoded_label'] = list(test_encoded_labels_one_hot)

# print(train_data['encoded_label'].apply(type).value_counts())
# print(train_data['encoded_label'].apply(type).unique())

# print(test_data['encoded_label'].apply(type).value_counts())
# print(test_data['encoded_label'].apply(type).unique())

# print(valid_data['encoded_label'].apply(type).value_counts())
# print(valid_data['encoded_label'].apply(type).unique())

def check_encoded_labels(data):
    for i in range(len(data)):
        print(f"Sample {i} encoded label shape:", np.array(data['encoded_label'].iloc[i]).shape)

check_encoded_labels(test_data)



Train data shape: (33, 3)
Train encoded labels one-hot shape: (33, 8, 28)
Valid data shape: (8, 3)
Valid encoded labels one-hot shape: (8, 8, 28)
Test data shape: (6, 3)
Test encoded labels one-hot shape: (6, 8, 28)
encoded_label
<class 'numpy.ndarray'>    33
Name: count, dtype: int64
[<class 'numpy.ndarray'>]
encoded_label
<class 'numpy.ndarray'>    6
Name: count, dtype: int64
[<class 'numpy.ndarray'>]
encoded_label
<class 'numpy.ndarray'>    8
Name: count, dtype: int64
[<class 'numpy.ndarray'>]
Sample 0 encoded label shape: (8, 28)
Sample 1 encoded label shape: (8, 28)
Sample 2 encoded label shape: (8, 28)
Sample 3 encoded label shape: (8, 28)
Sample 4 encoded label shape: (8, 28)
Sample 5 encoded label shape: (8, 28)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['encoded_label'] = list(train_encoded_labels_one_hot)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_data['encoded_label'] = list(valid_encoded_labels_one_hot)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['encoded_label'] = list(test_encoded_labels_one_hot)


In [85]:
def create_data_generators(train_data, valid_data, batch_size, img_height, img_width):
    train_datagen = ImageDataGenerator(
        rescale=1.0/255,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )

    test_datagen = ImageDataGenerator(rescale=1.0/255)

    train_generator = train_datagen.flow_from_dataframe(
        dataframe=train_data,
        x_col='image_path',
        y_col='encoded_label',
        target_size=(img_height, img_width),
        batch_size=batch_size,
        class_mode=None,  # 'raw' для one-hot міток
        shuffle=False
    )

    validation_generator = test_datagen.flow_from_dataframe(
        dataframe=valid_data,
        x_col='image_path',
        y_col='encoded_label',
        target_size=(img_height, img_width),
        batch_size=batch_size,
        class_mode=None,  # 'raw' для one-hot міток
        shuffle=False
    )
    
    return train_generator, validation_generator
    
train_generator, validation_generator = create_data_generators(train_data, valid_data, batch_size, img_height, img_width)
# # Отримання одного батчу з генератора
# x_batch, y_batch = next(train_generator)

# # Перевірка типів даних та форм
# print("x_batch type:", type(x_batch))
# print("x_batch shape:", x_batch.shape)
# print("y_batch type:", type(y_batch))
# print("y_batch shape:", y_batch.shape)

# Кількість кроків на етап
steps_per_epoch = train_generator.samples // batch_size
validation_steps = validation_generator.samples // batch_size

# print(f"Steps per epoch: {steps_per_epoch}")
# print(f"Validation steps: {validation_steps}")

# # Перевірка одного з елементів закодованих міток
# print("Encoded label example (one-hot):", train_data['encoded_label'].iloc[0])
# # Перевірка типів даних та форм
# print("Train data encoded labels type:", type(train_data['encoded_label'].iloc[0]))
# print("Train data encoded labels shape:", np.array(train_data['encoded_label'].tolist()).shape)


Found 33 validated image filenames.
Found 8 validated image filenames.


In [87]:
# Основна архітектура CNN
model = Sequential([
    Input(shape=(img_height, img_width, 3)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  # Вихідний шар відповідно до кількості класів
])

# Компіляція моделі
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Перевірка архітектури моделі
model.summary()

In [88]:
history = model.fit(train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    validation_data=validation_generator,
    validation_steps=validation_steps
)

Epoch 1/10


ValueError: None values not supported.