# __Name__

Description & tasks

In [2]:
import itertools
import math 

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.activations import mish, relu, sigmoid, softmax
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.utils import to_categorical
from keras.applications.vgg16 import VGG16
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.image import ImageDataGenerator
from keras import layers
from keras import losses
from keras import metrics
from keras import models
from keras import optimizers
from sklearn.model_selection import train_test_split


2023-08-15 10:00:32.481837: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-15 10:00:32.504008: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-15 10:00:32.671606: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-15 10:00:32.672809: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


!!!! set randome

### __`OBTAIN` & `SCRUB`__ + __`EXPLORE`__ (DATASET)

##### Loading `the prepared data:`

- https://www.tensorflow.org/api_docs/python/tf/keras/datasets/fashion_mnist/load_data

In [3]:
(images_train, labels_train), (images_test, labels_test) = fashion_mnist.load_data()
# images_train.dtype, images_train[0].shape
NUM_FEATURES = images_train[0].shape[0] * images_train[0].shape[1]
# NUM_FEATURES
images_train, images_test = np.array(images_train, np.float32), np.array(images_test, np.float32)  # .astype('float32')
# print(min(images_train.min(), images_test.min()))  # 0  255
maximum = max(images_train.max(), images_test.max()) # np.max([images_train.max(), images_test.max()]) 
# Normalize (min = 0):
images_train, images_test = images_train / maximum, images_test / maximum
# images_train.shape, images_test.shape, labels_train.shape, labels_test.shape
max(labels_train.max(), labels_test.max()), min(labels_train.min(), labels_test.min())  # 0  9
NUM_CLASSES = len(set(np.concatenate((labels_train, labels_test), axis=0)))  # 10
# (10000, 10) into ((5000, 10), (5000, 10)):
images_valid, images_test, labels_valid, labels_test = train_test_split(
                                                                        images_test, 
                                                                        labels_test, 
                                                                        test_size=0.5,  # 50%
                                                                        shuffle=True, 
                                                                        stratify=labels_test
                                                                        )  # (stratify для рівномірного розподілу за значенням y)
labels_train = tf.one_hot(labels_train, depth=NUM_CLASSES).numpy()  # 5 into [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
labels_test = tf.one_hot(labels_test, depth=NUM_CLASSES).numpy()
labels_valid = tf.one_hot(labels_valid, depth=NUM_CLASSES).numpy()
# labels_train.shape, labels_test.shape, labels_valid.shape  # ((60000, 10), (5000, 10), (5000, 10))
images_train.shape, images_test.shape, images_valid.shape, labels_train.shape, labels_test.shape, labels_valid.shape

((60000, 28, 28),
 (5000, 28, 28),
 (5000, 28, 28),
 (60000, 10),
 (5000, 10),
 (5000, 10))

##### Loading by Dataset `from 'RAW-files':`

In [None]:
import os


PATH_IMGS = os.path.join(os.getcwd(), 'CIFAR-10-images-master', 'train')

ids = []  # full image paths
labels = []  # true mark 'images name-description' by label2index:
# label2index = {kind:num for num, kind in enumerate(os.listdir(PATH_IMGS))}
label2index = {}

for num, folder in enumerate(os.listdir(PATH_IMGS)):
    label2index[folder] = num
    for image_name in os.listdir(os.path.join(PATH_IMGS, folder))[:500]:  # limit 500 for each - for example
        ids.append(os.path.join(PATH_IMGS, folder, image_name))
        labels.append(label2index[folder])

NUM_CLASSES = num + 1
NUM_CLASSES

In [None]:
ids_train, ids_valid, y_train, y_valid = train_test_split(ids, labels, test_size=0.2, shuffle=True)

- https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence
- https://numpy.org/doc/stable/reference/random/generated/numpy.random.shuffle.html
- https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical

- https://docs.opencv.org/3.4/d4/da8/group__imgcodecs.html
- https://www.geeksforgeeks.org/python-opencv-cv2-imread-method/

In [None]:
class Dataset(tf.keras.utils.Sequence):
    def __init__(self, ids: list, y: list, shuffle: bool=True, batch_size: int=256) -> None:
        self.ids = ids  # list of imgs path
        self.y = y
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.ids))
        self.batch_size = batch_size
        self.num_features = self.num_features if self.__getitem__(0) else 0
        
        if shuffle:
            self.on_epoch_end()
    
    def __len__(self) -> int:
        return len(self.ids) // self.batch_size
    
    def __getitem__(self, idx: int) -> tuple:  # get one batch
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        batch_ids = [self.ids[i] for i in indexes]
        batch_y = [tf.keras.utils.to_categorical(self.y[i], 10) for i in indexes]
        
        batch_X = []
        for i in range(self.batch_size):
            img = cv2.imread(batch_ids[i])  # i/o disk operation! but low RAM uses
            img = np.mean(img, axis=-1)  # RGB to grayscale

            batch_X.append(img.reshape(len(img[0]) * len(img)))  # into a one-dimensional vector
        
        self.num_features = len(img[0]) * len(img) # !!
        
        return np.array(batch_X), np.array(batch_y)
        
    def on_epoch_end(self) -> None:
        if self.shuffle:
            np.random.shuffle(self.indexes)

In [None]:
train_dataset = Dataset(ids_train, y_train, shuffle=True, batch_size=256)
# train_dataset[0]
NUM_FEATURES = train_dataset.num_features
valid_dataset = Dataset(ids_valid, y_valid, shuffle=False, batch_size=256)
NUM_FEATURES

- https://www.tutorialkart.com/opencv/python/opencv-python-resize-image/#gsc.tab=0
- https://stackoverflow.com/questions/64276472/valueerror-the-input-must-have-3-channels-got-input-shape-200-200-1

In [None]:
class Dataset(tf.keras.utils.Sequence):
    def __init__(self, X, y, shuffle=True, batch_size=256, to_size=32):
        self.X = X  # (60000, 28, 28)
        self.y = y  # (60000, 10)
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.X))
        self.batch_size = batch_size
        self.to_size = to_size
        
        if shuffle:
            self.on_epoch_end()
    
    def __len__(self):
        return math.ceil(self.y.shape[0] / self.batch_size)
    
    def __getitem__(self, idx):  # idx = batche's numder
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]

        batch_X_row = [self.X[i] for i in indexes]
        batch_y = [self.y[i] for i in indexes]

        # cv2.cvtColor(grayscale_image, cv2.COLOR_GRAY2RGB)
        batch_X = [cv2.cvtColor(cv2.resize(img, (self.to_size, self.to_size), interpolation=cv2.INTER_NEAREST), cv2.COLOR_GRAY2RGB) for img in batch_X_row]

        return np.array(batch_X), np.array(batch_y)
        
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indexes)

##### Loading `from prepared 'RAW-files'` into `ImageDataGenerator`:

In [4]:
# kaggle competitions download -c dogs-vs-cats

- https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.joinpath
- https://docs.python.org/uk/3/library/shutil.html

In [20]:
from pathlib import Path
import shutil
import sys


train_size = 1000  # % of num-files...
validation_size = 500  # next
test_size = 500  # next
src, dst = '/media/denys/ftb/sprojects/trainCatsDogs/train/', '/media/denys/ftb/sprojects/datas/helps/data/'


def create_dataset_directories(base_dir: str, categories: set) -> None:
    print('Start creation of dataset directories...')

    Path(base_dir).mkdir(parents=True, exist_ok=True)

    directories = ['train', 'validation', 'test']
    for directory in directories:
        sub_dir = Path(base_dir).joinpath(directory)
        Path(sub_dir).mkdir(parents=True, exist_ok=True)
        [Path(sub_dir.joinpath(c)).mkdir(parents=True, exist_ok=True) for c in categories]

    print('Done preparing directories.')


def copy_data(
              src: str, 
              dst: str, 
              example_name: str, 
              ext: str,
              start: int, 
              end: int
              ) -> None:
    fnames = [f'{example_name}.{i}.{ext}' for i in range(start, end)]
    print(f'Copying... into {dst}...')
    for fname in fnames:
        # print(f'Copying {fname} to {dst}')
        shutil.copyfile(
                        Path(src).joinpath(fname),
                        Path(dst).joinpath(fname)
                        )


def identify_categories(src_dir: str) -> set:
    all_files = [el.stem.split('.')[0] for el in Path(src_dir).iterdir() if el.is_file()]

    return set(all_files)


if Path(src).is_file() or Path(dst).is_file():
    print('Incorrect paths, there must be a folders.')

else:
    # images in src - all in one where filenames start with label (category)
    categories: set = identify_categories(src)
    create_dataset_directories(dst, categories)

    # if same extensions of images:
    ext: str = {0:el.suffix[1:] for el in Path(src).iterdir() if el.is_file()}.get(0, '')

    for category in categories:   # ! // \\ linux windows
        copy_data(src, f'{dst}/train/{category}/', category, ext, 0, train_size)
        copy_data(src, f'{dst}/validation/{category}/', category, ext, train_size, train_size + validation_size)
        copy_data(src, f'{dst}/test/{category}/', category, ext, train_size + validation_size, train_size + validation_size + test_size)


Start creation of dataset directories...
Done preparing directories.
Copying... into /media/denys/ftb/sprojects/datas/helps/data//train/cat/...
Copying... into /media/denys/ftb/sprojects/datas/helps/data//validation/cat/...
Copying... into /media/denys/ftb/sprojects/datas/helps/data//test/cat/...
Copying... into /media/denys/ftb/sprojects/datas/helps/data//train/dog/...
Copying... into /media/denys/ftb/sprojects/datas/helps/data//validation/dog/...
Copying... into /media/denys/ftb/sprojects/datas/helps/data//test/dog/...


###### __`main.py`__

In [8]:
from pathlib import Path
import shutil
import sys


def create_dataset_directories(base_dir: str, categories: set) -> None:
    print('Start creation of dataset directories...')

    Path(base_dir).mkdir(parents=True, exist_ok=True)

    directories = ['train', 'validation', 'test']
    for directory in directories:
        sub_dir = Path(base_dir).joinpath(directory)
        Path(sub_dir).mkdir(parents=True, exist_ok=True)
        [Path(sub_dir.joinpath(c)).mkdir(parents=True, exist_ok=True) for c in categories]

    print('Done preparing directories.')


def copy_data(
              src: str, 
              dst: str, 
              example_name: str, 
              ext: str,
              start: int, 
              end: int
              ) -> None:
    fnames = [f'{example_name}.{i}.{ext}' for i in range(start, end)]
    print(f'Copying... into {dst}...')
    for fname in fnames:
        # print(f'Copying {fname} to {dst}')
        shutil.copyfile(
                        Path(src).joinpath(fname),
                        Path(dst).joinpath(fname)
                        )


def identify_categories(src_dir: str) -> set:
    all_files = [el.stem.split('.')[0] for el in Path(src_dir).iterdir() if el.is_file()]

    return set(all_files)
    

def main(argv=None) -> set:
    train_size = 1000  # % of num-files...
    validation_size = 500  # next
    test_size = 500  # next

    if argv:
        if len(argv) != 3:
            print('Incomplete arguments, there must be 2 paths: resource and destination.')
            sys.exit(-1)

        src, dst = argv[1], argv[2]

    else:
        src, dst = '/media/denys/ftb/sprojects/trainCatsDogs/train/', '/media/denys/ftb/sprojects/datas/helps/data/'

    if Path(src).is_file() or Path(dst).is_file():
        print('Incorrect paths, there must be a folders.')
        sys.exit(-1)

    # images in src - all in one where filenames start with label (category)
    categories = identify_categories(src)
    create_dataset_directories(dst, categories)

    # if same extensions of images:
    ext = {0:el.suffix[1:] for el in Path(src).iterdir() if el.is_file()}.get(0, '')

    for category in categories:   # ! // \\ linux windows
        copy_data(src, f'{dst}/train/{category}/', category, ext, 0, train_size)
        copy_data(src, f'{dst}/validation/{category}/', category, ext, train_size, train_size + validation_size)
        copy_data(src, f'{dst}/test/{category}/', category, ext, train_size + validation_size, train_size + validation_size + test_size)

    return categories

if __name__ == '__main__':
    # main(sys.argv)

    categories = main()

Start creation of dataset directories...
Done preparing directories.
Copying... into /media/denys/ftb/sprojects/datas/helps/data//train/cat/...
Copying... into /media/denys/ftb/sprojects/datas/helps/data//validation/cat/...
Copying... into /media/denys/ftb/sprojects/datas/helps/data//test/cat/...
Copying... into /media/denys/ftb/sprojects/datas/helps/data//train/dog/...
Copying... into /media/denys/ftb/sprojects/datas/helps/data//validation/dog/...
Copying... into /media/denys/ftb/sprojects/datas/helps/data//test/dog/...


###### __ImageDataGenerator__

- https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator

In [10]:
from keras.preprocessing.image import ImageDataGenerator


train_datagen = ImageDataGenerator(rescale=1./255)
validation_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

target_size = (150, 150)
batch_size = 20
# one of 'binary', 'categorical', 'input', 'multi_output', 'raw', 'sparse' or None. Default: 'categorical':
class_mode = 'binary' if len(categories) == 2 else 'categorical'

train_dir = '/media/denys/ftb/sprojects/datas/helps/data/train/'
validation_dir = '/media/denys/ftb/sprojects/datas/helps/data/validation/'
test_dir = '/media/denys/ftb/sprojects/datas/helps/data/test/'

train_generator = train_datagen.flow_from_directory(
                                                    train_dir,
                                                    target_size=target_size,
                                                    batch_size=batch_size,
                                                    class_mode=class_mode
                                                    )

validation_generator = validation_datagen.flow_from_directory(
                                                        validation_dir,
                                                        target_size=target_size,
                                                        batch_size=batch_size,  # 1 !
                                                        class_mode=class_mode
                                                        )

test_generator = test_datagen.flow_from_directory(
                                                  test_dir,
                                                  target_size=target_size,
                                                  batch_size=1,
                                                  class_mode=class_mode
                                                  )

Found 2000 images belonging to 2 classes.
Found 1000 images belonging to 2 classes.
Found 1000 images belonging to 2 classes.


In [11]:
train_generator, validation_generator, test_generator

(<keras.src.preprocessing.image.DirectoryIterator at 0x7f63706abc10>,
 <keras.src.preprocessing.image.DirectoryIterator at 0x7f63706abb80>,
 <keras.src.preprocessing.image.DirectoryIterator at 0x7f6370766290>)

###### save to file

In [25]:
generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
                                                                   train_dir,
                                                                   target_size=target_size,
                                                                   batch_size=1,
                                                                   class_mode=class_mode
                                                                   )

Found 2000 images belonging to 2 classes.


- https://pandas.pydata.org/docs/user_guide/merging.html

In [51]:
data_df = pd.DataFrame()
for i, (images, y_batch) in enumerate(generator):
    new_line = pd.DataFrame([[y_batch] + [images[0][a][b][c] for a in range(images[0].shape[0]) for b in range(images[0].shape[1]) for c in range(images[0].shape[2])]], index=[i])
    data_df = pd.concat([data_df, new_line])
    if i == 2:
        break
       
# pd.Dataframe.to_csv('data.csv', index=False)
data_df 

[0.]
[0.]
[1.]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,67491,67492,67493,67494,67495,67496,67497,67498,67499,67500
0,[0.0],0.784314,0.784314,0.737255,0.784314,0.776471,0.729412,0.780392,0.764706,0.721569,...,0.031373,0.027451,0.035294,0.031373,0.027451,0.035294,0.031373,0.027451,0.035294,0.031373
1,[0.0],0.384314,0.372549,0.305882,0.368627,0.356863,0.290196,0.333333,0.321569,0.254902,...,0.807843,0.823529,0.823529,0.815686,0.890196,0.882353,0.894118,0.945098,0.937255,0.94902
2,[1.0],0.858824,0.843137,0.847059,0.870588,0.847059,0.854902,0.870588,0.847059,0.847059,...,0.407843,0.482353,0.490196,0.403922,0.482353,0.490196,0.403922,0.478431,0.486275,0.4


In [47]:
data_df.to_csv('/media/denys/ftb/sprojects/datas/helps/data/data.csv', index=False)

##### Loading `from files:`

In [None]:
from typing import Optional


def read_from_csvfile(file: Path, header: Optional[str]='infer') -> pd.DataFrame:
    """Read content from csv-file and return dataframe from content."""
    df = pd.read_csv(file, header=header)
    
    return df

In [50]:
# read_from_csvfile(item, None)

# 'https://drive.google.com/u/0/uc?id=1JMYqXipZpz9Y5-vyxvLEO2Y1sRBxqu-U&export=download'
# data = pd.read_csv('/media/denys/ftb/sprojects/datas/helps/data/sampleSubmission.csv')
data = pd.read_csv('/media/denys/ftb/sprojects/datas/helps/data/data.csv')
print(data.head(3), data.tail(3))

      0         1         2         3         4         5         6         7  \
0  [0.]  0.952941  0.949020  0.831373  0.964706  0.960784  0.850980  0.956863   
1  [1.]  0.352941  0.321569  0.313725  0.352941  0.321569  0.313725  0.356863   
2  [0.]  0.882353  0.850980  0.807843  0.886275  0.854902  0.811765  0.909804   

          8         9  ...     67491     67492     67493     67494     67495  \
0  0.949020  0.850980  ...  0.396078  0.800000  0.666667  0.388235  0.788235   
1  0.325490  0.317647  ...  0.203922  0.152941  0.184314  0.196078  0.113725   
2  0.870588  0.835294  ...  0.403922  0.207843  0.313725  0.396078  0.231373   

      67496     67497     67498     67499     67500  
0  0.654902  0.376471  0.772549  0.639216  0.360784  
1  0.145098  0.156863  0.164706  0.196078  0.207843  
2  0.333333  0.423529  0.223529  0.317647  0.419608  

[3 rows x 67501 columns]       0         1         2         3         4         5         6         7  \
0  [0.]  0.952941  0.949020  0.

In [61]:
y = data['0'].to_numpy()
# за замовчуванням axis=0, що означає роботу з рядками. Якщо вказати axis=1, то це дозволить видаляти стовпці:
data.drop(['0'], inplace=True, axis=1) 
data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,67491,67492,67493,67494,67495,67496,67497,67498,67499,67500
0,0.952941,0.94902,0.831373,0.964706,0.960784,0.85098,0.956863,0.94902,0.85098,0.960784,...,0.396078,0.8,0.666667,0.388235,0.788235,0.654902,0.376471,0.772549,0.639216,0.360784
1,0.352941,0.321569,0.313725,0.352941,0.321569,0.313725,0.356863,0.32549,0.317647,0.356863,...,0.203922,0.152941,0.184314,0.196078,0.113725,0.145098,0.156863,0.164706,0.196078,0.207843
2,0.882353,0.85098,0.807843,0.886275,0.854902,0.811765,0.909804,0.870588,0.835294,0.917647,...,0.403922,0.207843,0.313725,0.396078,0.231373,0.333333,0.423529,0.223529,0.317647,0.419608


##### Loading `from Link:`

In [13]:
p1t1 = pd.read_html(
                    io='https://uk.wikipedia.org/wiki/%D0%9D%D0%B0%D1%81%D0%B5%D0%BB%D0%B5%D0%BD%D0%BD%D1%8F_%D0%A3%D0%BA%D1%80%D0%B0%D1%97%D0%BD%D0%B8',
                    match='Коефіцієнт народжуваності в регіонах',
                    )
p1t1

[               Регіон   1950   1960   1970  1990  2000  2012 2014 2019
 0                Крим  230.0  206.0  160.0   130    73   126    —    —
 1           Вінницька  224.0  192.0  142.0   124    84   112  109   76
 2           Волинська  247.0  250.0  179.0   153   112   148  141  101
 3    Дніпропетровська  204.0  204.0  151.0   123    71   112  111   71
 4            Донецька  271.0  214.0  140.0   109    61    98   82    —
 5         Житомирська  261.0  223.0  159.0   129    89   122  120   79
 6        Закарпатська  314.0  273.0  207.0   168   115   151  146  104
 7          Запорізька  219.0  197.0  150.0   124    71   106  106   68
 8   Івано-Франківська  243.0  248.0  182.0   155   103   124  122   88
 9            Київська  204.0  189.0  156.0   123    73   122  121   80
 10     Кіровоградська  216.0  171.0  145.0   126    79   110  108   68
 11          Луганська  262.0  235.0  144.0   116    62    96   51    —
 12          Львівська  234.0  240.0  171.0   140    91   119  1

In [None]:
# ...

### __`MODEL`__

- https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping
- https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint

In [None]:
callback_early_stopping = EarlyStopping(
                                        monitor='val_categorical_accuracy',  # val_loss
                                        patience=4,
                                        )

callback_save = ModelCheckpoint(
                                'best.hdf5',  # name to save the resulting model
                                monitor='val_loss',
                                save_best_only=True,
                                )

#### __`Training`__

In [None]:
history = model.fit(
                    train_dataset,
                    validation_data=valid_dataset,
                    epochs=20,
                    verbose=1, # print logs
                    callbacks=[callback_early_stopping, callback_save]  # callback_save
                    )

In [None]:
history = model.fit(
                    train_datagen,
                    steps_per_epoch=100,
                    epochs=100,
                    validation_data=valid_datagen,
                    validation_steps=100
                    )

- https://stackoverflow.com/questions/62836066/infinite-loop-with-imagedatagenerator

In [None]:
history = model.fit(
                    train_generator,
                    steps_per_epoch=100,
                    epochs=30,
                    validation_data=validation_generator,
                    validation_steps=50
                    )

### __`INTERPRET`__ / __`Error Analysis`__

### __`RE-TEST`__

### __`RESULTS` & `CONCLUSIONS`__