In [1]:
import numpy as np
import pandas as pd

# import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import os, fnmatch
from os import listdir, path as os_path
import pickle
import shutil
from collections import Counter
from tqdm import tqdm

from _helpers import make_directory

from sklearn.cluster import KMeans
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression


from PIL import Image
import cv2

from tensorflow import device as tf_device
from tensorflow.keras import optimizers, regularizers
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Activation, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array
from tensorflow.keras.utils import to_categorical

In [2]:
csv_name = 'Subtypes'
setting = 'Lets combine!'
main_path = f'data/Extracted/{csv_name}/{setting}/images_classified_raw'

In [3]:
# TRAIN_TEST_SPLIT   (RUN ONLY IF images_classified_raw folder contains only the "out" folder)

all_data_path = f'{main_path}/out'

train_path = os_path.join(main_path, 'train')
test_path = os_path.join(main_path, 'test')

make_directory(train_path)
make_directory(test_path)

train_entries, test_entries = train_test_split(listdir(all_data_path), test_size=0.15, shuffle=True)

for entry in train_entries:
    entry_path = os_path.join(all_data_path, entry)
    entry_output_path = os_path.join(train_path, entry)
    shutil.copyfile(entry_path, entry_output_path)

for entry in test_entries:
    entry_path = os_path.join(all_data_path, entry)
    entry_output_path = os_path.join(test_path, entry)
    shutil.copyfile(entry_path, entry_output_path)

In [4]:
train_path = os.path.join(main_path, 'train')
data_train = []
data_train_names = os.listdir(train_path)
pattern = "*.tiff"
for entry in data_train_names:
    if fnmatch.fnmatch(entry, pattern):
            data_train.append(os.path.join(train_path, entry))

In [5]:
print(len(data_train_names), len(data_train))

1865 1865


In [6]:
test_path = os.path.join(main_path, 'test')
data_test = []
data_test_names = os.listdir(test_path)
pattern = "*.tiff"
for entry in data_test_names:
    if fnmatch.fnmatch(entry, pattern):
            data_test.append(os.path.join(test_path, entry))

In [7]:
print(len(data_test_names), len(data_test))

330 330


In [8]:
data_train_list = []
for i in range(len(data_train)):
    arr_obj = []
    cl = (data_train[i].split("/")[-1]).split("~~~")[0]
#     if cl == "Sy1" or cl == "cv" or cl == "WD" or cl == "QSO": continue
    if cl not in {'sdB', 'C-H', 'Mrk SB', 'C Ba', 'sdO', 'sdA'}: continue
    arr_obj.append(cl)
    arr_obj.append(data_train[i])
    data_train_list.append(arr_obj)
    
data_test_list = []
for i in range(len(data_test)):
    arr_obj = []
    cl = (data_test[i].split("/")[-1]).split("~~~")[0]
#     if cl == "Sy1" or cl == "cv" or cl == "WD" or cl == "QSO": continue
    if cl not in {'sdB', 'C-H', 'Mrk SB', 'C Ba', 'sdO', 'sdA'}: continue
    arr_obj.append(cl)
    arr_obj.append(data_test[i])
    data_test_list.append(arr_obj)

In [9]:
print(len(data_train_list), len(data_test_list))

1668 297


In [10]:
df_train = pd.DataFrame(data_train_list)
df_train['fname'] = df_train[1].str.split('/', expand=True, ).iloc[:,-1]
df_train.rename(columns={1: "path", 0: "Cl"}, inplace=True)
df_test = pd.DataFrame(data_test_list)
df_test['fname'] = df_test[1].str.split('/', expand=True, ).iloc[:,-1]
df_test.rename(columns={1: "path", 0: "Cl"}, inplace=True)

In [11]:
df_train.head()

Unnamed: 0,Cl,path,fname
0,Mrk SB,data/Extracted/Subtypes/Lets combine!/images_c...,Mrk SB~~~1285__354.tiff
1,C Ba,data/Extracted/Subtypes/Lets combine!/images_c...,C Ba~~~22__J025658.37+332608.6.tiff
2,C-H,data/Extracted/Subtypes/Lets combine!/images_c...,C-H~~~464__J132626.19+192957.1.tiff
3,C-H,data/Extracted/Subtypes/Lets combine!/images_c...,C-H~~~574__J233418.64+200250.9.tiff
4,sdO,data/Extracted/Subtypes/Lets combine!/images_c...,sdO~~~3026__PG1204+543.tiff


In [12]:
df_test.head()

Unnamed: 0,Cl,path,fname
0,sdA,data/Extracted/Subtypes/Lets combine!/images_c...,sdA~~~1949__SDSSJ112504.73+671658.3.tiff
1,C-H,data/Extracted/Subtypes/Lets combine!/images_c...,C-H~~~267__J144814.56+024352.7.tiff
2,C Ba,data/Extracted/Subtypes/Lets combine!/images_c...,C Ba~~~240__J092132.77+072133.8.tiff
3,sdO,data/Extracted/Subtypes/Lets combine!/images_c...,sdO~~~2880__GALEXJ05557+6408.tiff
4,Mrk SB,data/Extracted/Subtypes/Lets combine!/images_c...,Mrk SB~~~1233__1208.tiff


In [13]:
def make_directory(path):
    folders = path.split('/')
    current_path = ''
    for folder in folders[:-1]:
        current_path += folder + '/' 
        try:
            os.mkdir(current_path)
        except OSError as error:
            pass
    current_path += folders[-1]
    shutil.rmtree(current_path, ignore_errors=True)
    os.mkdir(current_path)

def augment(data, n_times=1, batch_size=2, img_size=(140,20), input_path='data/images/',
            output_path='data/augmented/', seed = None, save_format='png', x_col='fname', y_col="Cl",
            shuffle=False, color_mode='grayscale', class_mode="categorical"):
    n_steps_data_aug = np.ceil(data.shape[0]/batch_size).astype(int)
    save_prefix = 'aug'
    
    datagen = ImageDataGenerator(
        rotation_range=1,
        width_shift_range=0.05,
        height_shift_range=0.05,
        shear_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=False,
        fill_mode="nearest"
        # rescale = 1./0xff
    )
    
    aug_gen = datagen.flow_from_dataframe(dataframe=data, directory=input_path,
                                          save_to_dir=output_path, save_prefix=save_prefix,
                                          save_format=save_format, x_col=x_col, y_col=y_col,
                                          batch_size=batch_size, seed=seed,
                                          shuffle=shuffle, color_mode=color_mode,
                                          class_mode=class_mode, target_size=img_size)

    make_directory(output_path)

    for i in tqdm(range(n_times*n_steps_data_aug)):
        next(aug_gen)

    augmented_images = np.array(os.listdir(output_path))
    aug_data = pd.concat([pd.Series(augmented_images).str.split('_', expand=True)[1], output_path + pd.Series(augmented_images)], axis=1)

    aug_data[y_col] = data[y_col].iloc[aug_data[1]].values
    aug_data[1] = data.iloc[aug_data[1]].index
    aug_data.rename(columns={0: "path", 1: "data_index"}, inplace=True)

    return aug_data, aug_gen.classes

In [14]:
img_size = (140, 20)
train_out_path = os.path.join(main_path, 'augmented/train/')
test_out_path = os.path.join(main_path, 'augmented/test/') 

In [15]:
aug_train, aug_classes = augment(df_train, n_times=8, batch_size=4, img_size=img_size,
                                 input_path=train_path, output_path=train_out_path,
                                 seed=None, save_format='tiff')
aug_test, _ = augment(df_test, n_times=2, batch_size=4, img_size=img_size, input_path=test_path,
                      output_path=test_out_path, seed=None, save_format='tiff')
aug_train.head()

Found 1668 validated image filenames belonging to 6 classes.


100%|████████████████████████████████████████████████████████████████████████| 3336/3336 [00:10<00:00, 315.21it/s]


Found 297 validated image filenames belonging to 6 classes.


100%|██████████████████████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 285.36it/s]


Unnamed: 0,data_index,path,Cl
0,731,data/Extracted/Subtypes/Lets combine!/images_c...,sdA
1,1039,data/Extracted/Subtypes/Lets combine!/images_c...,Mrk SB
2,231,data/Extracted/Subtypes/Lets combine!/images_c...,C-H
3,128,data/Extracted/Subtypes/Lets combine!/images_c...,Mrk SB
4,410,data/Extracted/Subtypes/Lets combine!/images_c...,sdB


In [16]:
df_train = aug_train
df_test = aug_test

In [17]:
aug_train.head()

Unnamed: 0,data_index,path,Cl
0,731,data/Extracted/Subtypes/Lets combine!/images_c...,sdA
1,1039,data/Extracted/Subtypes/Lets combine!/images_c...,Mrk SB
2,231,data/Extracted/Subtypes/Lets combine!/images_c...,C-H
3,128,data/Extracted/Subtypes/Lets combine!/images_c...,Mrk SB
4,410,data/Extracted/Subtypes/Lets combine!/images_c...,sdB


In [18]:
df_train['Cl'].value_counts()

sdB       4272
C-H       3232
Mrk SB    3080
C Ba      1024
sdA        880
sdO        856
Name: Cl, dtype: int64

In [19]:
df_test['Cl'].value_counts()

sdB       160
Mrk SB    148
C-H       138
C Ba       62
sdO        52
sdA        34
Name: Cl, dtype: int64

In [20]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df_train['Cl'])
df_train['Cl']=le.transform(df_train['Cl'])
df_test['Cl']=le.transform(df_test['Cl'])

In [21]:
df_train['Cl'].value_counts()

4    4272
1    3232
2    3080
0    1024
3     880
5     856
Name: Cl, dtype: int64

In [22]:
df_test['Cl'].value_counts()

4    160
2    148
1    138
0     62
5     52
3     34
Name: Cl, dtype: int64

In [23]:
X_tr = df_train['path']
Y_tr = df_train['Cl']
X_tr = X_tr.values
Y_tr = Y_tr.values

X_ts = df_test['path']
Y_ts = df_test['Cl']
X_ts = X_ts.values
Y_ts = Y_ts.values

In [24]:
images_list_train = []
max_width = 0
max_height = 0
ind = 0
for i in range(len(X_tr)):
    im = Image.open(X_tr[i])
    arr = np.array(im)
    
    arr=(arr-arr.min())/(arr.max()-arr.min())

    if arr.mean() > 0.5:
        arr = 1 - arr

    s = arr.shape
    if s[0] > max_height:
        max_height = s[0]
    if s[1] > max_width:
        max_width = s[1]
        ind = i
    images_list_train.append(arr)
    
images_list_test = []
ind = 0
for i in range(len(X_ts)):
    im = Image.open(X_ts[i])
    arr = np.array(im)
    
    arr=(arr-arr.min())/(arr.max()-arr.min())

    if arr.mean() > 0.5:
        arr = 1 - arr
    
    s = arr.shape
    if s[0] > max_height:
        max_height = s[0]
    if s[1] > max_width:
        max_width = s[1]
        ind = i
    images_list_test.append(arr)

In [25]:
max_height, max_width

(140, 20)

In [26]:
max_width = 50
max_height = 160
max_width = 32
max_height = 144

In [27]:
print(max_width, max_height)

32 144


In [28]:
for i in range(len(images_list_train)):
    s = images_list_train[i].shape
    d_width = (max_width - s[1])
    d_height = (max_height - s[0])
    
    d_top = int(d_height / 2)
    d_bottom = int(d_height - d_top)
    
    d_left = int(d_width / 2)
    d_right = int(d_width - d_left)
    #print(d_top, d_bottom, d_left, d_right)
    
    arr = images_list_train[i]
    for l in range(d_left):
        arr = np.insert(arr, 0, 0, axis = 1)
    
    for r in range(d_right):
        b = np.zeros((s[0],1))
        arr = np.append(arr, b, axis = 1)
    
    for t in range(d_top):
        arr = np.insert(arr, 0, 0, axis = 0)
    
    for b in range(d_bottom):
        b = np.zeros((1, arr.shape[1],))
        arr = np.append(arr, b, axis = 0)
    
    images_list_train[i] = arr.flatten()

for i in range(len(images_list_test)):
    s = images_list_test[i].shape
    d_width = (max_width - s[1])
    d_height = (max_height - s[0])
    
    d_top = int(d_height / 2)
    d_bottom = int(d_height - d_top)
    
    d_left = int(d_width / 2)
    d_right = int(d_width - d_left)
    #print(d_top, d_bottom, d_left, d_right)
    
    arr = images_list_test[i]
    for l in range(d_left):
        arr = np.insert(arr, 0, 0, axis = 1)
    
    for r in range(d_right):
        b = np.zeros((s[0],1))
        arr = np.append(arr, b, axis = 1)
    
    for t in range(d_top):
        arr = np.insert(arr, 0, 0, axis = 0)
    
    for b in range(d_bottom):
        b = np.zeros((1, arr.shape[1],))
        arr = np.append(arr, b, axis = 0)
    
    images_list_test[i] = arr.flatten()

In [29]:
images_np_train = np.array(images_list_train)
images_np_test = np.array(images_list_test)

In [30]:
class_count = len(df_train['Cl'].unique())
X_train = images_np_train
Y_train = Y_tr
Y_train = to_categorical(Y_train, class_count)

X_test = images_np_test
Y_test = Y_ts
Y_test = to_categorical(Y_test, class_count)

In [31]:
X_train = X_train.reshape(X_train.shape[0], max_height, max_width, 1)
X_test = X_test.reshape(X_test.shape[0], max_height, max_width, 1)
input_shape = (max_height, max_width, 1)

In [32]:
print('x_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

x_train shape: (13344, 144, 32, 1)
13344 train samples
594 test samples


In [33]:
X_train_3 = np.concatenate([X_train, X_train, X_train], axis=-1)
X_test_3 = np.concatenate([X_test, X_test, X_test], axis=-1)

In [34]:
X_train_3.shape, X_test_3.shape, (input_shape[0], input_shape[1], 3)

((13344, 144, 32, 3), (594, 144, 32, 3), (144, 32, 3))

In [35]:
tl = DenseNet121(include_top=False, weights='imagenet', input_shape=(input_shape[0], input_shape[1], 3), pooling=None)
# tl.summary()

2022-09-24 16:18:19.343723: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2022-09-24 16:18:19.343754: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: mlgod-GF63-Thin-11UC
2022-09-24 16:18:19.343759: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: mlgod-GF63-Thin-11UC
2022-09-24 16:18:19.343991: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.141.3
2022-09-24 16:18:19.344006: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.141.3
2022-09-24 16:18:19.344010: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.141.3
2022-09-24 16:18:19.344703: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions

In [36]:
for layer in tl.layers:
    layer.trainable = False

In [37]:
modTL = Sequential()
modTL.add(Input(shape=(input_shape[0], input_shape[1], 3)))
# modTL.add(Conv2D(filters=3, kernel_size=(1,1), activation='relu', padding='same'))
modTL.add(tl)
modTL.add(Flatten())
modTL.add(Dense(128, activation='relu'))
modTL.add(Dropout(0.5))
modTL.add(Dense(class_count,activation='softmax'))
modTL.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 densenet121 (Functional)    (None, 4, 1, 1024)        7037504   
                                                                 
 flatten (Flatten)           (None, 4096)              0         
                                                                 
 dense (Dense)               (None, 128)               524416    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 6)                 774       
                                                                 
Total params: 7,562,694
Trainable params: 525,190
Non-trainable params: 7,037,504
_________________________________________________________________


In [38]:
adadelta = optimizers.Adadelta(learning_rate=0.01, rho=0.95)
modTL.compile(optimizer='adamax' , loss='categorical_crossentropy', metrics=['accuracy'])

In [39]:
checkpoint_directory = f'data/Checkpoints/{csv_name}/{setting}/'

model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_directory + 'checkpoint',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

make_directory(checkpoint_directory)

def lr_schedule(epoch):
    lrate = 0.002
    if epoch > 3:
        lrate = 0.00001
    if epoch > 6:
        lrate = 0.000005
    if epoch > 9:
        lrate = 0.0000025
    if epoch > 12:
        lrate = 0.000001
    if epoch > 15:
        lrate = 0.0000005
    return lrate

lr_scheduler = LearningRateScheduler(lr_schedule)

cw = {0:3, 1:4, 2:3, 3:4, 4:3}

# counter = Counter(aug_classes)
# max_val = float(max(counter.values()))       
# cw = {class_id : max_val/num_images for class_id, num_images in counter.items()}

with tf_device('GPU:0'):
    history = modTL.fit(X_train_3, Y_train, epochs = 10, batch_size = 1, shuffle=True, validation_data=(X_test_3, Y_test), class_weight=cw, callbacks=[model_checkpoint_callback])
#     Learning rate scheduler
#     lr_scheduler

2022-09-24 16:18:33.524390: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 737869824 exceeds 10% of free system memory.


Epoch 1/10
   25/13344 [..............................] - ETA: 3:57 - loss: 15.9540 - accuracy: 0.2000 

InvalidArgumentError:  indices[0] = 5 is not in [0, 5)
	 [[{{node GatherV2}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_19630]

In [None]:
with tf_device('GPU:0'):
    history_TL1 = modTL.fit(X_train, Y_train, epochs = 10, batch_size = 1, shuffle=True, validation_data=(X_test, Y_test), class_weight=cw, callbacks=[model_checkpoint_callback, lr_scheduler])

In [None]:
tl_network = MobileNetV2(include_top=False, weights=None, input_shape=input_shape, pooling='max')
tl_network.summary()

In [None]:
# EfB7 = EfficientNetB7(include_top=False, weights=None, input_shape=input_shape, pooling='max')
# EfB7.summary()

In [None]:
modB7 = Sequential()
modB7.add(Input(input_shape))
modB7.add(tl_network)
modB7.add(Dense(class_count,activation='softmax'))
modB7.summary()

In [None]:
adadelta = optimizers.Adadelta(learning_rate=0.01, rho=0.95)
modB7.compile(optimizer='adamax' , loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
checkpoint_directory = f'data/Checkpoints/{csv_name}/{setting}/'

model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_directory + 'checkpoint',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

make_directory(checkpoint_directory)

In [None]:
cw = {0:3, 1:4, 2:3, 3:4, 4:3}

# counter = Counter(aug_classes)
# max_val = float(max(counter.values()))       
# cw = {class_id : max_val/num_images for class_id, num_images in counter.items()}

with tf_device('GPU:0'):
    history = modB7.fit(X_train, Y_train, epochs = 10, batch_size = 1, shuffle=True, validation_data=(X_test, Y_test), class_weight=cw, callbacks=[model_checkpoint_callback, lr_scheduler])
#     Learning rate scheduler
# 

In [50]:
model = Sequential()
model.add(Conv2D(128, kernel_size=(3,3), input_shape=input_shape, padding="same"))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(128, (3, 3), padding="same"))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(1, 2)))

model.add(Conv2D(64, (3, 3), padding="same"))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 1)))

model.add(Conv2D(64, (3, 3), padding="same"))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3), padding="same"))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 1)))

model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(class_count,activation='softmax'))

In [51]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_10 (Conv2D)          (None, 144, 32, 128)      1280      
                                                                 
 max_pooling2d_10 (MaxPoolin  (None, 72, 16, 128)      0         
 g2D)                                                            
                                                                 
 conv2d_11 (Conv2D)          (None, 72, 16, 128)       147584    
                                                                 
 activation_8 (Activation)   (None, 72, 16, 128)       0         
                                                                 
 max_pooling2d_11 (MaxPoolin  (None, 72, 8, 128)       0         
 g2D)                                                            
                                                                 
 conv2d_12 (Conv2D)          (None, 72, 8, 64)        

In [52]:
adadelta = optimizers.Adadelta(learning_rate=0.01, rho=0.95)
model.compile(optimizer='adam' , loss='categorical_crossentropy', metrics=['accuracy'])

In [53]:
checkpoint_directory = f'data/Checkpoints/{csv_name}/{setting}/'

model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_directory + 'checkpoint',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

make_directory(checkpoint_directory)

def lr_schedule(epoch):
    lrate = 0.002
    if epoch > 3:
        lrate = 0.001
    if epoch > 6:
        lrate = 0.0005
    if epoch > 9:
        lrate = 0.00025
    if epoch > 12:
        lrate = 0.0001
    if epoch > 15:
        lrate = 0.00005
    return lrate

lr_scheduler = LearningRateScheduler(lr_schedule)

In [None]:
#cw = {0:1, 1:1, 2:1, 3:1, 4:1}
# cw = {0:3, 1:4, 2:3, 3:4, 4:3}

counter = Counter(aug_classes)
max_val = float(max(counter.values()))       
cw = {class_id : max_val/num_images for class_id, num_images in counter.items()}

with tf_device('GPU:0'):
    history = model.fit(X_train, Y_train, epochs = 35, batch_size = 8, shuffle=True, validation_data=(X_test, Y_test), class_weight=cw, callbacks=[model_checkpoint_callback])
#     Learning rate scheduler
#     history = model.fit(X_train, Y_train, epochs = 20, batch_size = 8, shuffle=True, validation_data=(X_test, Y_test), class_weight=cw, callbacks=[model_checkpoint_callback, lr_scheduler])

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35

In [None]:
with tf_device('GPU:0'):
    history_1 = model.fit(X_train, Y_train, epochs = 15, batch_size = 8, shuffle=True, validation_data=(X_test, Y_test), class_weight=cw, callbacks=[model_checkpoint_callback])

In [None]:
plt.plot(history.history['accuracy'], label='Accuracy (train data)', linewidth=3, markersize=12)
plt.plot(history.history['val_accuracy'], label='Accuracy (validation data)', linewidth=3, markersize=12)
plt.plot(history.history['loss'], label='loss (train data)', linewidth=3, markersize=12)
plt.plot(history.history['val_loss'], label='loss (validation data)', linewidth=3, markersize=12)
plt.title('Loss and Accuracy')
plt.ylabel('value')
plt.xlabel('No. epoch')
plt.legend(loc="upper left")
plt.rcParams["figure.figsize"] = (20,15)
plt.show()

In [None]:
from sklearn.metrics import classification_report

y_test = np.argmax(Y_test, axis=1) # Convert one-hot to index
y_pred = model.predict(X_test)
print(classification_report(Y_test, to_categorical(np.argmax(y_pred, axis=1))))

In [None]:
model.load_weights(checkpoint_directory + 'checkpoint')

In [None]:
from sklearn.metrics import classification_report

y_test = np.argmax(Y_test, axis=1) # Convert one-hot to index
y_pred = model.predict(X_test)
print(classification_report(Y_test, to_categorical(np.argmax(y_pred, axis=1))))

In [None]:
train_path = 'data/Existing/FinalDataset/train/'
train_path = f'data/Extracted/{csv_name}/{setting}/images_classified_raw/train/'
data_train = []
data_train_names = os.listdir(train_path)
pattern = "*.tiff"
for entry in data_train_names:
    if fnmatch.fnmatch(entry, pattern):
            data_train.append(train_path+entry)

In [None]:
test_path = 'data/Existing/FinalDataset/test/'
test_path = f'data/Extracted/{csv_name}/{setting}/images_classified_raw/test/'
data_test = []
data_test_names = os.listdir(test_path)
pattern = "*.tiff"
for entry in data_test_names:
    if fnmatch.fnmatch(entry, pattern):
            data_test.append(test_path+entry)

In [None]:
data_train_list = []
for i in range(len(data_train)):
    arr_obj = []
    cl = (data_train[i].split("/")[-1]).split("~~~")[0]
    if cl == "Sy1" or cl == "cv" or cl == "WD" or cl == "QSO": continue
    arr_obj.append(cl)
    arr_obj.append(data_train[i])
    data_train_list.append(arr_obj)
    
data_test_list = []
for i in range(len(data_test)):
    arr_obj = []
    cl = (data_test[i].split("/")[-1]).split("~~~")[0]
    if cl == "Sy1" or cl == "cv" or cl == "WD" or cl == "QSO": continue
    arr_obj.append(cl)
    arr_obj.append(data_test[i])
    data_test_list.append(arr_obj)

In [None]:
train_df = pd.DataFrame(data_train_list)
train_df['fname'] = train_df[1].str.split('/', expand=True, ).iloc[:,-1]
train_df.rename(columns={1: "path", 0: "Cl"}, inplace=True)
test_df = pd.DataFrame(data_test_list)
test_df['fname'] = test_df[1].str.split('/', expand=True, ).iloc[:,-1]
test_df.rename(columns={1: "path", 0: "Cl"}, inplace=True)

In [None]:
train_df['fname'] = train_df['fname'].str.split('~~~', expand=True).iloc[:,-1]
test_df['fname'] = test_df['fname'].str.split('~~~', expand=True).iloc[:,-1]

In [None]:
train_df.head()

In [None]:
train_df['name'] = train_df['fname'].str.split('__', expand=True).iloc[:,-1]
test_df['name'] = test_df['fname'].str.split('__', expand=True).iloc[:,-1]
train_df.head()

In [None]:
extracted_path = f'data/Extracted/{csv_name}/{setting}/images/'
extracted_train_path = 'data/Existing/FinalDataset/train/'
extracted_test_path = 'data/Existing/FinalDataset/test/'

In [None]:
extracted = pd.Series(listdir(extracted_path))
extracted_train = pd.Series(listdir(extracted_train_path))
extracted_test = pd.Series(listdir(extracted_test_path))

extracted.head()
extracted_train.head()

In [None]:
extracted_train.name = 'fullName'
extracted_test.name = 'fullName'

In [None]:
extracted_train = pd.concat([extracted_train, extracted_train.str.split('~~~', expand=True)], axis=1)
extracted_test = pd.concat([extracted_test, extracted_test.str.split('~~~', expand=True)], axis=1)
extracted_df = pd.concat([extracted_path + extracted, extracted], axis=1)
extracted_df.head()
extracted_train.head()

In [None]:
extracted_df.rename(columns={0:'path', 1:'fname'}, inplace=True)
extracted_train.rename(columns={0:'Cl', 1:'name'}, inplace=True)
extracted_test.rename(columns={0:'Cl', 1:'name'}, inplace=True)
extracted_df.head()
extracted_train.head()

In [None]:
cl_name = extracted_df['fname'].str.split('__', expand=True).values
extracted_df['Cl'], extracted_df['name'] = cl_name[:,0], cl_name[:,1]

extracted_train['path'] = extracted_train_path + extracted_train['fullName']
extracted_test['path'] = extracted_test_path + extracted_test['fullName']

extracted_df.head()
extracted_train.head()

In [None]:
data = pd.read_csv('data/Datasets/Combined.csv')
data.head()

In [None]:
extracted_df['Cl'] = data.iloc[extracted_df['Cl']]['Cl'].values
extracted_df.head()

In [None]:
extracted_df['Cl'].value_counts()
extracted_train['Cl'].value_counts(), extracted_test['Cl'].value_counts()

In [None]:
extracted_df.drop(extracted_df[(extracted_df['Cl'] == 'cv') | (extracted_df['Cl'] == 'QSO') | (extracted_df['Cl'] == 'WD') | (extracted_df['Cl'] == 'Sy1')].index, inplace=True)

In [None]:
extracted_df['Cl'].value_counts()

In [None]:
extracted_df.head()
extracted_train.head()

In [None]:
train_ext = []
test_ext = []
ext = []

for row, value in extracted_df.iterrows():
    rows = train_df[train_df['fname'] == value['name']]
    if rows.shape[0]:
        train_ext.append((value['path'], value['name'], value['Cl']))
        continue
    rows = test_df[test_df['fname'] == value['name']]
    if rows.shape[0]:
        test_ext.append((value['path'], value['name'], value['Cl']))
        continue
    ext.append((value['path'], value['name'], value['Cl']))

In [None]:
train = []
test = []
unseen = []

In [None]:
for row, value in extracted_train.iterrows():
    rows = train_df[train_df['name'] == value['name']]
    if rows.shape[0]:
        train.append((value['path'], value['fullName'], value['Cl']))
        continue
    rows = test_df[test_df['name'] == value['name']]
    if rows.shape[0]:
        test.append((value['path'], value['fullName'], value['Cl']))
        continue
    unseen.append((value['path'], value['fullName'], value['Cl']))

In [None]:
for row, value in extracted_test.iterrows():
    rows = train_df[train_df['name'] == value['name']]
    if rows.shape[0]:
        train.append((value['path'], value['fullName'], value['Cl']))
        continue
    rows = test_df[test_df['name'] == value['name']]
    if rows.shape[0]:
        test.append((value['path'], value['fullName'], value['Cl']))
        continue
    unseen.append((value['path'], value['fullName'], value['Cl']))

In [None]:
len(train_ext), len(test_ext), len(ext)

In [None]:
len(train), len(test), len(unseen)

In [None]:
train_ext = pd.DataFrame(train_ext, columns=['path', 'name', 'Cl'])
test_ext = pd.DataFrame(test_ext, columns=['path', 'name', 'Cl'])
ext = pd.DataFrame(ext, columns=['path', 'name', 'Cl'])

train = pd.DataFrame(train, columns=['path', 'name', 'Cl'])
test = pd.DataFrame(test, columns=['path', 'name', 'Cl'])
unseen = pd.DataFrame(unseen, columns=['path', 'name', 'Cl'])

In [None]:
train_ext.head()
train.head()

In [None]:
train['Cl'].value_counts(), test['Cl'].value_counts(), unseen['Cl'].value_counts()

In [None]:
# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# le.fit(train_ext['Cl'])
# train_ext['Cl']=le.transform(train_ext['Cl'])
# test_ext['Cl']=le.transform(test_ext['Cl'])
# ext['Cl'] = le.transform(ext['Cl'])

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(train['Cl'])
train['Cl']=le.transform(train['Cl'])
test['Cl']=le.transform(test['Cl'])
unseen['Cl'] = le.transform(unseen['Cl'])

In [None]:
# X_tr = train_ext['path']
# Y_tr = train_ext['Cl']
# X_tr = X_tr.values
# Y_tr = Y_tr.values

# X_ts = test_ext['path']
# Y_ts = test_ext['Cl']
# X_ts = X_ts.values
# Y_ts = Y_ts.values

# X_e = ext['path']
# Y_e = ext['Cl']
# X_e = X_e.values
# Y_e = Y_e.values

In [None]:
X_tr = train['path']
Y_tr = train['Cl']
X_tr = X_tr.values
Y_tr = Y_tr.values

X_ts = test['path']
Y_ts = test['Cl']
X_ts = X_ts.values
Y_ts = Y_ts.values

X_e = unseen['path']
Y_e = unseen['Cl']
X_e = X_e.values
Y_e = Y_e.values

In [None]:
il_train = []
max_width = 0
max_height = 0
ind = 0
for i in range(len(X_tr)):
    im = Image.open(X_tr[i])
    arr = np.array(im)
    arr = np.flipud(arr)
    
    arr=(arr-arr.min())/(arr.max()-arr.min())

    if arr.mean() > 0.5:
        arr = 1 - arr

    s = arr.shape
    if s[0] > max_height:
        max_height = s[0]
    if s[1] > max_width:
        max_width = s[1]
        ind = i
    il_train.append(arr)
    
il_test = []
ind = 0
for i in range(len(X_ts)):
    im = Image.open(X_ts[i])
    arr = np.array(im)
    arr = np.flipud(arr)
    
    arr=(arr-arr.min())/(arr.max()-arr.min())

    if arr.mean() > 0.5:
        arr = 1 - arr

    s = arr.shape
    if s[0] > max_height:
        max_height = s[0]
    if s[1] > max_width:
        max_width = s[1]
        ind = i
    il_test.append(arr)
    
il_e = []
ind = 0
for i in range(len(X_e)):
    im = Image.open(X_e[i])
    arr = np.array(im)
    arr = np.flipud(arr)
    
    arr=(arr-arr.min())/(arr.max()-arr.min())

    if arr.mean() > 0.5:
        arr = 1 - arr

    s = arr.shape
    if s[0] > max_height:
        max_height = s[0]
    if s[1] > max_width:
        max_width = s[1]
        ind = i
    il_e.append(arr)

In [None]:
max_width = 50
max_height = 160

In [None]:
for i in range(len(il_train)):
    s = il_train[i].shape
    d_width = (max_width - s[1])
    d_height = (max_height - s[0])
    
    d_top = int(d_height / 2)
    d_bottom = int(d_height - d_top)
    
    d_left = int(d_width / 2)
    d_right = int(d_width - d_left)
    
    arr = il_train[i]
    for l in range(d_left):
        arr = np.insert(arr, 0, 0, axis = 1)
    
    for r in range(d_right):
        b = np.zeros((s[0],1))
        arr = np.append(arr, b, axis = 1)
    
    for t in range(d_top):
        arr = np.insert(arr, 0, 0, axis = 0)
    
    for b in range(d_bottom):
        b = np.zeros((1, arr.shape[1],))
        arr = np.append(arr, b, axis = 0)
    
    il_train[i] = arr.flatten()

for i in range(len(il_test)):
    s = il_test[i].shape
    d_width = (max_width - s[1])
    d_height = (max_height - s[0])
    
    d_top = int(d_height / 2)
    d_bottom = int(d_height - d_top)
    
    d_left = int(d_width / 2)
    d_right = int(d_width - d_left)
    
    arr = il_test[i]
    for l in range(d_left):
        arr = np.insert(arr, 0, 0, axis = 1)

    for r in range(d_right):
        b = np.zeros((s[0],1))
        arr = np.append(arr, b, axis = 1)

    for t in range(d_top):
        arr = np.insert(arr, 0, 0, axis = 0)

    for b in range(d_bottom):
        b = np.zeros((1, arr.shape[1],))
        arr = np.append(arr, b, axis = 0)

    il_test[i] = arr.flatten()

for i in range(len(il_e)):
    s = il_e[i].shape
    d_width = (max_width - s[1])
    d_height = (max_height - s[0])
    
    d_top = int(d_height / 2)
    d_bottom = int(d_height - d_top)
    
    d_left = int(d_width / 2)
    d_right = int(d_width - d_left)
    
    arr = il_e[i]
    for l in range(d_left):
        arr = np.insert(arr, 0, 0, axis = 1)
    
    for r in range(d_right):
        b = np.zeros((s[0],1))
        arr = np.append(arr, b, axis = 1)
    
    for t in range(d_top):
        arr = np.insert(arr, 0, 0, axis = 0)
    
    for b in range(d_bottom):
        b = np.zeros((1, arr.shape[1],))
        arr = np.append(arr, b, axis = 0)
    
    il_e[i] = arr.flatten()

In [None]:
inp_train = np.array(il_train)
inp_test = np.array(il_test)
inp_e = np.array(il_e)

In [None]:
class_count = 5
X_train1 = inp_train
Y_train1 = Y_tr
Y_train1 = to_categorical(Y_train1, class_count)

X_test1 = inp_test
Y_test1 = Y_ts
Y_test1 = to_categorical(Y_test1, class_count)

X_e1 = inp_e
Y_e1 = Y_e
Y_e1 = to_categorical(Y_e1, class_count)

In [None]:
X_train1 = X_train1.reshape(X_train1.shape[0], max_height, max_width, 1)
X_test1 = X_test1.reshape(X_test1.shape[0], max_height, max_width, 1)
X_e1 = X_e1.reshape(X_e1.shape[0], max_height, max_width, 1)
input_shape = (max_height, max_width, 1)

In [None]:
print('x_train shape:', X_train1.shape)
print(X_train1.shape[0], 'train samples')
print(X_test1.shape[0], 'test samples')
print(X_e1.shape[0], 'ext samples')

In [None]:
model.evaluate(X_train1, Y_train1, batch_size=8)

In [None]:
model.evaluate(X_test1, Y_test1, batch_size=8)

In [None]:
model.evaluate(X_e1, Y_e1, batch_size=8)

In [None]:
for i in range(20):
    plt.figure(figsize=(4,8))
    plt.imshow(X_e1[i]); plt.gray();
    plt.show()