In [1]:
import glob
import cv2
import pandas as pd
import pyreadr
import numpy as np
import ntpath
import re
import time
from PIL import Image
import os
import pickle
import random
import math

from tqdm import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf

# Expose only the GPU with index 1 to CUDA for this process.
# NOTE(review): this runs after `import tensorflow`; presumably it still takes
# effect because no GPU has been initialised at that point — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
# Shape of one input image as given to Keras: 200 x 200 pixels, 3 channels.
input_shape = (200, 200, 3)

In [3]:
# ResNet50V2 backbone with ImageNet weights and without its classification
# head (`include_top=False`), so it outputs a spatial feature map.
base_model = tf.keras.applications.ResNet50V2(
    input_shape=input_shape,
    include_top=False,
    weights='imagenet',
)

In [4]:
# Print the layer-by-layer structure and parameter counts of the backbone.
base_model.summary()

Model: "resnet50v2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200, 200, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 206, 206, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, 100, 100, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
pool1_pad (ZeroPadding2D)       (None, 102, 102, 64) 0           conv1_conv[0][0]                 
_________________________________________________________________________________________

In [6]:
# Augmentation pipeline applied inside the model: random horizontal flip,
# random rotation (factor 0.1) and random zoom (factor 0.1).
data_augmentation = tf.keras.models.Sequential([
    tf.keras.layers.experimental.preprocessing.RandomFlip(
        "horizontal",
        input_shape=input_shape,
    ),
    tf.keras.layers.experimental.preprocessing.RandomRotation(0.1),
    tf.keras.layers.experimental.preprocessing.RandomZoom(0.1),
])

In [11]:
# Assemble the classifier:
#   augmentation -> ResNet50V2 backbone -> global average pooling
#   -> dropout (0.2) -> single sigmoid unit (binary output).
inputs = tf.keras.Input(shape=input_shape)
x = data_augmentation(inputs)
# No explicit `training=` argument here.  A hard-coded `training=True` would be
# baked into the graph and keep the backbone's BatchNormalization layers in
# training mode during validation, `evaluate()` and `predict()` as well.
# Leaving it unset lets Keras pick the mode per call (training inside `fit`,
# inference everywhere else).
# NOTE(review): no `preprocess_input` step is applied; the backbone receives
# inputs in whatever scale the data pipeline produces — confirm that is intended.
x = base_model(x)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dropout(0.2)(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs, outputs)

In [38]:
# NOTE(review): `base_learning_rate` is not passed to the optimizer below,
# which uses 0.001 — confirm which value is intended.
base_learning_rate = 0.0001

# Binary classification set-up: SGD optimiser, binary cross-entropy loss, and
# accuracy / precision / recall / AUC reported during training.
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.AUC(),
    ],
)

In [13]:
# Print the structure and parameter counts of the assembled classifier.
model.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 200, 200, 3)]     0         
_________________________________________________________________
sequential_1 (Sequential)    (None, 200, 200, 3)       0         
_________________________________________________________________
resnet50v2 (Functional)      (None, 7, 7, 2048)        23564800  
_________________________________________________________________
global_average_pooling2d_1 ( (None, 2048)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 2049      
Total params: 23,566,849
Trainable params: 23,521,409
Non-trainable params: 45,440
_____________________________________

In [31]:
class StockDatagen:
    """Split a set of PNG images into training and validation generators.

    The last ``val_size`` paths of the globbed list form the validation set
    and all preceding paths form the training set.  The list keeps the order
    returned by ``glob.glob``, which depends on the file system.
    """

    def __init__(self, img_dir: str, label_dir: str, val_size: int):
        """Collect the image paths found under ``img_dir``.

        Args:
            img_dir: Prefix that is concatenated with ``"*.png"`` to find the
                images, so a directory must end with a path separator.
            label_dir: Location of the label files; forwarded unchanged to
                ``DataGenerator`` as ``label_path``.
            val_size: Number of images reserved for validation.
        """
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.val_size = val_size
        # Use the constructor argument rather than a notebook-level global so
        # the class works regardless of which cells have been run.
        self.img_paths = self.__generate_paths(self.img_dir, "*.png")

    def get_training_generators(self, batch_size: int):
        """Create the training and validation ``DataGenerator`` objects.

        Args:
            batch_size: Batch size used by both generators.

        Returns:
            Tuple ``(train_gen, valid_gen)``.

        Raises:
            ValueError: If ``val_size`` is negative or larger than the number
                of images; a negative split index would otherwise silently
                produce a wrong split.
        """
        n_images = len(self.img_paths)
        if not 0 <= self.val_size <= n_images:
            raise ValueError(
                "val_size must be between 0 and the number of images "
                "({}), got {}".format(n_images, self.val_size)
            )
        split = n_images - self.val_size

        train_gen = DataGenerator(label_path=self.label_dir,
                                  batch_size=batch_size,
                                  img_paths=self.img_paths[:split])
        valid_gen = DataGenerator(label_path=self.label_dir,
                                  batch_size=batch_size,
                                  img_paths=self.img_paths[split:])

        return train_gen, valid_gen

    def __generate_paths(self, path, pattern):
        """Return every path matching ``path + pattern``."""
        return glob.glob(path + pattern)

In [29]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that reads image batches from disk with their labels.

    Image file names are expected to look like ``<ticker><index>.<ext>``
    (for example ``AAPL12.png``).  The label of such an image is row
    ``index - 1`` of the column named ``<ticker>`` in the label table, which
    is assembled from the ``*.rda`` files found under ``label_path``.
    """

    def __init__(self, label_path: str, batch_size: int, img_paths):
        """Store the batch configuration and load the label table.

        Args:
            label_path: Prefix that is concatenated with ``"*.rda"`` to find
                the label files, so a directory must end with a separator.
            batch_size: Maximum number of images per batch.
            img_paths: Paths of the images served by this generator.
        """
        self.batch_size = batch_size
        # Copy, so the in-place shuffle in `on_epoch_end` never reorders a
        # list the caller still holds.
        self.image_paths = list(img_paths)
        self.label_df = self.__generate_label_df(label_path)

    def __len__(self):
        """Number of batches per epoch (the last one may be partial)."""
        return math.ceil(len(self.image_paths) / self.batch_size)

    def __getitem__(self, index):
        """Return batch ``index`` as ``(images, labels)`` NumPy arrays.

        Pixel values are multiplied by 1/255.

        Raises:
            FileNotFoundError: If OpenCV cannot read one of the images.
        """
        start = index * self.batch_size
        # Slicing clamps at the end of the list, so the final batch is simply
        # shorter when the data does not divide evenly.
        batch_paths = self.image_paths[start:start + self.batch_size]

        images = []
        labels = []
        for image_path in batch_paths:
            ticker, label_idx = self.__get_ticker_index(ntpath.basename(image_path))
            img = cv2.imread(image_path)
            if img is None:
                # cv2.imread reports failure by returning None, not by raising.
                raise FileNotFoundError(
                    "Could not read image: {}".format(image_path)
                )
            images.append(img)
            # Indices in the file names are 1-based; DataFrame rows are 0-based.
            labels.append(self.label_df[ticker].loc[int(label_idx) - 1])

        return np.multiply(np.asarray(images), 1 / 255), np.asarray(labels)

    def __generate_paths(self, path, pattern):
        """Return every path matching ``path + pattern``."""
        return glob.glob(path + pattern)

    def __generate_label_df(self, label_path):
        """Build a DataFrame holding one label column per ticker.

        The ticker is the label file name up to its last ``-``; the values are
        the ``labels`` column of the ``labels`` object stored in that file.
        """
        columns = {}
        for label_file_path in self.__generate_paths(label_path, "*.rda"):
            label_file_name = ntpath.basename(label_file_path)
            ticker = label_file_name[:label_file_name.rfind('-')]
            r_objects = pyreadr.read_r(label_file_path)
            columns[ticker] = r_objects['labels']['labels'].to_numpy()

        # Build the frame once instead of inserting columns one at a time.
        return pd.DataFrame(columns)

    def __get_ticker_index(self, s):
        """Split an image file name into ``(ticker, index)`` strings.

        Example: ``"AAPL12.png"`` -> ``("AAPL", "12")``.  The extension is
        removed with ``splitext`` so its length and dots inside the ticker do
        not affect the result.
        """
        stem = ntpath.splitext(s)[0]
        head = stem.rstrip('0123456789')
        return head, stem[len(head):]

    def on_epoch_end(self):
        """Shuffle the image order in place at the end of every epoch."""
        # random.shuffle works in place and returns None, so its result must
        # not be assigned to anything.
        random.shuffle(self.image_paths)

In [None]:
# Dataset locations under /mnt.  Both strings end with "/" because they are
# concatenated directly with glob patterns.
# NOTE(review): a later cell assigns these two names again with /tmp paths.
path_labels = "/mnt/data/home/kantek/wu/data/"
path_img = "/mnt/data/home/kantek/wu/data/img/"

In [16]:
# Dataset locations under /tmp.  Both strings end with "/" because they are
# concatenated directly with glob patterns.
# NOTE(review): an earlier cell assigns the same two names with /mnt paths;
# whichever cell ran last wins.
path_labels = "/tmp/img_data/"
path_img = "/tmp/img_data/img/"

In [32]:
# Index the images and reserve 20000 of them for validation.
datagen = StockDatagen(
    img_dir=path_img,
    label_dir=path_labels,
    val_size=20000,
)

In [33]:
# Build the training and validation generators, requesting a batch size of 64.
train_gen, valid_gen = datagen.get_training_generators(batch_size=64)

In [34]:
def scheduler(epoch, lr, hold_epochs=2, decay=0.1):
    """Learning-rate schedule: hold, then decay exponentially per epoch.

    Args:
        epoch: Index of the epoch about to start.
        lr: Current learning rate.
        hold_epochs: Last epoch index for which ``lr`` is returned unchanged.
        decay: Exponent of the decay; afterwards each call multiplies ``lr``
            by ``exp(-decay)``.

    Returns:
        The learning rate to use.  The decayed value is computed with
        ``math.exp`` so it is a plain Python float rather than a TensorFlow
        tensor.
    """
    if epoch <= hold_epochs:
        return lr
    return lr * math.exp(-decay)

In [35]:
# Wrap `scheduler` in a Keras callback so it can be passed to `fit`.
callback = tf.keras.callbacks.LearningRateScheduler(schedule=scheduler)

In [36]:
# Train for 8 epochs from the generators with the learning-rate schedule
# attached; batches are prepared by 16 workers.
history20 = model.fit(
    train_gen,
    validation_data=valid_gen,
    epochs=8,
    callbacks=[callback],
    workers=16,
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [37]:
# A further 8 epochs on the same `model` object; the weights carry over from
# any earlier `fit` call.
history21 = model.fit(
    train_gen,
    validation_data=valid_gen,
    epochs=8,
    callbacks=[callback],
    workers=16,
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [39]:
# Another 4 epochs on the same `model` object with the same generators and
# learning-rate callback.
history22 = model.fit(
    train_gen,
    validation_data=valid_gen,
    epochs=4,
    callbacks=[callback],
    workers=16,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# Array-based training run: 4 epochs, batch size 64.
# NOTE(review): `X_train`, `y_train`, `X_val` and `y_val` are not defined in
# the visible part of this notebook — this cell fails on a fresh kernel.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=4,
    validation_data=(X_val, y_val),
    workers=1,
)

In [None]:
# Persist the current model under the name "mobilenetV2_best".
# NOTE(review): the model built earlier in this notebook wraps ResNet50V2, not
# MobileNetV2 — the file name looks stale; confirm before relying on it.
model.save("mobilenetV2_best")

In [None]:
# Recompile with a much smaller SGD step (1e-5) and accuracy as the only
# metric.  `learning_rate` is the supported keyword; `lr` is only a
# deprecated alias.
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.00001),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=['accuracy'],
)

In [None]:
# Array-based training run: 4 epochs, batch size 32.
# NOTE(review): `X_train`, `y_train`, `X_val` and `y_val` are not defined in
# the visible part of this notebook — this cell fails on a fresh kernel.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=4,
    validation_data=(X_val, y_val),
    workers=1,
)

In [None]:
# Array-based training run: 3 epochs, batch size 32.
# NOTE(review): `X_train`, `y_train`, `X_val` and `y_val` are not defined in
# the visible part of this notebook — this cell fails on a fresh kernel.
history10 = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=3,
    validation_data=(X_val, y_val),
    workers=1,
)

In [None]:
# Array-based training run: 4 epochs, batch size 64.
# NOTE(review): `X_train`, `y_train`, `X_val` and `y_val` are not defined in
# the visible part of this notebook — this cell fails on a fresh kernel.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=4,
    validation_data=(X_val, y_val),
    workers=1,
)

In [None]:
# Small baseline CNN: two 3x3 convolutions (32 filters each), max-pooling,
# dropout, then a 128-unit dense layer and a single sigmoid output.
# NOTE(review): this rebinds `model`, replacing whatever model the name held
# before this cell ran.
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, kernel_size=(3, 3),
                           activation='relu',
                           input_shape=(200, 200, 3)),
    tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()

# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
model.compile(
    loss=tf.keras.losses.binary_crossentropy,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [None]:
# Array-based training run: 4 epochs, batch size 64.
# NOTE(review): `X_train`, `y_train`, `X_val` and `y_val` are not defined in
# the visible part of this notebook — this cell fails on a fresh kernel.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=4,
    validation_data=(X_val, y_val),
    workers=1,
)

In [None]:
# Persist the current model under the name 'resnet50_best'.
# NOTE(review): when the cells run top to bottom, `model` at this point is the
# small Sequential CNN defined above, not the ResNet50V2-based model — confirm
# which model is meant to be saved under this name.
model.save('resnet50_best')

In [None]:
# Recompile with a much smaller SGD step (1e-5) and accuracy as the only
# metric.  `learning_rate` is the supported keyword; `lr` is only a
# deprecated alias.
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.00001),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=['accuracy'],
)

In [None]:
# Array-based training run: 4 epochs, batch size 16.
# NOTE(review): `X_train`, `y_train`, `X_val` and `y_val` are not defined in
# the visible part of this notebook — this cell fails on a fresh kernel.
history2 = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=4,
    validation_data=(X_val, y_val),
    workers=1,
)

In [None]:
# Array-based training run: 4 epochs, batch size 256, 16 workers.
# NOTE(review): `X_train`, `y_train`, `X_val` and `y_val` are not defined in
# the visible part of this notebook, and `history2` overwrites the result of
# the previous cell.
history2 = model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=4,
    validation_data=(X_val, y_val),
    workers=16,
)

In [None]:
# Histogram of `labels`.
# NOTE(review): `labels` is not defined in the visible part of this notebook.
plt.hist(labels)

In [None]:
# Histogram of the training labels.
# NOTE(review): `y_train` is not defined in the visible part of this notebook.
plt.hist(y_train)

In [None]:
# Histogram of the validation labels.
# NOTE(review): `y_val` is not defined in the visible part of this notebook.
plt.hist(y_val)