In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D, Flatten, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

import pickle
from sklearn.utils import class_weight
from sklearn import metrics
from tensorflow.keras import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import backend as K
from tensorflow.keras import utils
import seaborn as sns

from keras import metrics as kerasMet

In [2]:
# Builds the tokenizer and the word vocabulary

def get_vocabulary(num_words, text, name):
    """Fit a tokenizer on ``text``, persist it, and return the vocabulary.

    Args:
        num_words: forwarded to ``Tokenizer(num_words=...)``.
        text: the texts handed to ``Tokenizer.fit_on_texts``.
        name: file name; the fitted tokenizer is pickled to ``'data/' + name``.

    Returns:
        A ``(word_index, tokenizer)`` tuple, where ``word_index`` is the
        fitted tokenizer's ``word_index`` attribute.
    """
    fitted = Tokenizer(num_words=num_words)
    fitted.fit_on_texts(text)

    # Save the tokenizer so the same vocabulary can be reloaded later.
    target_path = 'data/' + name
    with open(target_path, 'wb') as handle:
        pickle.dump(fitted, handle)

    return fitted.word_index, fitted

In [3]:
# Converts texts to padded sequences

def to_sequence(text, maxlen, tokenizer):
    """Turn ``text`` into sequences padded to ``maxlen``.

    Args:
        text: the texts passed to ``tokenizer.texts_to_sequences``.
        maxlen: forwarded to ``pad_sequences(maxlen=...)``.
        tokenizer: object exposing ``texts_to_sequences``.

    Returns:
        Whatever ``preprocessing.sequence.pad_sequences`` returns for the
        tokenized texts.
    """
    return preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(text),
        maxlen=maxlen,
    )

In [4]:
# Builds the embedding matrix from GloVe vectors

def create_embedding_matrix(word_index, embedding_dim, num_words=None, glove_path=None):
    """Build an embedding matrix whose row ``i`` is the GloVe vector of word ``i``.

    Args:
        word_index: dict mapping word -> integer index (presumably
            ``Tokenizer.word_index`` -- confirm against the caller).
        embedding_dim: length of each embedding vector; also selects the
            default GloVe file.
        num_words: number of rows in the matrix. Words whose index is
            ``>= num_words`` are skipped. When ``None`` the notebook-level
            global ``max_features`` is used, as the original code did.
        glove_path: path of the GloVe text file. Defaults to
            ``'../glove.6B.<embedding_dim>d.txt'``.

    Returns:
        ``numpy`` array of shape ``(num_words, embedding_dim)``; rows of words
        that are absent from the GloVe file stay all-zero.
    """
    if num_words is None:
        # Backward-compatible fallback to the global the original relied on.
        num_words = max_features
    if glove_path is None:
        glove_path = '../glove.6B.' + str(embedding_dim) + 'd.txt'

    embedding_matrix = np.zeros((num_words, embedding_dim))

    # Stream the file and keep only the vectors that are actually needed,
    # instead of materialising the whole GloVe table in a dict first.
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline).
                continue

            row = word_index.get(values[0])
            # Skip (rather than stop at) out-of-range indices so the result
            # does not depend on the iteration order of ``word_index``.
            if row is None or row >= num_words:
                continue

            embedding_matrix[row] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix

In [5]:
# Model factory. Parameters: number of classes (2 or 6), vocabulary size,
# embedding dimension, convolution-layer settings, maximum sequence length,
# Adam learning rate, the embedding matrix, and the number of extra dense
# layers placed before the output layer.

# The network is an embedding layer, then two convolution + pooling stages,
# followed by a configurable number of plain Dense layers (different depths
# were tried).

def create_model(num_of_classes, num_of_features, embedding_dim, 
                kernel_size, pool_size, filters, dropout_rate, maxlen,
                lr, embedding_matrix, layers):
    """Build, summarise and compile the convolutional text classifier.

    Args:
        num_of_classes: 2 gives a single output unit; any other value gives
            ``num_of_classes`` output units.
        num_of_features: ``input_dim`` of the embedding layer.
        embedding_dim: ``output_dim`` of the embedding layer.
        kernel_size: kernel size of both ``Conv1D`` layers.
        pool_size: pool size of both ``MaxPooling1D`` layers.
        filters: filters of the first convolution; the second uses twice as many.
        dropout_rate: rate of the ``Dropout`` layer.
        maxlen: ``input_length`` of the embedding layer.
        lr: passed positionally to ``Adam``.
        embedding_matrix: initial, non-trainable embedding weights.
        layers: how many ``Dense(10, relu)`` layers precede the output layer.

    Returns:
        The compiled ``Sequential`` model (its summary is printed as a side
        effect).
    """
    # Binary classification needs one sigmoid unit; otherwise one unit per class.
    units = 1 if num_of_classes == 2 else num_of_classes

    stack = [
        Embedding(input_dim=num_of_features,
                  output_dim=embedding_dim,
                  input_length=maxlen,
                  weights=[embedding_matrix],
                  trainable=False),
        Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', padding='same'),
        MaxPooling1D(pool_size=pool_size),
        Conv1D(filters=filters * 2, kernel_size=kernel_size, activation='relu', padding='same'),
        MaxPooling1D(pool_size=pool_size),
        Dropout(rate=dropout_rate),
        GlobalAveragePooling1D(),
        # NOTE(review): Flatten directly after global pooling looks redundant;
        # kept so the layer layout matches the original model.
        Flatten(),
    ]
    stack.extend(Dense(10, activation='relu') for _ in range(layers))
    stack.append(Dense(units, activation='sigmoid'))

    model = Sequential(stack)
    model.summary()

    optimizer = Adam(lr)
    tracked_metrics = [
        kerasMet.Precision(name='precision'),
        kerasMet.Recall(name='recall'),
        kerasMet.AUC(name='auc'),
        f1_m,
    ]
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=tracked_metrics)

    return model

In [6]:
# Computes per-class weights
# Instances of the minority class receive larger weights

def get_weights(y_train, n_classes):
    """Return a ``{class_index: weight}`` dict for the training labels.

    Args:
        y_train: training labels.
        n_classes: 2 selects sklearn's ``'balanced'`` class weights; any other
            value delegates to ``get_weights_multi``.

    Returns:
        dict keyed by position (binary case) or whatever
        ``get_weights_multi`` returns.
    """
    if n_classes != 2:
        return get_weights_multi(y_train)

    balanced = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train),
        y=y_train,
    )
    return {position: value for position, value in enumerate(balanced)}

In [7]:
def get_weights_multi(data, n_classes=6):
    """Compute balanced per-label weights for a multi-label indicator matrix.

    A label's weight is ``n_samples / (n_classes * freq)``, where ``freq`` is
    the number of rows whose entry for that label is non-zero. Labels that
    never occur get weight ``1``.

    Args:
        data: 2-D array-like with one row per sample; only the first
            ``n_classes`` columns are inspected.
        n_classes: number of label columns to weight (defaults to 6, the
            value that was previously hard-coded).

    Returns:
        dict mapping column index ``0..n_classes-1`` to its weight.

    Raises:
        ValueError: if ``data`` is non-empty but does not provide at least
            ``n_classes`` columns.
    """
    label_matrix = np.asarray(data)
    n_samples = label_matrix.shape[0]

    if n_samples == 0:
        # Nothing to count: every label falls back to the neutral weight.
        return {index: 1 for index in range(n_classes)}

    if label_matrix.ndim != 2 or label_matrix.shape[1] < n_classes:
        raise ValueError(
            'data must be 2-D with at least %d label columns, got shape %r'
            % (n_classes, label_matrix.shape)
        )

    # Column-wise count of non-zero entries; tolist() yields plain Python ints
    # so the resulting weights are plain Python floats.
    class_count = np.count_nonzero(label_matrix[:, :n_classes], axis=0).tolist()

    return {
        index: (n_samples / (n_classes * freq) if freq > 0 else 1)
        for index, freq in enumerate(class_count)
    }

In [8]:
def randomized_search_cv(X, y, weights, embedding_matrix, random_state=None):
    """Run a randomized hyper-parameter search over ``create_model``.

    Five parameter settings are sampled and evaluated with 5-fold
    cross-validation, scored by micro-averaged F1.

    Args:
        X: training inputs passed to ``RandomizedSearchCV.fit``.
        y: training targets passed to ``RandomizedSearchCV.fit``.
        weights: forwarded to ``fit`` as ``class_weight``.
        embedding_matrix: passed through to ``create_model`` as the only
            candidate for its ``embedding_matrix`` argument.
        random_state: seed for the parameter sampling. ``None`` (the default)
            keeps the previous unseeded behaviour; pass an int to make the
            sampled configurations reproducible.

    Returns:
        The fitted ``RandomizedSearchCV`` instance.
    """
    # Candidate values for every argument of the model factory; arguments
    # with a single candidate are effectively fixed.
    # NOTE(review): num_of_features / embedding_dim presumably have to match
    # the shape of embedding_matrix -- confirm against the caller.
    params = {'num_of_classes' : [2],
              'num_of_features' : [20000],
              'embedding_dim' : [100],
              'kernel_size' : [1, 2, 3],
              'pool_size' : [3, 5],
              'filters' : [32, 64, 128],
              'dropout_rate' : [0.2, 0.4, 0.5],
              'maxlen' : [100],
              'lr' : [0.00001, 0.0001, 0.001],
              'embedding_matrix' : [embedding_matrix],
              'layers' : [0, 1, 2]
              }

    model_random = KerasClassifier(build_fn=create_model, epochs=10, batch_size=128, verbose=False)

    randomized_cv = RandomizedSearchCV(estimator=model_random,
                                       param_distributions=params,
                                       cv=5,
                                       verbose=1,
                                       n_iter=5,
                                       scoring='f1_micro',
                                       random_state=random_state)
    randomized_cv.fit(X, y, class_weight=weights)

    return randomized_cv

In [9]:
# Plots the training curves of the tracked metrics

def plot_graphs(history, metrics):
    """Plot training vs. validation curves for the first three metrics.

    Draws a 1x3 grid; panel ``k`` shows ``history.history[name]`` and
    ``history.history['val_' + name]`` for ``name = str(metrics[k])``.

    Args:
        history: object exposing ``history`` (dict of per-epoch value lists)
            and ``epoch`` (list of epoch indices) -- presumably a Keras
            ``History``; confirm against the caller.
        metrics: sequence with at least three metric names. Entries are
            converted with ``str()`` for both the training and the
            validation key.

    Raises:
        IndexError: if ``metrics`` has fewer than three entries.
        KeyError: if a metric (or its ``val_`` counterpart) was not recorded.
    """
    plt.figure(figsize=(15, 5))

    # The x axis is shared by all panels: 0 .. last recorded epoch.
    epochs = range(0, history.epoch[-1] + 1)

    for panel in range(3):
        metric = str(metrics[panel])

        plt.subplot(1, 3, panel + 1)
        plt.title(metric)
        plt.plot(epochs, history.history[metric], color='skyblue', label='training')
        plt.plot(epochs, history.history['val_' + metric], color='pink', label='validation')
        plt.xlabel('epochs')
        plt.legend(loc='best')

    plt.show()

In [10]:
# Binary classification: shows the same confusion matrix twice - once with counts, once with percentages
# Multi-label: shows one confusion matrix per class (six in total)

def plot_cf(cf, name, labels):
    """Draw confusion-matrix heatmaps, save the figure and show it.

    Args:
        cf: for ``name == 'binary'`` a single confusion matrix; otherwise an
            iterable of matrices, one per label.
        name: ``'binary'`` selects the two-panel layout; anything else selects
            the 2x3 per-label grid.
        labels: panel titles for the per-label grid (unused for ``'binary'``).

    Side effects:
        Writes ``'plots and images/confusion_matrix_binary.png'`` or
        ``'plots and images/confusion_matrix_multi.png'``.
    """
    if name != 'binary':
        plt.figure(figsize=(20, 10))
        for position, matrix in enumerate(cf):
            axis = plt.subplot(2, 3, position + 1)

            # heatmap draws on the current axes, i.e. the subplot just created
            sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
            axis.set_title(labels[position])
            axis.set_xlabel('Predicted label')
            axis.set_ylabel('True label')

        plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9,
                            wspace=0.4, hspace=0.4)

        plt.savefig('plots and images/confusion_matrix_multi.png')
        plt.show()
        return

    def _annotate(axis):
        # Axis captions and class names shared by both binary panels.
        axis.set_xlabel('Predicted label')
        axis.set_ylabel('True label')
        axis.xaxis.set_ticklabels(['untoxic', 'toxic'])
        axis.yaxis.set_ticklabels(['untoxic', 'toxic'])

    plt.figure(figsize=(14, 5))

    # Left panel: absolute counts.
    counts_axis = plt.subplot(1, 2, 1)
    counts_axis.set_title('Matrica konfuzije')
    sns.heatmap(cf, annot=True, fmt='d', cmap='Blues', linecolor='blue')
    _annotate(counts_axis)

    # Right panel: every cell as a share of all samples.
    share_axis = plt.subplot(1, 2, 2)
    share_axis.set_title('Matrica konfuzije - procenti')
    sns.heatmap(cf / np.sum(cf), annot=True, fmt='.2%', cmap='Blues')
    _annotate(share_axis)

    plt.savefig('plots and images/confusion_matrix_binary.png')
    plt.show()

In [14]:
#METRICS = [
#      kerasMet.TruePositives(name='tp'),
#      kerasMet.FalseNegatives(name='fn'),
#      kerasMet.TrueNegatives(name='tn'),
#      kerasMet.FalsePositives(name='fp'), 
#      kerasMet.BinaryAccuracy(name='accuracy'),
#      kerasMet.Precision(name='precision'),
#      kerasMet.Recall(name='recall'),
#      kerasMet.AUC(name='auc')]
#      kerasMet.AUC(name='prc', curve='PR'), # precision-recall curve

# Metric functions that are tracked during training

def recall_m(y_true, y_pred):
    """Recall of the given tensors: true positives / actual positives.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded before being summed; ``K.epsilon()`` is
    added to the denominator to avoid division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the given tensors: true positives / predicted positives.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded before being summed; ``K.epsilon()`` is
    added to the denominator to avoid division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator so the result is defined
    when both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

In [12]:
# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN)
# F1 = 2*Precision*Recall / (Precision + Recall)

def get_prec_rec_f1(cf, labels):
    """Compute per-label precision, recall and F1 from 2x2 confusion matrices.

    Each matrix is read as ``[[TN, FP], [FN, TP]]``: ``matrix[1][1]`` is used
    as TP, ``matrix[0][1]`` as FP and ``matrix[1][0]`` as FN.

    A metric whose denominator is zero (e.g. no predicted positives) is
    reported as ``0.0`` instead of raising ``ZeroDivisionError`` / producing
    NaN. F1 is computed from the raw counts, not from the already rounded
    precision and recall, so it does not accumulate rounding error.

    Args:
        cf: iterable of 2x2 confusion matrices, one per label.
        labels: label names; ``labels[i]`` names the i-th matrix.

    Returns:
        ``(precs, recalls, f1_scores)`` -- three dicts keyed by label, values
        rounded to two decimals.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios (empty denominator) are reported as 0.0.
        return numerator / denominator if denominator else 0.0

    precs = {}
    recalls = {}
    f1_scores = {}

    for index, matrix in enumerate(cf):
        label = labels[index]
        tp = matrix[1][1]
        fp = matrix[0][1]
        fn = matrix[1][0]

        precs[label] = round(_ratio(tp, tp + fp), 2)
        recalls[label] = round(_ratio(tp, tp + fn), 2)
        # 2PR / (P + R) expressed directly in counts.
        f1_scores[label] = round(_ratio(2 * tp, 2 * tp + fp + fn), 2)

    return precs, recalls, f1_scores

In [13]:
# number of instances per class:
def get_label_counts(data, labels):
    """Count, for every label, the rows of ``data`` where that column equals 1.

    Args:
        data: table indexable by column name and by boolean mask
            (presumably a pandas DataFrame -- confirm against the caller).
        labels: iterable of column names to count.

    Returns:
        dict mapping each label to its number of matching rows.
    """
    return {label: len(data[data[label] == 1]) for label in labels}