Radi čitljivosti i organizacije koda, u ovoj svesci su izdvojene funkcije koje su zajedničke za binarnu i višelabelarnu klasifikaciju.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D, Flatten, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping

import pickle
from sklearn.utils import class_weight
from sklearn import metrics
from tensorflow.keras import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import backend as K
from tensorflow.keras import utils
from keras import metrics as kerasMet

In [2]:
def get_vocabulary(num_words, train_data, name):
    """Fit a Keras ``Tokenizer`` on the training comments and pickle it.

    Args:
        num_words: Forwarded to ``Tokenizer(num_words=...)``.
        train_data: Object indexable by ``'comment_text'``; that entry holds
            the texts the tokenizer is fitted on.
        name: File name appended to ``'data/'`` to form the pickle path.

    Returns:
        A ``(word_index, tokenizer)`` tuple: the fitted tokenizer's
        ``word_index`` and the tokenizer itself.
    """
    fitted = Tokenizer(num_words=num_words)
    fitted.fit_on_texts(train_data['comment_text'])

    # Save the fitted tokenizer to disk.
    target_path = 'data/' + name
    with open(target_path, 'wb') as out_file:
        pickle.dump(fitted, out_file)

    return fitted.word_index, fitted

In [3]:
def to_sequence(text, maxlen, tokenizer):
    """Turn texts into integer sequences and run them through ``pad_sequences``.

    Args:
        text: Texts handed to ``tokenizer.texts_to_sequences``.
        maxlen: Forwarded to ``pad_sequences(maxlen=...)``.
        tokenizer: Object providing ``texts_to_sequences``.

    Returns:
        The result of ``pad_sequences`` for the tokenized texts.
    """
    return preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(text),
        maxlen=maxlen,
    )

In [4]:
def create_embedding_matrix(word_index, embedding_dim, max_features=None,
                            glove_path='data/glove.6B.100d.txt'):
    """Build an embedding matrix from a GloVe-format text file.

    Each line of the file is expected to be a word followed by its
    whitespace-separated vector components. Row ``i`` of the result holds the
    vector of the word whose index in ``word_index`` is ``i``; rows for words
    that are not present in the file stay all-zero.

    Args:
        word_index: Mapping from word to integer row index.
        embedding_dim: Number of columns in the matrix; it must match the
            vector length stored in the file.
        max_features: Number of rows in the matrix. Words whose index is
            ``>= max_features`` are skipped. When ``None``, a module-level
            ``max_features`` is used if one exists (the behaviour of earlier
            versions of this function); otherwise the matrix is sized to hold
            the largest index in ``word_index``.
        glove_path: Path of the embeddings file.

    Returns:
        A ``numpy`` array of shape ``(max_features, embedding_dim)``.
    """
    if max_features is None:
        # Backward compatibility: this function used to read a global
        # ``max_features``; the parameter shadows it, so look it up explicitly.
        max_features = globals().get('max_features')
        if max_features is None:
            max_features = max(word_index.values(), default=0) + 1

    embedding_matrix = np.zeros((max_features, embedding_dim))

    # Fill the matrix while streaming the file so only the vectors that are
    # actually needed get parsed, instead of holding every vector in memory.
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines

            row = word_index.get(values[0])
            # Skip (rather than stop at) out-of-range indices so the result
            # does not depend on the iteration order of ``word_index``.
            if row is None or row >= max_features:
                continue

            embedding_matrix[row] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix

In [5]:
# Function that creates the model

def create_model(num_of_classes, num_of_features, embedding_dim,
                 kernel_size, pool_size, filters, dropout_rate, maxlen,
                 embedding_matrix):
    """Build a 1D-convolutional Keras model on top of a frozen embedding layer.

    The stack is: Embedding (non-trainable, initialised from
    ``embedding_matrix``), two Conv1D + MaxPooling1D pairs, Dropout,
    GlobalAveragePooling1D and a final Dense layer. The model summary is
    printed before the model is returned; the model is not compiled here.

    Args:
        num_of_classes: ``2`` selects a single sigmoid output unit; any other
            value selects ``num_of_classes`` softmax units.
        num_of_features: ``input_dim`` of the embedding layer.
        embedding_dim: ``output_dim`` of the embedding layer.
        kernel_size: Kernel size of both convolution layers.
        pool_size: Pool size of both max-pooling layers.
        filters: Number of filters of both convolution layers.
        dropout_rate: Rate of the dropout layer.
        maxlen: ``input_length`` of the embedding layer.
        embedding_matrix: Initial weights of the embedding layer.

    Returns:
        The constructed ``Sequential`` model.
    """
    # The output layer differs between the binary and the multi-class case.
    is_binary = num_of_classes == 2
    out_activation = 'sigmoid' if is_binary else 'softmax'
    out_units = 1 if is_binary else num_of_classes

    stack = [
        Embedding(input_dim=num_of_features,
                  output_dim=embedding_dim,
                  input_length=maxlen,
                  weights=[embedding_matrix],
                  trainable=False),
    ]

    # Two identical convolution + pooling stages.
    for _ in range(2):
        stack.append(Conv1D(filters=filters, kernel_size=kernel_size,
                            activation='relu', padding='same'))
        stack.append(MaxPooling1D(pool_size=pool_size))

    stack.append(Dropout(rate=dropout_rate))
    stack.append(GlobalAveragePooling1D())
    stack.append(Dense(out_units, activation=out_activation))

    model = Sequential(stack)
    model.summary()

    return model

In [6]:
# Funkcije za metrike koje se prate

def recall_m(y_true, y_pred):
    """Recall computed with Keras backend operations.

    True positives are the summed, rounded, ``[0, 1]``-clipped products
    ``y_true * y_pred``; possible positives are the summed, rounded, clipped
    ``y_true`` values. ``K.epsilon()`` is added to the denominator.
    """
    def count_ones(tensor):
        # Clip to [0, 1], round, then sum.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = count_ones(y_true * y_pred)
    actual_positives = count_ones(y_true)
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with Keras backend operations.

    True positives are the summed, rounded, ``[0, 1]``-clipped products
    ``y_true * y_pred``; predicted positives are the summed, rounded, clipped
    ``y_pred`` values. ``K.epsilon()`` is added to the denominator.
    """
    def count_ones(tensor):
        # Clip to [0, 1], round, then sum.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = count_ones(y_true * y_pred)
    flagged_positives = count_ones(y_pred)
    return hits / (flagged_positives + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from ``precision_m`` and ``recall_m``.

    Returns ``2 * (p * r) / (p + r + K.epsilon())`` where ``p`` and ``r`` are
    the precision and recall of the given tensors.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [7]:
# Funkcija koja racuna tezine za razlicite klase
# Dodelice vece tezine instancama manjinske klase

def get_weights(y_train):
    """Compute ``'balanced'`` class weights for the given training labels.

    Args:
        y_train: Array-like of class labels.

    Returns:
        Dict mapping each distinct label in ``y_train`` to the weight that
        ``sklearn.utils.class_weight.compute_class_weight`` assigns to it.
    """
    classes = np.unique(y_train)

    # ``classes`` and ``y`` are keyword-only in current scikit-learn releases;
    # passing them by name also works on older versions.
    weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=classes, y=y_train
    )

    # Key by the actual label rather than by position, so the mapping stays
    # correct when the labels are not exactly 0..n-1. ``tolist()`` yields
    # plain Python scalars as keys.
    return dict(zip(classes.tolist(), weights))

In [8]:
# Funkcija koja plotuje grafike za metrike tokom treniranja

def plot_graphs(history, metrics):
    """Plot training and validation curves for the first two given metrics.

    Draws one figure with two side-by-side subplots. For each metric name the
    series ``history.history[name]`` (red, "training") and
    ``history.history['val_' + name]`` (orange, "validation") are plotted
    against ``history.epoch``, then the figure is shown.

    Args:
        history: Object with a ``history`` dict of per-epoch series and an
            ``epoch`` list, as returned by Keras ``Model.fit``.
        metrics: Sequence of metric names; only the first two are plotted.
            Entries are converted with ``str`` before being used as keys.
    """
    # Use the recorded epoch numbers as x values so they always line up with
    # the length of the recorded series.
    epochs = history.epoch

    plt.figure(figsize=(10, 5))

    for position, metric in enumerate(metrics[:2], start=1):
        key = str(metric)

        plt.subplot(1, 2, position)
        plt.title(key)
        plt.plot(epochs, history.history[key], color='red', label='training')
        plt.plot(epochs, history.history['val_' + key], color='orange', label='validation')
        plt.xlabel('epochs')
        plt.legend(loc='best')

    plt.show()

In [9]:
#METRICS = [
#      metrics.TruePositives(name='tp'),
 #     metrics.FalseNegatives(name='fn'),
     # metrics.TrueNegatives(name='tn'),
     # metrics.FalsePositives(name='fp'), 
     # metrics.BinaryAccuracy(name='accuracy'),
  #    metrics.Precision(name='precision'),
   #   metrics.Recall(name='recall')]

# metrics.AUC(name='auc'),
# metrics.AUC(name='prc', curve='PR'), # precision-recall curve
