In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [3]:
import pickle
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Bidirectional, GlobalAveragePooling1D, Flatten, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras import preprocessing
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import metrics
from keras.callbacks import EarlyStopping
from keras import backend as K
from tensorflow.keras import utils

from sklearn.utils import class_weight
import sklearn
import sklearn.metrics



Exception: File `'SharedFunctions.py'` not found.

In [None]:
# Load the dataset and drop the 'Unnamed: 0' column
# (presumably an index column saved together with the CSV -- confirm).
data = pd.read_csv('data/data_longer.csv').drop(columns=['Unnamed: 0'])
data.shape

### 1. Podela na trening, test, i validacioni skup

In [None]:
# Split sizes copied from the previous notebook.
train_size = 159571 - 96
test_size = 63978
validation_size = int(0.2 * train_size)
train_size -= validation_size

# Rows are consumed in order: [validation | train | test].
train_end = validation_size + train_size
validation_data = data.iloc[:validation_size]
train_data = data.iloc[validation_size:train_end]
test_data = data.iloc[train_end:]

len(train_data), len(test_data), len(validation_data)

In [None]:
# Label column of each split, in the order train / test / validation.
y_train, y_test, y_validation = (
    split['target'] for split in (train_data, test_data, validation_data)
)

In [None]:
# Shape of the training rows whose target is 0.
train_data.loc[train_data['target'] == 0].shape

In [None]:
# Shape of the training rows whose target is 1.
train_data.loc[train_data['target'] == 1].shape

### 2. Tokenizacija teksta i transformacija u vektore

In [None]:
# Vocabulary size: keep only the 20000 most frequent words
# (passed to the Tokenizer as num_words).
max_features = 20000

# Maximum sequence length: every comment is padded/truncated to this many tokens.
max_len = 100

Instance će biti podeljene na reči (tokene), zadržaće se `max_features` najfrekventnijih reči, a zatim će svaka instanca biti pretvorena u sekvencu celobrojnih indeksa reči, dopunjenu ili skraćenu na dužinu `max_len`.

In [None]:
# Build the vocabulary from the training comments only; num_words limits the
# tokenizer to the max_features most frequent words when texts are converted.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_data['comment_text'])

# Vocabulary: mapping word -> integer index
word_index = tokenizer.word_index

# Number of unique tokens
len(word_index)

In [None]:
# Persist the fitted tokenizer so the same vocabulary can be reloaded later.
with open('data/tokenizer_undersampled.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
def to_sequence(text, maxlen):
    """Turn raw texts into fixed-length sequences of word indices.

    The notebook-level ``tokenizer`` maps every text to a list of integer
    word indices; ``pad_sequences`` (with its default settings) then pads or
    truncates each list to ``maxlen`` entries.

    Args:
        text: iterable of strings to convert.
        maxlen: length of every returned sequence.

    Returns:
        Whatever ``pad_sequences`` returns for the tokenized texts.
    """
    return preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(text),
        maxlen=maxlen,
    )

In [None]:
# Data for binary classification: padded index sequences for every split,
# built in the order train / test / validation.
X_train, X_test, X_validation = (
    to_sequence(split['comment_text'], max_len)
    for split in (train_data, test_data, validation_data)
)

In [None]:
# Kreiranje embedding matrice na osnovu GloVe vektora

In [None]:
def create_embedding_matrix(word_index, embedding_dim,
                            glove_path='data/glove.6B.100d.txt', num_words=None):
    """Build an embedding matrix from pre-trained GloVe vectors.

    Row ``i`` of the result holds the GloVe vector of the word whose index in
    ``word_index`` is ``i``. Rows for words that do not occur in the GloVe
    file, and rows that no word maps to, stay all-zero.

    Args:
        word_index: mapping word -> integer index (e.g. ``tokenizer.word_index``).
        embedding_dim: length of each GloVe vector; must match the file.
        glove_path: text file with one ``word v1 v2 ... vN`` entry per line.
        num_words: number of rows of the matrix; words with an index
            ``>= num_words`` are ignored. Defaults to the notebook-level
            ``max_features``.

    Returns:
        ``numpy.ndarray`` of shape ``(num_words, embedding_dim)``.

    Raises:
        ValueError: if a vector needed for the matrix does not have
            ``embedding_dim`` components.
    """
    if num_words is None:
        # Backward-compatible default: the vocabulary size defined in the notebook.
        num_words = max_features

    # Keep every word whose index fits in the matrix. Filtering (instead of
    # stopping at the first out-of-range index) does not rely on the mapping
    # being ordered by index.
    rows = {word: i for word, i in word_index.items() if i < num_words}

    embedding_matrix = np.zeros((num_words, embedding_dim))

    # Stream the file and copy only the vectors that are needed, so the whole
    # GloVe table never has to be held in memory.
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines

            row = rows.get(values[0])
            if row is None:
                continue

            coefs = np.asarray(values[1:], dtype='float32')
            if coefs.shape[0] != embedding_dim:
                raise ValueError(
                    'GloVe vector for %r has %d components, expected %d'
                    % (values[0], coefs.shape[0], embedding_dim)
                )
            embedding_matrix[row] = coefs

    return embedding_matrix

In [None]:
# Dimensionality of the pre-trained vectors (glove.6B.100d -> 100 components).
# This is a different quantity from max_len (the sequence length), which only
# happens to have the same value.
embedding_dim = 100
embedding_matrix = create_embedding_matrix(word_index, embedding_dim)

### 3. Modeli

Model će biti isti za binarnu i višeklasnu klasifikaciju. Jedina razlika je u broju jedinica i aktivacionoj funkciji poslednjeg sloja mreže. Funkcija koja kreira model ima parametar za broj klasa.

In [None]:
# Function that builds the model

def create_model(num_of_classes, num_of_features, embedding_dim, 
                kernel_size, pool_size, filters, dropout_rate, maxlen,
                embedding_matrix):
    """Build a bidirectional-LSTM text classifier on frozen pre-trained embeddings.

    Architecture: Embedding (non-trainable, initialised from
    ``embedding_matrix``) -> Bidirectional LSTM(64) -> Dropout -> Dense(64,
    relu) -> output layer. The model summary is printed before returning.

    Args:
        num_of_classes: 2 selects a single sigmoid output unit; any other
            value selects a softmax layer with ``num_of_classes`` units.
        num_of_features: vocabulary size (``input_dim`` of the embedding).
        embedding_dim: length of each embedding vector.
        kernel_size, pool_size, filters: accepted so existing calls keep
            working; not used by this architecture.
        dropout_rate: rate of the Dropout layer after the LSTM.
        maxlen: length of the input sequences.
        embedding_matrix: initial embedding weights of shape
            ``(num_of_features, embedding_dim)``.

    Returns:
        The (uncompiled) ``Sequential`` model.
    """
    # The output layer depends on whether the task is binary or multi-class:
    # both the activation function and the number of units differ.
    if num_of_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_of_classes

    model = Sequential([

        Embedding(input_dim = num_of_features,
                 output_dim = embedding_dim,
                 input_length = maxlen,
                 weights = [embedding_matrix],
                 trainable = False),

        # return_sequences=False: only the final LSTM state is passed on, so
        # the Dense head yields ONE prediction per input sequence. Returning
        # the full sequence would make the model output (batch, maxlen, units),
        # which does not line up with one label per comment.
        Bidirectional(LSTM(64, return_sequences=False)),

        Dropout(dropout_rate),
        Dense(64, activation='relu'),
        Dense(units, activation=activation)
    ])

    model.summary()

    return model

In [None]:
# Metric functions tracked during training

def recall_m(y_true, y_pred):
    """Recall: rounded true positives divided by rounded actual positives.

    Both products and labels are clipped to [0, 1] and rounded before being
    summed; ``K.epsilon()`` in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision: rounded true positives divided by rounded predicted positives.

    Products and predictions are clipped to [0, 1] and rounded before being
    summed; ``K.epsilon()`` in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    numerator = p * r
    denominator = p + r + K.epsilon()
    return 2 * (numerator / denominator)

In [None]:
# Function that computes the weights of the individual classes.
# Instances of the minority class receive larger weights.

def get_weights(y_train):
    """Return balanced class weights keyed by class label.

    Args:
        y_train: array-like of training labels.

    Returns:
        dict mapping each distinct label in ``y_train`` to its 'balanced'
        weight as computed by scikit-learn, ready to be passed as the
        ``class_weight`` argument of ``fit``.
    """
    classes = np.unique(y_train)

    # classes / y are keyword-only in current scikit-learn releases; passing
    # them by keyword works on older releases as well.
    weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=classes, y=y_train
    )

    # Key by the actual label rather than by position, so labels that are not
    # exactly 0..n-1 still get their own weight. tolist() yields plain Python
    # scalars instead of numpy ones.
    return dict(zip(classes.tolist(), np.asarray(weights).tolist()))

In [None]:
# Function that plots the metrics recorded during training

def plot_graphs(history, metrics):
    """Plot training and validation curves, one subplot per metric.

    Args:
        history: object with a ``history`` dict (metric name -> list of
            per-epoch values) and an ``epoch`` list, as returned by ``fit``.
        metrics: names of the metrics to plot (e.g. ``['f1_m', 'loss']``).
            The validation curve is read from ``'val_' + name`` and is
            skipped when that key is absent.
    """
    # Coerce every name once so the training and validation lookups agree.
    names = [str(metric) for metric in metrics]

    plt.figure(figsize=(10, 5))

    for position, name in enumerate(names, start=1):
        train_values = history.history[name]
        val_values = history.history.get('val_' + name)

        # Use the recorded epoch numbers as x-axis; fall back to a plain
        # 0..n-1 range if they do not line up with the recorded values.
        if len(history.epoch) == len(train_values):
            epochs_axis = history.epoch
        else:
            epochs_axis = range(len(train_values))

        plt.subplot(1, len(names), position)
        plt.title(name)
        plt.plot(epochs_axis, train_values, color='red', label='training')
        if val_values is not None:
            plt.plot(epochs_axis, val_values, color='orange', label='validation')
        plt.xlabel('epochs')
        plt.legend(loc='best')

    plt.show()

In [None]:
# Hyperparameters passed to create_model when the binary model is built.
kernel_size = 3
pool_size = 3
filters = 64
dropout_rate = 0.3

In [None]:
# Build the binary classifier (num_of_classes=2 selects the sigmoid output).
binary_model = create_model(
    num_of_classes=2,
    num_of_features=max_features,
    embedding_dim=100,
    kernel_size=kernel_size,
    pool_size=pool_size,
    filters=filters,
    dropout_rate=dropout_rate,
    maxlen=max_len,
    embedding_matrix=embedding_matrix,
)

In [None]:
# Learning rate handed to the Adam optimizer when the model is compiled.
lr = 0.0001

In [None]:
# Compile with binary cross-entropy and track recall, precision and F1.
binary_model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr),
    metrics=[recall_m, precision_m, f1_m],
)
# multiclass_model.compile(loss='categorical_crossentropy', optimizer=Adam(lr), metrics=[recall_m, precision_m, f1_m])

In [None]:
# Class weights computed from the training labels; passed to fit() as class_weight.
weights_binary = get_weights(y_train)
#weights_multi = get_weights(y_train_multi)

### 4. Treniranje modela

In [None]:
# Training schedule: number of passes over the training set and mini-batch size.
epochs = 10
batch_size = 64

In [None]:
# Train the binary classifier with class weighting and report the wall-clock time.
start = time.time()

binary_history = binary_model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_validation, y_validation),
    class_weight=weights_binary,
)

print('Trajanje obucavanja modela binarne klasifikacije: ', time.time() - start)

In [None]:
# Training vs. validation curves for the F1 metric and the loss.
plot_graphs(binary_history, ['f1_m', 'loss'])

In [None]:
# Loss and compiled metrics on the held-out test split.
scores = binary_model.evaluate(X_test, y_test)

In [None]:
# Raw model outputs for the test split.
y_pred = binary_model.predict(X_test)

In [None]:
# The binary model ends in a single sigmoid unit, so its outputs are
# probabilities: threshold them at 0.5 to obtain class labels. (argmax is only
# meaningful for multi-unit softmax outputs; over a single column it is always 0.)
y_pred_labels = (np.asarray(y_pred).reshape(-1) > 0.5).astype(int)
sklearn.metrics.confusion_matrix(y_test, y_pred_labels)