In [3]:
import sys
sys.path.append("../")
import pickle
import tensorflow as tf
import numpy as np
import nltk
import pandas as pd
import gensim.downloader
import gensim
import re
from typing import Optional
import swifter
from PythonCode.preprocess.common import load_data
from sklearn.model_selection import train_test_split
from keras.layers import Dense, GRU, AvgPool1D
from keras.models import Sequential
from sklearn.metrics import classification_report
# NLTK normalisers; complex_tranform_word uses them as fallbacks when a token
# has no direct GloVe entry.
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()
print("downloading pretrained embedding model.\nthis may take a while...")
# Pretrained word vectors loaded through gensim's downloader; the rest of the
# notebook queries this object with `word in glove_vectors` / `glove_vectors[word]`.
# The model name suggests 50-dimensional vectors — TODO confirm against EMBEDDING_SIZE.
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50')

downloading pretrained embedding model.
this may take a while...


In [37]:
# Tunable constants shared by the preprocessing and training cells.
EMBEDDING_SIZE = 50  # not referenced in the cells shown; presumably the GloVe vector width — TODO confirm
NUM_OF_SENTENCE_CHUNK = 3  # not referenced in the cells shown
MAX_LENGTH = 170  # not referenced in the cells shown
TEST_PART = 0.1  # fraction of the data held out as the test split
VALIDATION_PART = 0.1  # fraction of the training split held out for validation
MAX_SENTENCE_LENGTH = 70  # length each sentence vector is padded/truncated to
MAX_NUMBER_OF_SENTENCE = 45  # number of sentence rows each article is padded/truncated to
DATA_PATH = "../Data/C50"  # dataset root handed to get_datasets

In [39]:
from keras.layers import Masking

def tranform_word(word: str) -> Optional[np.ndarray]:
    """Look up the GloVe vector for a single token.

    The token is lower-cased and every character outside ``a-z`` is removed
    before the lookup.

    Args:
        word: Raw token text.

    Returns:
        The vector stored in ``glove_vectors`` for the cleaned token, or
        ``None`` when the cleaned token is not in the vocabulary.
    """
    cleaned = re.sub(r'[^a-z]', '', word.lower())
    return glove_vectors[cleaned] if cleaned in glove_vectors else None


def complex_tranform_word(word: str):
    """Look up a GloVe vector for a token, falling back to its lemma and stem.

    The direct lookup (``tranform_word``) is tried first. If it misses, the
    lemma and then the stem of the raw token are tried. If those miss too,
    the same two fallbacks are retried on the normalised token (lower-cased,
    non ``a-z`` characters removed), because the GloVe lookup itself works on
    that normalised form while the raw token may still carry capitals or
    punctuation.

    Args:
        word: Raw token text.

    Returns:
        The first vector found, or ``None`` when every attempt misses.
    """
    result = tranform_word(word)
    if result is not None:
        return result

    # Raw token first so every input that used to resolve still resolves to
    # the same vector; the normalised token is only an extra second chance.
    normalized = re.sub(r'[^a-z]', '', word.lower())
    attempts = [word]
    if normalized and normalized != word:
        attempts.append(normalized)

    for text in attempts:
        # Lemma before stem; the stem is only computed if the lemma misses.
        for reduce in (lemmatizer.lemmatize, stemmer.stem):
            token = reduce(text)
            if token in glove_vectors:
                return glove_vectors[token]
    return None


def pad_matrix(arr: np.ndarray, max_length: int) -> Optional[np.ndarray]:
    """Force a 2-D array to exactly ``max_length`` rows.

    Args:
        arr: Array whose first axis is padded or truncated.
        max_length: Required number of rows.

    Returns:
        ``None`` for an empty array; ``arr`` itself when it already has
        ``max_length`` rows; the first ``max_length`` rows when it is longer;
        otherwise a float array with zero rows appended at the bottom.
    """
    if arr.size == 0:
        return None

    missing_rows = max_length - arr.shape[0]
    if missing_rows == 0:
        return arr
    if missing_rows < 0:
        return arr[:max_length, :]

    filler = np.zeros((missing_rows, arr.shape[1]))
    return np.concatenate((arr, filler), axis=0, dtype=float)


def get_datasets(data_path: str = "../Data/C50") -> pd.DataFrame:
    """Load the C50 train and test folders and return them as one frame.

    Args:
        data_path: Directory that contains the ``C50train`` and ``C50test``
            sub-folders.

    Returns:
        A single DataFrame with the train rows followed by the test rows and
        a fresh 0..n-1 index.
    """
    df_test = load_data(f"{data_path}/C50test", 50)
    df_train = load_data(f"{data_path}/C50train", 50)
    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
    return pd.concat([df_train, df_test], ignore_index=True)


def preprocess_labels(y: pd.Series) -> np.ndarray:
    """One-hot encode labels and add a length-1 axis after the sample axis.

    Integer codes come from ``pd.Categorical(y)``, i.e. they are derived from
    the distinct values of this ``y`` only, and the one-hot width equals the
    number of distinct codes found.

    Args:
        y: Label values, one per sample.

    Returns:
        ``float32`` array: the one-hot matrix with an extra axis inserted at
        position 1.
    """
    codes = pd.Categorical(y).codes
    n_classes = np.unique(codes).size
    encoded = tf.keras.utils.to_categorical(codes, num_classes=n_classes, dtype='float32')
    return encoded[:, np.newaxis]


def pad_array(arr: np.ndarray, pad_size: int):  # TODO: reuse pad_matrix instead
    """Force a 1-D array to exactly ``pad_size`` elements.

    Args:
        arr: Array to pad or truncate.
        pad_size: Required number of elements.

    Returns:
        ``arr`` itself when it already has ``pad_size`` elements; its first
        ``pad_size`` entries when it is longer; otherwise a float array with
        trailing zeros appended.
    """
    shortfall = pad_size - arr.size
    if shortfall == 0:
        return arr
    if shortfall < 0:
        return arr[:pad_size, ]
    return np.concatenate((arr, np.zeros(shortfall)), dtype=float)


def article_level_preprocess_helper(text: str):
    """Turn one article into a fixed-size (sentences x words) matrix.

    The text is split into sentences and each sentence into tokens. Tokens
    without a GloVe vector are dropped. Every remaining token is reduced to
    the mean of its embedding components, so a sentence becomes a 1-D vector
    with one scalar per token, padded/truncated to ``MAX_SENTENCE_LENGTH``.
    Sentences with no recognised token are skipped.

    Args:
        text: Full article text.

    Returns:
        The sentence vectors stacked and padded/truncated to
        ``MAX_NUMBER_OF_SENTENCE`` rows, or ``None`` when no sentence
        produced a vector.
    """
    sentence_rows = []
    for sentence in nltk.sent_tokenize(text):
        vectors = [
            vec
            for vec in map(tranform_word, nltk.word_tokenize(sentence))
            if vec is not None
        ]
        if not vectors:
            continue
        # axis=1 averages across embedding components: one scalar per token.
        per_word_mean = np.array(vectors, dtype=float).mean(axis=1, dtype=float)
        sentence_rows.append(pad_array(per_word_mean, MAX_SENTENCE_LENGTH))
    return pad_matrix(np.array(sentence_rows), MAX_NUMBER_OF_SENTENCE)


def article_level_preprocess(df: pd.DataFrame, random_state: Optional[int] = None):
    """Split the dataset and convert it into model-ready arrays.

    Labels are one-hot encoded once, on the whole frame, *before* the split.
    Encoding each split separately would derive the class-to-column mapping
    (and the one-hot width) from whichever authors happen to land in that
    split, so train and test could disagree.

    Args:
        df: Frame with a ``book_text`` column (article text) and an
            ``author_name`` column (label).
        random_state: Optional seed forwarded to ``train_test_split`` for a
            reproducible split. ``None`` keeps the unseeded behaviour.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. Each ``X`` has shape
        ``(n_articles, MAX_NUMBER_OF_SENTENCE, MAX_SENTENCE_LENGTH)``; each
        ``y`` is the matching slice of ``preprocess_labels``' output.

    Raises:
        ValueError: If an article yields no embeddable sentence, since it
            cannot be represented as a feature matrix.
    """
    def helper(X):
        res = X.swifter.apply(article_level_preprocess_helper).reset_index(drop=True)
        # The per-article helper returns None when nothing could be embedded;
        # fail with a clear message instead of an opaque stacking error.
        empty = [position for position, matrix in enumerate(res) if matrix is None]
        if empty:
            raise ValueError(
                f"{len(empty)} article(s) produced no embeddable sentences "
                f"(first at position {empty[0]} of this split)"
            )
        return np.vstack(res).reshape((res.size, MAX_NUMBER_OF_SENTENCE, MAX_SENTENCE_LENGTH))

    labels = preprocess_labels(df["author_name"])
    X_train, X_test, y_train, y_test = train_test_split(
        df["book_text"], labels, test_size=TEST_PART, random_state=random_state
    )
    return helper(X_train), helper(X_test), y_train, y_test


In [6]:
# Load the C50 train and test folders into one combined DataFrame.
df = get_datasets(DATA_PATH)

In [40]:
# Split into train/test and convert articles and labels to arrays.
# The split is random and unseeded here, so results vary between runs.
X_train, X_test, y_train, y_test = article_level_preprocess(df)

Pandas Apply:   0%|          | 0/4500 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/500 [00:00<?, ?it/s]

In [43]:
# Article-level classifier: mask padding -> GRU over the sentence sequence ->
# average over all sentence positions -> softmax over 50 classes.
# Output shape is (batch, 1, 50), which matches the extra axis that
# preprocess_labels adds to the labels.
model = Sequential([
    Masking(mask_value=0., input_shape=(MAX_NUMBER_OF_SENTENCE, MAX_SENTENCE_LENGTH)),
    GRU(
        200,
        recurrent_dropout=0.2,
        return_sequences=True,
        recurrent_regularizer=tf.keras.regularizers.l1_l2(l1=0.001, l2=0.001),
    ),
    # Pool window spans every sentence position, leaving a single time step.
    AvgPool1D(pool_size=(MAX_NUMBER_OF_SENTENCE,)),
    # Optional, currently disabled: kernel_regularizer= tf.keras.regularizers.l1_l2(l1=0.001,l2=0.001)
    Dense(50, activation="softmax"),
])
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=["accuracy"])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_5 (Masking)          (None, 45, 70)            0         
_________________________________________________________________
gru_5 (GRU)                  (None, 45, 200)           163200    
_________________________________________________________________
average_pooling1d_5 (Average (None, 1, 200)            0         
_________________________________________________________________
dense_8 (Dense)              (None, 1, 50)             10050     
Total params: 173,250
Trainable params: 173,250
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Carve a validation set out of the training split.
# NOTE(review): this overwrites X_train / y_train with the reduced split, so
# the cell is not idempotent — re-running it shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=VALIDATION_PART)
# Stop training when val_loss has not improved for 30 consecutive epochs.
# NOTE(review): the fit cell trains for 50 epochs, so this rarely triggers.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=30)
# Base name used for the saved history file and the saved model.
model_name = "article_based_model"
# Disabled: checkpoint that would keep the model with the best val_accuracy.
# model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=f"./{model_name}-checkpoints",
#                                                                save_weights_only=False,
#                                                                monitor='val_accuracy', mode='max',
#                                                                save_best_only=True)

In [44]:
# Train for up to 50 epochs, evaluating on the validation split after each one.
# Requires X_val / y_val / callback from the validation-split cell to exist.
history = model.fit(x=X_train, y=y_train, epochs=50, shuffle=True,
                    batch_size=32, validation_data=(X_val, y_val), callbacks=[callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Persist the per-epoch metrics and the trained model.
# Only the plain `history.history` dict (metric name -> list of per-epoch
# values) is pickled: the History callback object itself holds a reference to
# the model, which is frequently unpicklable and is saved separately below.
with open(f"{model_name}-history", "wb") as file:
    pickle.dump(history.history, file)
model.save(model_name)

In [None]:
# Evaluate on the held-out test split.
y_pred = model.predict(X_test)
# classification_report takes (y_true, y_pred): ground truth goes first,
# otherwise precision and recall are swapped in the printed table.
# argmax over the last axis turns one-hot rows / softmax outputs into class ids;
# flatten drops the length-1 axis.
print(classification_report(y_test.argmax(axis=-1).flatten(), y_pred.argmax(axis=-1).flatten()))