In [1]:
import os
import pickle
import random as rn
import warnings
from multiprocessing import cpu_count

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
from IPython.display import display
from sklearn.metrics import accuracy_score, f1_score, hamming_loss, average_precision_score, \
    ndcg_score, label_ranking_average_precision_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils import shuffle
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import Constant, GlorotUniform
from tensorflow.keras.layers import Dense, Dropout, Activation, Embedding, Conv1D, \
    GlobalMaxPooling1D, SpatialDropout1D, LSTM, GRU, Flatten, MaxPooling1D, \
    BatchNormalization, ReLU
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import data
import preprocessing

# Single seed shared by every source of randomness in this notebook.
seed = 42


def reset_seed():
    """Re-seed Python's ``random``, NumPy and TensorFlow with the notebook-wide ``seed``.

    Also exports ``PYTHONHASHSEED`` and ``TF_DETERMINISTIC_OPS`` into the process
    environment. Call it right before any stochastic step that should be repeatable.

    NOTE(review): ``PYTHONHASHSEED`` is normally read when the interpreter starts, so
    setting it here most likely only affects child processes -- confirm if hash
    determinism inside this kernel matters.
    """
    os.environ.update({
        "PYTHONHASHSEED": str(seed),
        "TF_DETERMINISTIC_OPS": "1",
    })
    # The three generators are independent of each other, so the order is irrelevant.
    for seed_fn in (rn.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

## Extracting the data

In [2]:
# The corpus is read from a pickled DataFrame cache. To rebuild that cache from the raw archive:
#   data.extract_data(extraction_dir="train", data_dir="data", data_zip_name="reuters-training-corpus.zip")
#   df = data.get_docs_labels("train/REUTERS_CORPUS_2")
#   df.to_pickle("train/data.pkl")
df = pd.read_pickle("train/data.pkl")

# Raw document texts, and one label row per document stacked into a 2-D array.
original_docs = df["doc"].values
labels = np.array(df["labels"].tolist())
n_labels = len(data.CODEMAP)

# Sanity check: array shapes, then one sample document together with its label row.
print(original_docs.shape, labels.shape, original_docs[2], labels[2], sep="\n")

(299773,)
(299773, 126)
Toronto stocks end higher after volatile session. CHANGE				    CHANGE TSE	  5900.37    +50.15   HI 5900.37	    LO  5840.29 DJI	  6611.05    +27.57   GOLD (LONDON)   US$350.00 +1.90 FTSE100    4248.10    -64.80   GOLD (NY-COMEX) US$354.80 +0.70 NIKKEI    17869.59   -133.81   LME CASH NICKEL US$7659   +99.0 CANDLR	1.3883		 LME CASH ALUM   US$1602.0  -4.0 CAN 30-YR   107.41     -0.15   BRENT CRUDE     US$19.09  -0.27 --------------------MARKET COMMENT---------------------------- * Toronto stocks ended higher on Tuesday, buoyed by strength in golds and banking * Computer problems due to heavy trading in Bre-X Minerals hampered session * 84 million shares traded Toronto's key stock index ended higher on Tuesday as the saga of Bre-X Minerals Ltd and its Indonesian gold find continued to dominate Canada's biggest stock market. The TSE 300 Index climbed 50.15 points to close at 5900.37 in heavy turnover of 84.07 million shares worth C$1.4 billion. But the overall marke

## Preprocessing the documents

In [3]:
# Load the cached, already-preprocessed corpus. Pickle can execute arbitrary code on load,
# so only ever point this at a cache file produced locally.
with open("train/preprocessed_docs_lemmatized_no_sw.pkl", "rb") as f:
    preprocessed_docs = pickle.load(f)

# To regenerate the cache instead of loading it, uncomment the lines below.
# NOTE(review): this snippet writes "train/preprocessed_docs.pkl", which is NOT the file
# loaded above -- confirm which preprocessing settings produced the "_lemmatized_no_sw" cache.
# preprocessed_docs = preprocessing.preprocess_corpus(original_docs)
# with open("train/preprocessed_docs.pkl", "wb") as f:
#     pickle.dump(preprocessed_docs, f)

# Show one preprocessed document as a sanity check.
print(preprocessed_docs[2])

toronto stock end high volatile session change change tse 5900.37 +50.15 hi 5900.37 lo 5840.29 dji 6611.05 +27.57 gold london us$ 350.00 +1.90 ftse100 4248.10 -64.80 gold ny comex us$ 354.80 +0.70 nikkei 17869.59 -133.81 lme cash nickel us$ 7659 +99.0 candlr 1.3883 lme cash alum us$ 1602.0 -4.0 30-yr 107.41 -0.15 brent crude us$ 19.09 -0.27 --------------------market comment---------------------------- toronto stock end higher tuesday buoy strength gold banking computer problem heavy trading bre x minerals hamper session 84 million share trade toronto key stock index end higher tuesday saga bre x minerals ltd indonesian gold find continue dominate canada big stock market tse 300 index climb 50.15 point close 5900.37 heavy turnover 84.07 million share worth c$ 1.4 billion overall market mix decline issue narrowly outpace advance 476 464 298 issue flat frantic trading bre x collapse tse computer trading system earlier day force exchange halt trading stock market close share calgary base 

## Representing the documents

In [4]:
# Corpus variant that the tokenizer and Word2Vec cells below operate on.
docs = preprocessed_docs
# Vocabulary cap handed to the Keras Tokenizer (num_words) and used as the embedding input_dim.
# The tokenizer cell treats None as "derive the size from the fitted word index".
n_vocabulary = 5000

### As token index sequences

In [5]:
# Length every token sequence is padded/truncated to (pad_sequences maxlen, Embedding input_length).
# The sequence cell below treats None as "derive the length from the corpus".
n_sequence = 64

In [6]:
# Fit the word index on the whole corpus. filters="" switches off the Tokenizer's default
# character filtering, so tokens are kept exactly as they appear in `docs`.
tokenizer = Tokenizer(num_words=n_vocabulary, filters="")
tokenizer.fit_on_texts(docs)
word_idx = tokenizer.word_index

if n_vocabulary is None:
    # Keras word indices start at 1 (index 0 is reserved and never assigned to a word), so
    # covering every word needs len(word_idx) + 1 rows. Using len(word_idx) would drop the
    # last word from the embedding matrix and push its index out of the Embedding's range.
    n_vocabulary = len(word_idx) + 1

print(n_vocabulary)

5000


In [7]:
# Map every document to its list of token indices (words outside the vocabulary cap are dropped).
sequences = tokenizer.texts_to_sequences(docs)

if n_sequence is None:
    # Measure length in tokens on the converted sequences. The documents themselves are
    # strings, so len(doc) would count characters and vastly overestimate the length.
    n_sequence = max((len(seq) for seq in sequences), default=0)

# Pad short documents with trailing zeros and cut long ones after n_sequence tokens.
sequences = pad_sequences(sequences, maxlen=n_sequence, padding="post", truncating="post")

print(n_sequence)
print(sequences.shape)
print(sequences[2])

64
(299773, 64)
[1610   24   17   19 2090  436  114  114 4443  240  141  492  240 2564
 2406  492 2264 1386  263 1863  492 1386  263  492  914  407  492 1610
   24   17  700   37 3599  827  240  567  674  327  617  162 2578 1520
 4945 4521  436 3271    5    7   20 1610  328   24   75   17  700   37
 2578 1520 4945  151 1456  240  314  112]


### As a document-term matrix

In [8]:
# doc_matrix = tokenizer.texts_to_matrix(docs, mode="tfidf")

# print(doc_matrix.shape)
# print(doc_matrix[2])

## Word embeddings

In [9]:
# Dimensionality of the word vectors: Word2Vec `size` and the Embedding layer's output_dim.
n_embedding = 256

### Word2Vec

In [10]:
reset_seed()

# Context window of the skip-gram model; it is part of the cache file name so that
# different (window, n_embedding) settings never overwrite each other.
window = 5
w2v_path = f"train/w2v_{window}_{n_embedding}.model"

try:
    # Reuse the cached model when one exists for these settings.
    embedding_model = Word2Vec.load(w2v_path)
except FileNotFoundError:
    # Only a missing cache triggers (slow) retraining. Any other failure -- a corrupt file,
    # a version mismatch, a keyboard interrupt -- now surfaces instead of being silently
    # swallowed by a bare `except:`.
    # NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings used by this notebook;
    # gensim 4 renamed them -- confirm the pinned gensim version before upgrading.
    embedding_model = Word2Vec(sentences=[s.split() for s in docs],
                               size=n_embedding,
                               window=window,
                               min_count=5,
                               sg=1,
                               workers=cpu_count(),
                               seed=seed)
    embedding_model.save(w2v_path)

# Number of distinct tokens that received a vector.
print(len(embedding_model.wv.vocab))

160759


### Creating embedding matrix

In [11]:
# Row i of the matrix holds the Word2Vec vector of the token whose tokenizer index is i.
# Row 0 (never assigned to a word) and rows of tokens without a vector stay all-zero.
# Look tokens up through `.wv`: membership tests and indexing on the model object itself
# are deprecated in gensim.
word_vectors = embedding_model.wv
embedding_matrix = np.zeros((n_vocabulary, n_embedding))
for token, i in word_idx.items():
    # word_idx contains every token seen in the corpus; keep only those inside the cap.
    if i < n_vocabulary and token in word_vectors:
        embedding_matrix[i] = word_vectors[token]

print(embedding_matrix.shape)
print(embedding_matrix[1])

(5000, 256)
[ 0.17539258 -0.06530272  0.01657451 -0.04425292  0.04128651 -0.18300489
 -0.16919801  0.0480488  -0.09133328  0.02587506 -0.01967425  0.09945878
 -0.05600481 -0.01685121 -0.03599447 -0.14082797 -0.01086318 -0.07289494
  0.0402003  -0.04027929  0.18171482 -0.04842717  0.03684806 -0.0510481
  0.20940183 -0.09696906 -0.08415043  0.0408814  -0.1309191  -0.13459311
  0.46689954 -0.18623862  0.05366851 -0.05996224 -0.13319956 -0.24100822
  0.05342331  0.14736585 -0.25805563  0.31074879 -0.32781357 -0.07095104
 -0.28958547  0.1536327   0.02000534 -0.08507846  0.00419282  0.01348357
  0.06239876  0.19682285  0.19862361  0.06192612 -0.26880991  0.2868703
 -0.0284104   0.15227671 -0.22509414 -0.03397212  0.19979669 -0.04593811
  0.1099792   0.01548944  0.16921869 -0.05802288 -0.32499835 -0.22160961
  0.05431951 -0.05393629 -0.12470939 -0.03527912 -0.13933805 -0.10993122
 -0.01807112  0.18888257  0.37824026 -0.01249813 -0.03216061  0.16550902
 -0.11490405 -0.01706733 -0.0514022   0.1

## Defining the NN model

In [12]:
def init_model():
    """Build and compile a fresh multi-label text classifier.

    Architecture: frozen Embedding initialised from ``embedding_matrix`` -> two Conv1D
    blocks -> Flatten -> two Dense/BatchNorm/ReLU blocks -> sigmoid output with one unit
    per label. Dropout (rate 0.25) follows every block. The model is compiled with
    binary cross-entropy, the Adam optimizer and the "accuracy" metric.

    Reads the notebook globals ``n_vocabulary``, ``n_embedding``, ``n_sequence``,
    ``embedding_matrix`` and ``n_labels``.

    Earlier experiments (trainable embedding, GRU, bidirectional LSTM, a dense-only
    network and a Conv1D + MaxPooling1D variant) are no longer kept here as
    commented-out code.

    Returns:
        A compiled, untrained ``Sequential`` model.
    """
    model = Sequential([
        # Pre-trained word vectors, kept fixed during training.
        Embedding(
            input_dim=n_vocabulary,
            output_dim=n_embedding,
            embeddings_initializer=Constant(embedding_matrix),
            input_length=n_sequence,
            trainable=False,
        ),
        Dropout(.25),
        # Convolutional feature extractor.
        Conv1D(64, 5, activation="relu"),
        Dropout(.25),
        Conv1D(128, 5, activation="relu"),
        Dropout(.25),
        Flatten(),
        # Fully connected head.
        Dense(128),
        BatchNormalization(),
        ReLU(),
        Dropout(.25),
        Dense(128),
        BatchNormalization(),
        ReLU(),
        Dropout(.25),
        # One independent sigmoid per label (multi-label output).
        Dense(n_labels, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

## Preparing the data

In [13]:
# Passed to sklearn's shuffle as n_samples; None is its default.
# NOTE(review): presumably an integer here limits the run to that many documents -- confirm.
n_samples = None
# Shuffle token sequences and label rows together, reproducibly via the fixed seed.
x, y = shuffle(sequences, labels, random_state=seed, n_samples=n_samples)

## Cross-validating the model

In [14]:
reset_seed()

batch_size = 32
# x and y were already shuffled, so contiguous (unshuffled) folds are fine here.
kfold = KFold(n_splits=5)
cv_scores = []

for train, test in kfold.split(x, y):
    # A new model is built for every fold. Release the global Keras state left behind by
    # the previous fold's model, otherwise memory use keeps growing across folds.
    tf.keras.backend.clear_session()

    model = init_model()
    # Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
    es = EarlyStopping(patience=3, verbose=1, restore_best_weights=True)
    history = model.fit(x[train],
                        y[train],
                        batch_size=batch_size,
                        epochs=100,
                        verbose=1,
                        validation_split=.1,  # early stopping monitors this held-out slice
                        callbacks=[es])

    # Per-label probabilities feed the ranking metrics; rounding them gives the hard
    # 0/1 predictions used by the classification metrics.
    y_true = y[test]
    y_pred_prob = model.predict(x[test], batch_size=batch_size, verbose=1)
    y_pred = np.round(y_pred_prob)

    cv_scores.append({
        # For multi-label targets this is subset accuracy: every label of a document must match.
        "accuracy": accuracy_score(y_true, y_pred),
        "F1 (macro)": f1_score(y_true, y_pred, average="macro"),
        "F1 (micro)": f1_score(y_true, y_pred, average="micro"),
        "LRAP": label_ranking_average_precision_score(y_true, y_pred_prob),
        "NDCG": ndcg_score(y_true, y_pred_prob),
    })

# One row per fold, one column per metric.
cv_scores_df = pd.DataFrame(cv_scores)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 00025: early stopping
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 00019: early stopping
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 00020: early stopping
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 00022: early stopping
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 00016: early stopping


## Metrics and their means

In [15]:
# Per-fold scores (one row per fold), followed by the mean of every metric across folds.
display(cv_scores_df)
print(cv_scores_df.mean())

Unnamed: 0,accuracy,F1 (macro),F1 (micro),LRAP,NDCG
0,0.661146,0.518046,0.874702,0.946022,0.969402
1,0.657126,0.509464,0.873381,0.94519,0.968798
2,0.660879,0.512373,0.874648,0.945694,0.969295
3,0.660373,0.521407,0.873952,0.945358,0.968911
4,0.652367,0.505775,0.872031,0.943322,0.968049


accuracy      0.658378
F1 (macro)    0.513413
F1 (micro)    0.873742
LRAP          0.945117
NDCG          0.968891
dtype: float64
