### Data Retrieving

In [63]:
!pip3 install wordninja==2.0.0             # for splitting joined words
!pip3 install scikit-learn==0.22.2         # for one-hot encoding
!pip3 install lime==0.2.0                  # for explaining model predictions
!pip3 install tensorflow



In [64]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from lime.lime_text import LimeTextExplainer
import tensorflow as tf
# Check if any GPU is detected
print("Is GPU available: ", tf.test.is_gpu_available())
print("GPU(s) found: ")
print(tf.config.experimental.list_physical_devices('GPU'))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Is GPU available:  False
GPU(s) found: 
[]


In [29]:
data_path = os.getcwd().replace('script', 'data')
data = pd.read_csv(data_path + '/labelled_newscatcher_dataset.csv', sep=';')

In [30]:
data

Unnamed: 0,topic,link,domain,published_date,title,lang
0,SCIENCE,https://www.eurekalert.org/pub_releases/2020-0...,eurekalert.org,2020-08-06 13:59:45,A closer look at water-splitting's solar fuel ...,en
1,SCIENCE,https://www.pulse.ng/news/world/an-irresistibl...,pulse.ng,2020-08-12 15:14:19,"An irresistible scent makes locusts swarm, stu...",en
2,SCIENCE,https://www.express.co.uk/news/science/1322607...,express.co.uk,2020-08-13 21:01:00,Artificial intelligence warning: AI will know ...,en
3,SCIENCE,https://www.ndtv.com/world-news/glaciers-could...,ndtv.com,2020-08-03 22:18:26,Glaciers Could Have Sculpted Mars Valleys: Study,en
4,SCIENCE,https://www.thesun.ie/tech/5742187/perseid-met...,thesun.ie,2020-08-12 19:54:36,Perseid meteor shower 2020: What time and how ...,en
...,...,...,...,...,...,...
108769,NATION,https://www.vanguardngr.com/2020/08/pdp-govern...,vanguardngr.com,2020-08-08 02:40:00,PDP governors’ forum urges security agencies t...,en
108770,BUSINESS,https://www.patentlyapple.com/patently-apple/2...,patentlyapple.com,2020-08-08 01:27:12,"In Q2-20, Apple Dominated the Premium Smartpho...",en
108771,HEALTH,https://www.belfastlive.co.uk/news/health/coro...,belfastlive.co.uk,2020-08-12 17:01:00,Coronavirus Northern Ireland: Full breakdown s...,en
108772,ENTERTAINMENT,https://www.thenews.com.pk/latest/696364-paul-...,thenews.com.pk,2020-08-05 04:59:00,Paul McCartney details post-Beatles distress a...,en


In [31]:
data['topic'].value_counts()

HEALTH           15000
WORLD            15000
NATION           15000
ENTERTAINMENT    15000
SPORTS           15000
BUSINESS         15000
TECHNOLOGY       15000
SCIENCE           3774
Name: topic, dtype: int64

The topic Science is with much fewer records, may need to make this up with some other datasets.

In [32]:
# pickles
# data.to_pickle(data_path + "/original_data.pkl")
# data = pd.read_pickle(data_path + "/original_data.pkl")

### Transfer Learning with glove

In [38]:
glove_path = data_path + '/glove.6B.50d.txt'
glove_path

'/Users/zhuzeyu/Desktop/COMP3359/Project/COMP__3359__GROUP8/data/glove.6B.50d.txt'

In [41]:
################################################################################
# COMP3359 Artificial Intelligence Applications                                #
# Department of Computer Science, HKU                                          #
# Module 4 - Example: Classification of Text                                   #
# Utility functions for loading GloVe word embeddings                          #
#                                                                              #
# Reference: uillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer          #
# https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer #
################################################################################

from string import punctuation
from collections import defaultdict
import numpy as np

def load_embedding_from_disks(glove_filename, with_indexes=True):
    """
    Read a GloVe txt file. If `with_indexes=True`, we return a tuple of two dictionnaries
    `(word_to_index_dict, index_to_embedding_array)`, otherwise we return only a direct 
    `word_to_embedding_dict` dictionnary mapping from a string to a numpy array.
    """
    if with_indexes:
        word_to_index_dict = dict()
        index_to_embedding_array = []
    else:
        word_to_embedding_dict = dict()

    
    with open(glove_filename, 'r') as glove_file:
        for (i, line) in enumerate(glove_file):
            
            split = line.split(' ')
            
            word = split[0]
            
            representation = split[1:]
            representation = np.array(
                [float(val) for val in representation]
            )
            
            if with_indexes:
                word_to_index_dict[word] = i
                index_to_embedding_array.append(representation)
            else:
                word_to_embedding_dict[word] = representation

    _WORD_NOT_FOUND = [0.0]* len(representation)  # Empty representation for unknown words.
    if with_indexes:
        _LAST_INDEX = i + 1
        word_to_index_dict = defaultdict(lambda: _LAST_INDEX, word_to_index_dict)
        index_to_embedding_array = np.array(index_to_embedding_array + [_WORD_NOT_FOUND])
        return word_to_index_dict, index_to_embedding_array
    else:
        word_to_embedding_dict = defaultdict(lambda: _WORD_NOT_FOUND)
        return word_to_embedding_dict


def sentence_to_word_ids(sentence, word_to_index):
    """
    Note: there might be a better way to split sentences for GloVe.
    Please look at the documentation or open an issue to suggest a fix.
    """
    # Separating punctuation from words:
    for punctuation_character in punctuation:
        sentence = sentence.replace(punctuation_character, " {} ".format(punctuation_character))
    # Removing double spaces and lowercasing:
    sentence = sentence.replace("  ", " ").replace("  ", " ").lower().strip()
    # Splitting on every space:
    split_sentence = sentence.split(" ")
    # Converting to IDs:
    ids = [word_to_index[w.strip()] for w in split_sentence]
    return ids, split_sentence

In [157]:
word_to_index_dict, index_to_embedding_array = load_embedding_from_disks(glove_path, with_indexes=True)
index_to_embedding_array.shape

(400002, 50)

In [99]:
# Train dev split
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
X = data['title'].to_numpy()
y_beforelabel = data['topic'].to_numpy()
le_y = preprocessing.LabelEncoder()
y = le_y.fit_transform(y_beforelabel).reshape(-1, 1)
X_train, X_traindev, y_train, y_traindev = train_test_split(X, y, test_size=0.2, random_state=42)

In [101]:
X_train_token = [ sentence_to_word_ids(x, word_to_index_dict)[0] for x in X_train ]
X_traindev_token = [ sentence_to_word_ids(x, word_to_index_dict)[0] for x in X_traindev ]

In [145]:
""" Construct our tf.data.Dataset Pipeline to Load Data """
# tf.data.Dataset pipeline is used to load our data and feed to model for training
# with higher efficiency.
# For more instructions on tf.data.Dataset pipelines, see:
#    https://www.tensorflow.org/guide/data
#    https://www.tensorflow.org/guide/data_performance

# Training dataset pipeline
train_ds_glove = tf.data.Dataset.from_generator( lambda: zip(X_train_token, y_train), output_types=(tf.int32,tf.int32))
train_ds_glove = train_ds_glove.shuffle(buffer_size=20000)
train_ds_glove = train_ds_glove.padded_batch(batch_size=500, padded_shapes=([None], [1]))
# Test dataset pipeline
traindev_ds_glove = tf.data.Dataset.from_generator( lambda: zip(X_traindev_token, y_traindev), output_types=(tf.int32,tf.int32))
traindev_ds_glove = traindev_ds_glove.padded_batch(batch_size=500, padded_shapes=([None], [1]))

In [180]:
""" Constructing Model with Pre-Trained Word Embedding """

# Get description of word embedding
vocab_size, embedding_dim = index_to_embedding_array.shape
print("Vocab Size: ", vocab_size)
print("Embedding Dim: ", embedding_dim)

# Construct embedding layer and use embedding vectors to set weights.
# Set trainable=False to freeze the weights to prevent weight to be 
# changed during training.
embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size, 
                                            output_dim=embedding_dim,
                                            weights=[index_to_embedding_array],
                                            trainable=False)

# Construct model using the pre-trained embedding layer.
# This model learns by changing network weights in Dense layer.
# (no trainable weights in GlobalAveragePooling layer)
model_glove = tf.keras.Sequential([
    embedding_layer,
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
print(model_glove.summary())

# Compile model
model_glove.compile(loss='binary_crossentropy',
                    optimizer=tf.keras.optimizers.Adam(1e-2),
                    metrics=['accuracy'])

# Train model
history = model_glove.fit(train_ds_glove, epochs=30,
                          validation_data=traindev_ds_glove, 
                          validation_steps=30)

Vocab Size:  400002
Embedding Dim:  50
Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 50)          20000100  
_________________________________________________________________
global_average_pooling1d_12  (None, 50)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 51        
Total params: 20,000,151
Trainable params: 51
Non-trainable params: 20,000,100
_________________________________________________________________
None
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30

The above is not converged at the moment

### Prediction

In [182]:
# prediction example
from scipy import stats
example = X_train[0]
print("Sentence: \n",
      example, "\n")
ids = sentence_to_word_ids(example, word_to_index_dict)[0]
print("Corresponding ids: \n",
      sentence, "\n")
print("Corresponding type: \n",
      le_y.inverse_transform(model_glove.predict(ids).flatten().astype(int)), "\n")
print("Final prediction: \n", 
      le_y.inverse_transform(stats.mode(model_glove.predict(ids).flatten().astype(int))[0]), "\n")

Sentence: 
 New evidence raised in stepmum's third trial in Melbourne for alleged incest 

Corresponding ids: 
 [50, 906, 1073, 6, 400001, 57, 1534, 245, 801, 6, 4179, 10, 1549, 25944] 

Corresponding type: 
 ['ENTERTAINMENT' 'ENTERTAINMENT' 'ENTERTAINMENT' 'ENTERTAINMENT'
 'ENTERTAINMENT' 'ENTERTAINMENT' 'ENTERTAINMENT' 'ENTERTAINMENT'
 'ENTERTAINMENT' 'ENTERTAINMENT' 'ENTERTAINMENT' 'ENTERTAINMENT'
 'ENTERTAINMENT' 'BUSINESS'] 

Final prediction: 
 ['ENTERTAINMENT'] 

