In [1]:
# Mount Google Drive inside the Colab VM at /content/drive so that the
# project files stored on Drive can be opened like local files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Reference: Keras example "Text classification with Transformer":
# https://keras.io/examples/nlp/text_classification_with_transformer/

# See also: https://alvinntnu.github.io/NTNU_ENC2045_LECTURES/nlp/dl-transformers-keras.html

In [2]:
%cd drive/MyDrive/Sapienza/DL

/content/drive/MyDrive/Sapienza/DL


In [3]:
import pickle
import pandas as pd
import numpy as np
import json
import itertools
import gensim


# NOTE: `path` is not used by the loads below; every pickle is opened
# relative to the current working directory.
path="SEMEVAL-2021-task6-corpus/data/"


def load_pickle(filename):
    """Return the object stored in the pickle file `filename`.

    The file is opened relative to the current working directory.

    NOTE(security): unpickling can execute arbitrary code, so only use this
    on files that you produced yourself or otherwise trust.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)


# Training split: labels, word2vec features and the list of all label names.
train_labs = load_pickle('training_task1_labels.pkl')
train_feats = load_pickle('training_task1_features_w2v.pkl')
all_labels = load_pickle('training_task1_all_labels.pkl')

# Development split.
dev_labs = load_pickle('dev_task1_labels.pkl')
dev_feats = load_pickle('dev_task1_features_w2v.pkl')

# Test split.
test_labs = load_pickle('test_task1_labels.pkl')
test_feats = load_pickle('test_task1_features_w2v.pkl')



In [4]:
# Width of the classifier's output layer; build_model reads this as a global.
n_classes=23

In [5]:
from sklearn.preprocessing import Normalizer


def _flatten_samples(feats):
    """Reshape `feats` to 2-D: one row per sample, all remaining axes merged."""
    return np.reshape(feats, (len(feats), -1))


# Rescale every flattened sample to unit norm (Normalizer's default is L2).
# The same Normalizer instance is reused for the dev and test splits.
norm = Normalizer()
train_feats = norm.fit_transform(_flatten_samples(train_feats))
dev_feats = norm.transform(_flatten_samples(dev_feats))
test_feats = norm.transform(_flatten_samples(test_feats))

In [6]:
# Undo the flattening done for normalisation: every sample goes back to a
# 32 x 256 matrix (presumably 32 positions x 256-dim embeddings -- confirm).
sample_shape = (32, 256)
train_feats = np.reshape(train_feats, (len(train_feats),) + sample_shape)
dev_feats = np.reshape(dev_feats, (len(dev_feats),) + sample_shape)
test_feats = np.reshape(test_feats, (len(test_feats),) + sample_shape)

In [7]:
from sklearn.model_selection import train_test_split

# No random split is made: the training pickles feed the fit and the dev
# pickles are used as the held-out set (bound to the *_test names).
X_train = np.array(train_feats)
y_train = np.array(train_labs)
X_test = np.array(dev_feats)
y_test = np.array(dev_labs)

In [9]:
# Sanity check: display the shape of the training tensor.
X_train.shape

(688, 32, 256)

In [5]:
from tensorflow import keras
from tensorflow.keras import layers

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one pre-norm Transformer encoder block to `inputs`.

    The block is a self-attention sub-layer followed by a position-wise
    feed-forward sub-layer, each wrapped in a residual connection.

    Args:
        inputs: Keras tensor; its last dimension sets the block's output width.
        head_size: `key_dim` passed to `MultiHeadAttention`.
        num_heads: number of attention heads.
        ff_dim: number of filters in the hidden feed-forward layer.
        dropout: dropout rate used inside attention and after each sub-layer.

    Returns:
        A tensor with the same last dimension as `inputs`.
    """
    # --- Self-attention sub-layer (layer norm is applied before attention) ---
    normed = layers.LayerNormalization(epsilon=1e-6)(inputs)
    self_attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=head_size, dropout=dropout
    )
    attended = self_attention(normed, normed)  # query and value are the same tensor
    attended = layers.Dropout(dropout)(attended)
    res = attended + inputs

    # --- Feed-forward sub-layer: kernel_size=1 convolutions act per position ---
    model_dim = inputs.shape[-1]
    hidden = layers.LayerNormalization(epsilon=1e-6)(res)
    hidden = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(hidden)
    hidden = layers.Dropout(dropout)(hidden)
    # Project back to the input width so the residual addition is valid.
    hidden = layers.Conv1D(filters=model_dim, kernel_size=1)(hidden)
    return hidden + res

def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0,
    num_classes=None,
    output_activation="softmax",
):
    """Build a Transformer-encoder classifier as a Keras functional model.

    Args:
        input_shape: per-sample input shape (without the batch axis).
        head_size: `key_dim` of every attention layer.
        num_heads: number of attention heads per block.
        ff_dim: hidden width of each block's feed-forward part.
        num_transformer_blocks: how many encoder blocks to stack.
        mlp_units: iterable with the width of each hidden Dense layer of the
            classification head.
        dropout: dropout rate inside the encoder blocks.
        mlp_dropout: dropout rate after each hidden Dense layer.
        num_classes: width of the output layer. Defaults to the notebook-level
            `n_classes` global when left as None.
        output_activation: activation of the output layer. "softmax" (the
            default) suits single-label targets; pass "sigmoid" (together
            with a binary cross-entropy loss) for multi-hot targets.

    Returns:
        An uncompiled `keras.Model`.
    """
    if num_classes is None:
        # Backward-compatible fallback to the global used by existing callers.
        num_classes = n_classes

    inputs = keras.Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):  # stack the encoder blocks
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    # With data_format="channels_first" Keras averages over the LAST axis, so
    # the pooled tensor has shape (batch, input_shape[0]).
    # NOTE(review): confirm this is intended rather than "channels_last",
    # which would average over axis 1 instead.
    x = layers.GlobalAveragePooling1D(data_format="channels_first")(x)

    # Layer names must be unique within a model; a fixed name='dense1' made
    # every mlp_units list with more than one entry fail. The first layer
    # keeps the name 'dense1' so existing lookups by name still work.
    for index, dim in enumerate(mlp_units, start=1):
        x = layers.Dense(dim, activation="relu", name=f"dense{index}")(x)
        x = layers.Dropout(mlp_dropout)(x)
    outputs = layers.Dense(num_classes, activation=output_activation)(x)
    return keras.Model(inputs, outputs)



In [11]:
# Per-sample shape: everything after the batch axis of X_train.
input_shape = X_train.shape[1:]

# Four encoder blocks with 8 heads each, followed by a single 256-unit
# hidden layer; dropout of 0.2 is used both in the encoder and in the head.
model = build_model(
    input_shape=input_shape,
    num_transformer_blocks=4,
    num_heads=8,
    head_size=64,
    ff_dim=64,
    mlp_units=[256],
    dropout=0.2,
    mlp_dropout=0.2,
)


In [12]:
# NOTE(review): the label rows are multi-hot (a printed row of train_labs
# contains several 1s), while build_model ends in a softmax layer and the
# loss below is categorical cross-entropy, which assume exactly one class
# per sample. A sigmoid output with "binary_crossentropy" is the usual
# multi-label setup -- confirm which is intended.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy"],
)
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 32, 256)]    0           []                               
                                                                                                  
 layer_normalization (LayerNorm  (None, 32, 256)     512         ['input_1[0][0]']                
 alization)                                                                                       
                                                                                                  
 multi_head_attention (MultiHea  (None, 32, 256)     526080      ['layer_normalization[0][0]',    
 dAttention)                                                      'layer_normalization[0][0]']    
                                                                                              

In [13]:
# Stop once the monitored quantity (EarlyStopping's default is val_loss) has
# not improved for 5 epochs, and roll back to the best weights seen.
callbacks = [
    keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
]

# Train on the training split; the dev split serves as validation data.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=16,
    epochs=200,
    callbacks=callbacks,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [11]:
# --- Label arrays (same files as used for the word2vec experiment) ---
with open('training_task1_labels.pkl', 'rb') as fh:
    train_labs = pickle.load(fh)
with open('dev_task1_labels.pkl', 'rb') as fh:
    dev_labs = pickle.load(fh)
with open('test_task1_labels.pkl', 'rb') as fh:
    test_labs = pickle.load(fh)
with open('training_task1_all_labels.pkl', 'rb') as fh:
    all_labels = pickle.load(fh)

# --- Features from the non-"_w2v" pickles; these are later passed to
# CountVectorizer, so they are presumably raw text documents -- confirm ---
with open('training_task1_features.pkl', 'rb') as fh:
    train_feats = pickle.load(fh)
with open('dev_task1_features.pkl', 'rb') as fh:
    dev_feats = pickle.load(fh)
with open('test_task1_features.pkl', 'rb') as fh:
    test_feats = pickle.load(fh)


In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Learn the vocabulary on the training documents and build their
# document-term count matrix in one step.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_feats)

# Display (n_documents, vocabulary_size).
X_train_counts.shape

(688, 3204)

In [13]:
# Term-frequency-only weighting: use_idf=False switches off the IDF factor.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape

(688, 3204)

In [14]:
# Fit the IDF weights on the training counts, then apply them to the same
# matrix to obtain the TF-IDF training features.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(688, 3204)

In [15]:
# Encode the dev documents with the vocabulary and IDF weights fitted on the
# training data (transform only -- nothing is re-fitted on dev).
dev_counts=count_vect.transform(dev_feats)
dev=tfidf_transformer.transform(dev_counts)
dev.shape

(63, 3204)

In [16]:
# Peek at the first three label rows.
train_labs[:3]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 1., 0.]])

In [17]:
# Bind the TF-IDF matrices and label arrays to the generic split names.
# X_train / X_test refer to the matrices as they are at this point, i.e.
# before any later densifying or reshaping of X_train_tfidf / dev.
X_train = X_train_tfidf
X_test = dev
y_train = train_labs
y_test = dev_labs


In [18]:
# Shape of the TF-IDF training matrix before it is reshaped for the model.
X_train_tfidf.shape

(688, 3204)

In [19]:
# Densify the TF-IDF matrices and append a trailing feature axis of size 1,
# giving (n_samples, vocabulary_size, 1) tensors for the sequence model.
# Both are recomputed from the count matrices rather than from their own
# previous value, so re-running this cell is safe (the old in-place version
# failed on a second run because an ndarray has no .toarray()).
X_train_tfidf = np.expand_dims(
    tfidf_transformer.transform(X_train_counts).toarray(), axis=2
)
dev = np.expand_dims(tfidf_transformer.transform(dev_counts).toarray(), axis=2)

In [20]:
# Confirm the trailing axis added by np.expand_dims.
X_train_tfidf.shape

(688, 3204, 1)

In [21]:
# X_train / X_test may still be SciPy sparse matrices here. np.array() on a
# sparse matrix does NOT densify it (it returns a 0-d object array), so
# convert with .toarray() whenever that method is available.
X_train = X_train.toarray() if hasattr(X_train, "toarray") else np.array(X_train)
X_test = X_test.toarray() if hasattr(X_test, "toarray") else np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [22]:
# Per-sample shape of the expanded TF-IDF tensor (everything after the batch axis).
input_shape = X_train_tfidf.shape[1:]

# A much smaller Transformer than the word2vec one: 2 blocks, 2 heads.
# NOTE(review): here every vocabulary entry is one sequence position, so
# attention runs over a very long sequence -- confirm this is intended.
model = build_model(
    input_shape=input_shape,
    num_transformer_blocks=2,
    num_heads=2,
    head_size=8,
    ff_dim=16,
    mlp_units=[256],
    dropout=0.2,
    mlp_dropout=0.2,
)


In [23]:
# NOTE(review): the label rows are multi-hot (a printed row of train_labs
# contains several 1s), while build_model ends in a softmax layer and the
# loss below is categorical cross-entropy, which assume exactly one class
# per sample. A sigmoid output with "binary_crossentropy" is the usual
# multi-label setup -- confirm which is intended.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy"],
)
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 3204, 1)]    0           []                               
                                                                                                  
 layer_normalization (LayerNorm  (None, 3204, 1)     2           ['input_1[0][0]']                
 alization)                                                                                       
                                                                                                  
 multi_head_attention (MultiHea  (None, 3204, 1)     113         ['layer_normalization[0][0]',    
 dAttention)                                                      'layer_normalization[0][0]']    
                                                                                              

In [None]:
# Stop once the monitored quantity (EarlyStopping's default is val_loss) has
# not improved for 5 epochs, and roll back to the best weights seen.
callbacks = [
    keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
]

# Train on the expanded TF-IDF tensors; the dev split is the validation data.
history = model.fit(
    x=X_train_tfidf,
    y=train_labs,
    validation_data=(dev, dev_labs),
    batch_size=4,
    epochs=200,
    callbacks=callbacks,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
 31/172 [====>.........................] - ETA: 12:56 - loss: 86151.4531 - categorical_accuracy: 0.1210