In [1]:
from numpy.random import seed 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import TimeDistributed, Dense, Dropout, Activation, Embedding, Conv1D, GlobalMaxPooling1D, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

#### Importo il dataset

In [2]:
DATA_FILE = 'data/spam.csv'
df = pd.read_csv(DATA_FILE, encoding = 'latin-1')
print(df.head)
print('-----------------------------------------------')
tags = df.v1
texts = df.v2

print(tags[:5])
print('------------------------------------------------')
print(texts[:5])

<bound method NDFrame.head of         v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN  

#### Preprocessing del dataset

In [3]:
num_max = 1000
max_len = 100
le = LabelEncoder()
tags = le.fit_transform(tags)
tok = Tokenizer(num_words = num_max)
tok.fit_on_texts(texts)

cnn_texts_seq = tok.texts_to_sequences(texts)
cnn_texts_mat = sequence.pad_sequences(cnn_texts_seq,maxlen=max_len)

print(cnn_texts_mat[:4])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0  50 469 841 751
  657  64   8  89 121 349 147  67  58 144]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0  46 336 470   6]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0 

In [4]:
x = cnn_texts_mat
y = tags

In [5]:
def check_model(model, x, y):
    model.fit(x,y,batch_size=32,epochs=10,verbose=2,validation_split=0.2)

#### Strutturo una classe con architettura Conv1D con funzione Highway e addestro il modello

In [6]:
def get_hw_cnn_model():
    def get_conv_blocks(input):
        conv = Conv1D(32, 3, padding='same', activation='relu')(input)
        conv = Dropout(0.5)(conv)
        conv = Conv1D(32, 3, padding='same', activation='relu')(conv)
        conv = Dropout(0.5)(conv)
        conv = Conv1D(32, 3, padding='same', activation='relu')(conv)
        conv = Dropout(0.5)(conv)
        return conv

    def get_highway_block(input):
        H = get_conv_blocks(input)
        T = TimeDistributed(Dense(32, activation='sigmoid', bias_initializer=tf.keras.initializers.Constant(42.)))(input)
        return H * (1 - T) + input * T

    input = Input(shape=(cnn_texts_mat.shape[1:]))

    emb = Embedding(1000, 100, input_length=max_len)(input)
    conv = get_conv_blocks(emb)
    for i in range(15):
        conv = get_highway_block(conv)
    pool = GlobalMaxPooling1D()(conv)
    dense = Dense(1, activation='sigmoid')(pool)

    model = Model(inputs=input, outputs=dense)
    model.summary()
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['acc']
    )
    return model

m = get_hw_cnn_model()
check_model(m, cnn_texts_mat, tags)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 100, 100)             100000    ['input_1[0][0]']             
                                                                                                  
 conv1d (Conv1D)             (None, 100, 32)              9632      ['embedding[0][0]']           
                                                                                                  
 dropout (Dropout)           (None, 100, 32)              0         ['conv1d[0][0]']              
                                                                                              