In [1]:
!pip install transformers



In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
#import plotly.express as px
import tensorflow as tf
import pandas as pd
from transformers import BertTokenizer
# Pretrained 'bert-base-uncased' tokenizer shared by the rest of the notebook
# (read as a global by `tokenizar`). Note this lowercase `tokenizer` is the BERT
# tokenizer, not the Keras `Tokenizer` class imported above.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [3]:
# Labelled training data (columns shown below: id, keyword, location, text, target).
train_df = pd.read_csv(filepath_or_buffer='train.csv')
train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Unlabelled test data (same columns as the training file, without `target`).
test_df = pd.read_csv(filepath_or_buffer='test.csv')
test_df.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
# Data preparation
# Hold out 33% of the labelled tweets for validation; the fixed random_state
# keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['text'],
    train_df['target'],
    test_size=0.33,
    random_state=42,
)

# Texts and ids of the unlabelled test file, used later for the submission files.
FTest = test_df['text']
test_ids = test_df['id']

In [6]:
# Parameters
seed = 42                  # same value as the random_state of the train/validation split
tf.random.set_seed(seed)   # fix TensorFlow's global seed so repeated runs are comparable

vocab_size = 30522   # rows of every Embedding layer; presumably the bert-base-uncased vocabulary size
epochs = 10          # training epochs for the LSTM, BiLSTM and GRU models
maxlen = 60          # length every id sequence is padded/truncated to
n_words = 500        # only referenced by the commented-out Keras Tokenizer cell
embedding_dim = 16   # embedding width used by the GRU model

In [7]:
#Tokenization (earlier Keras word-level approach, left commented out; the BERT tokenizer is used instead)
#tokenizer = Tokenizer(num_words = n_words, oov_token = '<OOV>')

#tokenizer.fit_on_texts(X_train)
#word_index = tokenizer.word_index

#sequences_train = tokenizer.texts_to_sequences(X_train)
#sequences_test = tokenizer.texts_to_sequences(X_test)
#sequences_FTest = tokenizer.texts_to_sequences(FTest)




#padded_train = pad_sequences(sequences_train, maxlen = maxlen, truncating = 'post')
#padded_test = pad_sequences(sequences_test, maxlen = maxlen)
#padded_FTest = pad_sequences(sequences_FTest, maxlen = maxlen)

In [8]:
def tokenizar(textos):
    """Convert raw texts into lists of token ids using the global `tokenizer`.

    Each text is wrapped as "[CLS] <text> [SEP]", split into tokens with
    `tokenizer.tokenize` and mapped to ids with
    `tokenizer.convert_tokens_to_ids`.

    Args:
        textos: iterable of strings (texts must support `str` concatenation).

    Returns:
        A list with one entry per input text, in input order; each entry is
        the id sequence for that text. No padding or truncation is applied.
    """
    def _ids(texto):
        piezas = tokenizer.tokenize("[CLS] " + texto + " [SEP]")
        return tokenizer.convert_tokens_to_ids(piezas)

    return [_ids(texto) for texto in textos]


In [9]:
# Turn every split into BERT token-id sequences.
secuencia_train = tokenizar(X_train.values)
secuencia_test = tokenizar(X_test.values)
secuencia_FTest = tokenizar(FTest.values)

# Pad/truncate all splits to `maxlen` with the SAME settings. Keras defaults to
# truncating='pre', which would cut long validation/test tweets at the start
# (dropping the leading [CLS] id) while training tweets are cut at the end.
pad_kwargs = dict(maxlen=maxlen, truncating='post')
padded_train = pad_sequences(secuencia_train, **pad_kwargs)
padded_test = pad_sequences(secuencia_test, **pad_kwargs)
padded_FTest = pad_sequences(secuencia_FTest, **pad_kwargs)

In [25]:
# Sanity check: first padded test sequence (zeros on the left, then the token ids).
padded_FTest[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,  101, 2074, 3047,
       1037, 6659, 2482, 5823,  102], dtype=int32)

In [39]:
# Sanity check: unpadded token count of one test tweet, to compare against `maxlen`.
len(secuencia_FTest[125])

43

In [10]:
# LSTM model: embedding -> single LSTM -> dense head with a sigmoid output.
lstm_layers = [
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64),
    tf.keras.layers.LSTM(units=64),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
]
model = tf.keras.Sequential(lstm_layers)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          1953408   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                33024     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 1,990,657
Trainable params: 1,990,657
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Training: fit on the padded training split, validating on the held-out split.
model.fit(
    x=padded_train,
    y=y_train,
    epochs=epochs,
    validation_data=(padded_test, y_test),
)

Train on 5100 samples, validate on 2513 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa0ec133810>

In [12]:
# Predict labels for the unlabelled test set and write the submission file.
# `Sequential.predict_classes` was deprecated and later removed from tf.keras;
# thresholding the sigmoid output at 0.5 is the documented equivalent.
probs = model.predict(padded_FTest)
preds = pd.Series((probs > 0.5).astype('int64').ravel(), name='target')

# Both series share the default 0..n-1 index, so concat pairs each id with its prediction.
df_preds = pd.concat([test_ids, preds], axis=1).set_index('id')
df_preds.to_csv('BERT-LSTM.csv')
df_preds.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,1
3,1
9,1
11,1


In [13]:
# Bidirectional LSTM model: two stacked bidirectional LSTMs over the embeddings.
bilstm_layers = [
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen),
    # return_sequences=True so the second recurrent layer receives one vector per timestep.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=32)),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
]
model = tf.keras.Sequential(bilstm_layers)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 60, 64)            1953408   
_________________________________________________________________
bidirectional (Bidirectional (None, 60, 128)           66048     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 2,064,897
Trainable params: 2,064,897
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Train the bidirectional LSTM, validating on the held-out split.
model.fit(
    x=padded_train,
    y=y_train,
    epochs=epochs,
    validation_data=(padded_test, y_test),
)

Train on 5100 samples, validate on 2513 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa0bc12bd90>

In [15]:
# Predict labels for the unlabelled test set and write the submission file.
# `Sequential.predict_classes` was deprecated and later removed from tf.keras;
# thresholding the sigmoid output at 0.5 is the documented equivalent.
probs = model.predict(padded_FTest)
preds = pd.Series((probs > 0.5).astype('int64').ravel(), name='target')

# Both series share the default 0..n-1 index, so concat pairs each id with its prediction.
df_preds = pd.concat([test_ids, preds], axis=1).set_index('id')
df_preds.to_csv('BERT-BiLSTM.csv')
df_preds.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,1
3,1
9,1
11,1


In [53]:
# Conv1D model: 1-D convolution over the embeddings, averaged over time.
conv_layers = [
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen),
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
]
model = tf.keras.Sequential(conv_layers)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 60, 64)            1953408   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 56, 128)           41088     
_________________________________________________________________
global_average_pooling1d_5 ( (None, 128)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 65        
Total params: 2,002,817
Trainable params: 2,002,817
Non-trainable params: 0
_________________________________________________________________


In [54]:
# Train the Conv1D model for 2 epochs only (not the shared `epochs` setting).
model.fit(
    x=padded_train,
    y=y_train,
    epochs=2,
    validation_data=(padded_test, y_test),
)

Train on 5100 samples, validate on 2513 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fa076d446d0>

In [55]:
# Predict labels for the unlabelled test set and write the submission file.
# `Sequential.predict_classes` was deprecated and later removed from tf.keras;
# thresholding the sigmoid output at 0.5 is the documented equivalent.
probs = model.predict(padded_FTest)
preds = pd.Series((probs > 0.5).astype('int64').ravel(), name='target')

# Both series share the default 0..n-1 index, so concat pairs each id with its prediction.
df_preds = pd.concat([test_ids, preds], axis=1).set_index('id')
df_preds.to_csv('BERT-Conv1D.csv')
df_preds.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,1
3,1
9,0
11,1


In [19]:
# GRU model: small embedding -> bidirectional GRU -> narrow dense head.
gru_layers = [
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units=32)),
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
]
model = tf.keras.Sequential(gru_layers)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 16)          488352    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                9600      
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 390       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 7         
Total params: 498,349
Trainable params: 498,349
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Train the GRU model, validating on the held-out split.
model.fit(
    x=padded_train,
    y=y_train,
    epochs=epochs,
    validation_data=(padded_test, y_test),
)

Train on 5100 samples, validate on 2513 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa0968caf90>

In [21]:
# Predict labels for the unlabelled test set.
# `Sequential.predict_classes` was deprecated and later removed from tf.keras;
# thresholding the sigmoid output at 0.5 is the documented equivalent.
probs = model.predict(padded_FTest)

# Left unnamed on purpose: the export cell renames the resulting column 0 to 'target'.
preds = pd.Series((probs > 0.5).astype('int64').ravel())
preds

0       0
1       1
2       1
3       1
4       1
       ..
3258    1
3259    1
3260    1
3261    1
3262    1
Length: 3263, dtype: int64

In [22]:
# Pair each test id with its prediction, name the prediction column 'target',
# index by id and write the GRU submission file.
df_preds = (
    pd.concat([test_ids, preds], axis=1)
    .rename(columns={0: 'target'})
    .set_index('id')
)
df_preds.to_csv('BERT-GRU.csv')
df_preds.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,1
3,1
9,1
11,1


Tuning:
1-  epochs = 10
    maxlen = 50
    n_words = 500
    embedding_dim = 16
Accuracy: 0.7441

2-  epochs = 6
    maxlen = 40
    n_words = 500
    embedding_dim = 16
Accuracy: 0.7656

3-  epochs = 5
    maxlen = 40
    n_words = 300
    embedding_dim = 16
Accuracy: 0.7437