In [1]:
!pip install transformers



In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
#import plotly.express as px
import tensorflow as tf
import pandas as pd
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
import string
from nltk.corpus import stopwords
from sklearn.metrics import f1_score
import numpy as np

In [3]:
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [7]:
#funciones auxiliares

def getHashtags(words):
    return ' '.join([item.lstrip('#') for item in words if item.startswith('#') and len(item) > 1])

def getMentions(words):
    return ' '.join([item.lstrip('@') for item in words if item.startswith('@') and len(item) > 1])

def getURLs(words):
    return ' '.join([item for item in words if item.startswith('http')])

def clean(text):
    
    exclude = set(string.punctuation)
    clean_text = ''.join(ch for ch in text if ch not in exclude)
    
    exclude = set(stopwords.words('english'))
    clean_text_list = clean_text.split(' ')
    clean_text = ' '.join(ch for ch in clean_text_list if ch not in exclude)

    return clean_text

#test_df['clean_text'] = test_df['text'].apply(clean)
#train_df['clean_text'] = train_df['text'].apply(clean)
#train_df.head()

In [8]:
#FEATURES
stop = stopwords.words('english')

def crear_features(df):
    
    #categóricas
    df['words'] = df['text'].apply(lambda x: x.split(' '))
    df['hashtags'] = df['words'].apply(getHashtags) #Obtengo los hashtags: "ht1 ht2 ht3 ..."
    df['mentions'] = df['words'].apply(getMentions) #Obtengo las menciones: "men1 men2 men3 ..."
    df['urls'] = df['words'].apply(getURLs) #Obtengo las urls "url1 url2 url3 ..."
    df['stop_words'] = df['text'].apply(lambda x: [w for w in str(x).lower().split() if w in stop])
    df['clean_text'] = df['text'].apply(clean)
    
    #numéricas
    df['words_count'] = df['words'].apply(lambda x: len(x))
    df['character_count'] = df['text'].str.len()
    df['mean_word_length'] = df['text'].apply(lambda x: (sum(len(w) for w in str(x).split()) / len(str(x).split())))
    df['punctuation_count'] = df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
    df['stop_words_count'] = df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in stop]))
    

In [9]:
crear_features(train_df)
crear_features(test_df)
train_df.head()

Unnamed: 0,id,keyword,location,text,target,words,hashtags,mentions,urls,stop_words,clean_text,words_count,character_count,mean_word_length,punctuation_count,stop_words_count
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[Our, Deeds, are, the, Reason, of, this, #eart...",earthquake,,,"[our, are, the, of, this, all]",Our Deeds Reason earthquake May ALLAH Forgive us,13,69,4.384615,1,6
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[Forest, fire, near, La, Ronge, Sask., Canada]",,,,[],Forest fire near La Ronge Sask Canada,7,38,4.571429,1,0
2,5,,,All residents asked to 'shelter in place' are ...,1,"[All, residents, asked, to, 'shelter, in, plac...",,,,"[all, to, in, are, being, by, no, other, or, i...",All residents asked shelter place notified off...,22,133,5.090909,3,11
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13,000, people, receive, #wildfires, evacuati...",wildfires,,,[in],13000 people receive wildfires evacuation orde...,9,65,7.125,2,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Just, got, sent, this, photo, from, Ruby, #Al...",Alaska wildfires,,,"[just, this, from, as, from, into, a]",Just got sent photo Ruby Alaska smoke wildfire...,17,88,4.5,2,7


In [10]:
df_x = train_df['text']
df_y = train_df['target']

df_test_x = test_df['text']

In [11]:
# gensim to pre process text
from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [ 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

In [12]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm

import multiprocessing
import numpy as np

class Doc2VecTransformer(BaseEstimator):

    def __init__(self, vector_size=100, learning_rate=0.1, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(clean_text(row).split(), [index]) for index, row in enumerate(df_x)]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers)

        for epoch in range(self.epochs):
            model.train(skl_utils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(clean_text(row).split())
                                     for index, row in enumerate(df_x)]))

In [13]:
doc2vec_trf = Doc2VecTransformer()

doc2vec_features = doc2vec_trf.fit(df_x).transform(df_x)

test_doc2vec_features = doc2vec_trf.fit(df_test_x).transform(df_test_x)

100%|██████████| 7613/7613 [00:00<00:00, 122701.07it/s]
100%|██████████| 7613/7613 [00:00<00:00, 116072.22it/s]
100%|██████████| 7613/7613 [00:00<00:00, 91293.92it/s]
100%|██████████| 7613/7613 [00:00<00:00, 156298.12it/s]
100%|██████████| 7613/7613 [00:00<00:00, 157125.67it/s]
100%|██████████| 7613/7613 [00:00<00:00, 142014.44it/s]
100%|██████████| 7613/7613 [00:00<00:00, 151075.83it/s]
100%|██████████| 7613/7613 [00:00<00:00, 155770.49it/s]
100%|██████████| 7613/7613 [00:00<00:00, 124284.74it/s]
100%|██████████| 7613/7613 [00:00<00:00, 130232.71it/s]
100%|██████████| 7613/7613 [00:00<00:00, 151096.56it/s]
100%|██████████| 7613/7613 [00:00<00:00, 130724.28it/s]
100%|██████████| 7613/7613 [00:00<00:00, 119196.66it/s]
100%|██████████| 7613/7613 [00:00<00:00, 121608.99it/s]
100%|██████████| 7613/7613 [00:00<00:00, 51330.12it/s]
100%|██████████| 7613/7613 [00:00<00:00, 152557.67it/s]
100%|██████████| 7613/7613 [00:00<00:00, 141513.45it/s]
100%|██████████| 7613/7613 [00:00<00:00, 140320.08

In [14]:
doc2vec_features

matrix([[ 3.1924010e-03, -3.8122481e-03,  3.6248893e-03, ...,
          4.3330402e-03, -1.2836832e-03,  1.5753093e-03],
        [-3.2494471e-03,  4.8752627e-03, -3.8703883e-03, ...,
          2.4023489e-03,  3.3574684e-03,  3.3307329e-03],
        [ 1.3671657e-03,  2.9360943e-03,  1.6142406e-03, ...,
         -3.3283983e-03, -4.2602983e-03, -4.9863080e-03],
        ...,
        [ 3.8905744e-04,  2.3755492e-03,  1.0342433e-03, ...,
          9.9281245e-04,  3.2455947e-03, -4.0744939e-03],
        [-3.9040728e-03, -3.8678092e-03, -4.9740272e-03, ...,
         -2.0322482e-06, -5.2216457e-04,  3.6037080e-03],
        [-1.0670319e-03, -2.5798983e-03, -1.8928800e-03, ...,
          3.5258310e-03,  4.0741279e-03, -3.7900326e-03]], dtype=float32)

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(doc2vec_features, df_y, test_size=.2, random_state=42)

In [16]:
#Parametros
vocab_size = 30522
epochs = 2
maxlen = 100
n_words = 500
test_size = 0.33
padding_texto = 60

test_ids = test_df['id'] 

def tokenizar(textos):
    secuencia = []
    for text in textos:
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = tokenizer.tokenize(marked_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        secuencia.appFailed to convert a NumPy array to a Tensor (Unsupported object type list).end(indexed_tokens)
    return secuencia


In [17]:
#X_train = np.array([np.array(lista) for lista in X_train])
#X_a = np.array([np.array(lista) for lista in X_test])
#padded_FTest = X_test = np.array([np.array(lista) for lista in FTest])
#padded_train = pad_sequences(X_train, maxlen = maxlen, truncating = 'post')
#padded_test = pad_sequences(X_test, maxlen = maxlen, truncating = 'post')


In [18]:
#secuencia_train = tokenizar(X_train.values)
#secuencia_test = tokenizar(X_test.values)
#secuencia_FTest = tokenizar(FTest.values)

padded_train = pad_sequences(X_train, maxlen = maxlen, truncating = 'post')
padded_test = pad_sequences(X_test, maxlen = maxlen)
#padded_FTest = pad_sequences(secuencia_FTest, maxlen = maxlen)

In [28]:
#modelo Conv1D
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64, input_length= maxlen),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 64)           3873408   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 96, 128)           41088     
_________________________________________________________________
global_average_pooling1d_3 ( (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 3,922,817
Trainable params: 3,922,817
Non-trainable params: 0
_________________________________________________________________


In [29]:
model.fit(padded_train, y_train, epochs=epochs, validation_data=(padded_test, y_test))

Epoch 1/2


ValueError: in user code:

    /home/joaquin/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /home/joaquin/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/joaquin/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/joaquin/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/joaquin/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:531 train_step  **
        y_pred = self(x, training=True)
    /home/joaquin/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:927 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    /home/joaquin/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/sequential.py:277 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    /home/joaquin/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/network.py:719 call
        convert_kwargs_to_constants=base_layer_utils.call_context().saving)
    /home/joaquin/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/network.py:888 _run_internal_graph
        output_tensors = layer(computed_tensors, **kwargs)
    /home/joaquin/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:886 __call__
        self.name)
    /home/joaquin/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/input_spec.py:180 assert_input_compatibility
        str(x.shape.as_list()))

    ValueError: Input 0 of layer conv1d_3 is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: [None, 100, 100, 64]


In [None]:
preds_locales = model.predict_classes(padded_test)
preds_locales = pd.Series(list((x[0] for x in preds_locales)))
#preds_locales

In [None]:
print("F1 score:", f1_score(y_test, preds_locales))

In [None]:
preds = model.predict_classes(padded_FTest)
#preds = pd.Series(preds)
preds = pd.Series(list((x[0] for x in preds)))
preds
df_preds = pd.concat([test_ids,preds],axis=1)
df_preds.rename(columns = {0 : 'target'}, inplace=True)
df_preds.set_index('id', inplace=True)
df_preds.to_csv('BERT-Conv1D-features.csv')
df_preds.head()