In [94]:
!pip install transformers



In [61]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
#import plotly.express as px
import tensorflow as tf
import pandas as pd
from transformers import BertTokenizer
# Module-level BERT tokenizer ('bert-base-uncased'), shared by the
# tokenization helpers defined further down in the notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
import string
from nltk.corpus import stopwords
from sklearn.metrics import f1_score
import numpy as np

In [81]:
# Load the labelled training set from train.csv and preview its first rows.
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [97]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [82]:
# Load the test set from test.csv and preview its first rows.
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [99]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [83]:
#funciones auxiliares

def getHashtags(words):
    """Return the hashtags contained in ``words`` as one space-separated string.

    A token counts as a hashtag when it starts with '#' and is longer than a
    single character. All leading '#' characters are stripped from each tag.
    """
    tags = []
    for token in words:
        # A lone '#' carries no tag text, so it is skipped.
        if token.startswith('#') and len(token) > 1:
            tags.append(token.lstrip('#'))
    return ' '.join(tags)

def getMentions(words):
    """Return the @-mentions contained in ``words`` as one space-separated string.

    A token counts as a mention when it starts with '@' and is longer than a
    single character. All leading '@' characters are stripped from each one.
    """
    mention_tokens = (
        token for token in words
        if token.startswith('@') and len(token) > 1
    )
    return ' '.join(map(lambda token: token.lstrip('@'), mention_tokens))

def getURLs(words):
    """Return every token of ``words`` that starts with 'http', space-separated.

    Tokens are kept unchanged; only the prefix is checked.
    """
    return ' '.join(filter(lambda token: token.startswith('http'), words))

def clean(text):
    """Remove punctuation characters and English stopwords from ``text``.

    Every character listed in ``string.punctuation`` is deleted first. The
    result is split on single spaces, pieces found in NLTK's English stopword
    list are dropped, and the remaining pieces are joined back with single
    spaces.

    The stopword test is an exact string match, so it is case-sensitive.
    NOTE(review): the previewed clean_text values in this notebook still
    contain capitalised words such as "Our" and "All" - confirm whether
    lower-casing before the comparison is wanted.
    """
    punctuation_chars = set(string.punctuation)
    without_punctuation = ''.join(
        filter(lambda ch: ch not in punctuation_chars, text)
    )

    english_stopwords = set(stopwords.words('english'))
    kept_pieces = [
        piece for piece in without_punctuation.split(' ')
        if piece not in english_stopwords
    ]
    return ' '.join(kept_pieces)

#test_df['clean_text'] = test_df['text'].apply(clean)
#train_df['clean_text'] = train_df['text'].apply(clean)
#train_df.head()

In [84]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


#Si tira error porque no lo reconoce ejecutar el siguiente codigo
import nltk
nltk.download('wordnet')

#Si tira error por no encontrar el 'averaged_perceptron_tagger' ejecutar el siguiente codigo
#nltk.download('averaged_perceptron_tagger')

def getWordNetPOSTag(word):
    """Map the NLTK part-of-speech tag of ``word`` to a WordNet POS constant.

    The word is tagged on its own with ``nltk.pos_tag``. The upper-cased
    first letter of that tag selects adjective (J), noun (N), verb (V) or
    adverb (R); any other letter falls back to noun.
    """
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    wordnet_pos_by_letter = dict(
        J=wordnet.ADJ,
        N=wordnet.NOUN,
        V=wordnet.VERB,
        R=wordnet.ADV,
    )
    return wordnet_pos_by_letter.get(first_letter, wordnet.NOUN)

def lemmatizer(text):
    """Lemmatize every space-separated token of ``text`` with WordNet.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with ``pos='a'``
    (adjective); the results are joined back with single spaces.

    NOTE(review): the POS-aware variant based on ``getWordNetPOSTag`` is not
    used here. In the previewed rows of this notebook the resulting
    lemma_text is identical to clean_text - confirm that treating every
    token as an adjective is intended.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return " ".join(
        wordnet_lemmatizer.lemmatize(token, pos='a')
        for token in text.split(" ")
    )

def porterStemmer(text):
    """Stem every space-separated token of ``text`` with NLTK's PorterStemmer.

    Returns the stemmed tokens joined back with single spaces.
    """
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in text.split(" "))

def snowballStemmer(text):
    """Stem every space-separated token of ``text`` with the English SnowballStemmer.

    Returns the stemmed tokens joined back with single spaces.
    """
    stemmer = SnowballStemmer("english")
    return " ".join(map(stemmer.stem, text.split(" ")))

[nltk_data] Downloading package wordnet to /home/nicolas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [85]:
#FEATURES
# English stopwords, stored as a set: crear_features only performs membership
# tests (`w in stop`), which are constant-time on a set instead of a linear
# scan over the original list.
stop = set(stopwords.words('english'))

def _mean_word_length(text):
    """Average length of the whitespace-separated tokens of ``text`` (0.0 when there are none)."""
    tokens = str(text).split()
    if not tokens:
        # Empty or whitespace-only text: avoid a ZeroDivisionError.
        return 0.0
    return sum(len(token) for token in tokens) / len(tokens)


def crear_features(df):
    """Add text-derived feature columns to ``df`` in place.

    ``df`` must have a string column 'text'. Nothing is returned; the frame
    passed in is modified.

    Columns added:
      * text variants: clean_text, lemma_text, porter_stemmed_text,
        snowball_stemmed_text
      * categorical: words (list from splitting on single spaces), hashtags,
        mentions, urls (space-separated strings), stop_words (list of the
        lower-cased tokens found in ``stop``)
      * numeric: words_count, character_count, mean_word_length,
        punctuation_count, stop_words_count
    """
    # Text variants
    df['clean_text'] = df['text'].apply(clean)
    df['lemma_text'] = df['clean_text'].apply(lemmatizer)
    df['porter_stemmed_text'] = df['clean_text'].apply(porterStemmer)
    df['snowball_stemmed_text'] = df['clean_text'].apply(snowballStemmer)

    # Categorical
    df['words'] = df['text'].apply(lambda x: x.split(' '))
    df['hashtags'] = df['words'].apply(getHashtags)  # "ht1 ht2 ht3 ..."
    df['mentions'] = df['words'].apply(getMentions)  # "men1 men2 men3 ..."
    df['urls'] = df['words'].apply(getURLs)  # "url1 url2 url3 ..."
    df['stop_words'] = df['text'].apply(lambda x: [w for w in str(x).lower().split() if w in stop])

    # Numeric
    df['words_count'] = df['words'].apply(len)
    df['character_count'] = df['text'].str.len()
    df['mean_word_length'] = df['text'].apply(_mean_word_length)
    df['punctuation_count'] = df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
    # Reuse the stop_words lists built above instead of recomputing them.
    df['stop_words_count'] = df['stop_words'].apply(len)
    

In [86]:
# Add the engineered feature columns to both frames; crear_features modifies
# the frame it receives and returns nothing.
crear_features(train_df)
crear_features(test_df)
train_df.head()

Unnamed: 0,id,keyword,location,text,target,clean_text,lemma_text,porter_stemmed_text,snowball_stemmed_text,words,hashtags,mentions,urls,stop_words,words_count,character_count,mean_word_length,punctuation_count,stop_words_count
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds Reason earthquake May ALLAH Forgive us,Our Deeds Reason earthquake May ALLAH Forgive us,our deed reason earthquak may allah forgiv us,our deed reason earthquak may allah forgiv us,"[Our, Deeds, are, the, Reason, of, this, #eart...",earthquake,,,"[our, are, the, of, this, all]",13,69,4.384615,1,6
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,Forest fire near La Ronge Sask Canada,forest fire near La rong sask canada,forest fire near la rong sask canada,"[Forest, fire, near, La, Ronge, Sask., Canada]",,,,[],7,38,4.571429,1,0
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked shelter place notified off...,All residents asked shelter place notified off...,all resid ask shelter place notifi offic No ev...,all resid ask shelter place notifi offic no ev...,"[All, residents, asked, to, 'shelter, in, plac...",,,,"[all, to, in, are, being, by, no, other, or, i...",22,133,5.090909,3,11
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,13000 people receive wildfires evacuation orde...,13000 peopl receiv wildfir evacu order califor...,13000 peopl receiv wildfir evacu order califor...,"[13,000, people, receive, #wildfires, evacuati...",wildfires,,,[in],9,65,7.125,2,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent photo Ruby Alaska smoke wildfire...,Just got sent photo Ruby Alaska smoke wildfire...,just got sent photo rubi alaska smoke wildfir ...,just got sent photo rubi alaska smoke wildfir ...,"[Just, got, sent, this, photo, from, Ruby, #Al...",Alaska wildfires,,,"[just, this, from, as, from, into, a]",17,88,4.5,2,7


In [87]:
def trp(l, n):
    """Return list ``l`` cut to its first ``n`` items, zero-padded on the right up to length ``n``."""
    head = l[:n]
    missing = n - len(l)
    # A non-positive ``missing`` yields an empty padding list.
    padding = [0] * missing
    return head + padding


In [88]:
def tokenizacion_de_texto(texto, longitud):
    """Convert ``texto`` into a list of exactly ``longitud`` token ids.

    The text is wrapped as "[CLS] <texto> [SEP]", tokenized with the
    module-level ``tokenizer`` and mapped to vocabulary ids. ``trp`` then
    cuts the list to ``longitud`` items or right-pads it with zeros.

    Note: when the id list is longer than ``longitud`` the tail is dropped,
    which includes the id of the trailing [SEP] marker.
    """
    tokens = tokenizer.tokenize("[CLS] " + texto + " [SEP]")
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return trp(token_ids, longitud)

In [89]:
# Replace missing values with empty strings in both frames. keyword and
# location contain nulls (see the .info() outputs above) and are later
# concatenated with marker strings when tokenized.
train_df = train_df.fillna('')
test_df = test_df.fillna('')

In [90]:
# Parameters
vocab_size = 30522  # first argument of the Embedding layer; presumably the bert-base-uncased vocabulary size - TODO confirm
epochs = 2  # intended number of training epochs
maxlen = 72  # model input length; equals 60 + 5 + 7, the text + keyword + location slots built in tokenizar_df
n_words = 500  # NOTE(review): not referenced by any other visible cell
test_size = 0.25  # fraction of train_df held out by train_test_split
padding_texto = 60  # fixed length of every tokenized text column

In [91]:
#tokenización
def tokenizar_df(df):
    """Tokenize the text columns of ``df`` in place and build the model input column.

    Expects the columns produced by ``crear_features`` plus 'keyword' and
    'location' as strings. Nothing is returned; ``df`` is modified.

    Columns written:
      * ``*_tokenizado`` for text, lemma_text, porter_stemmed_text,
        snowball_stemmed_text and clean_text: id lists of length
        ``padding_texto``
      * keyword_tokenizado (length 5) and location_tokenizado (length 7)
      * the five numeric feature columns, each wrapped in a one-element list
      * features_tokenizados: texto_tokenizado + keyword_tokenizado +
        location_tokenizado, concatenated row by row
    """
    # (source column, tokenized column); the order fixes the column order in df.
    text_columns = (
        ('text', 'texto_tokenizado'),
        ('lemma_text', 'lemma_text_tokenizado'),
        ('porter_stemmed_text', 'porter_stemmed_text_tokenizado'),
        ('snowball_stemmed_text', 'snowball_stemmed_text_tokenizado'),
        ('clean_text', 'clean_text_tokenizado'),
    )
    for source_column, tokenized_column in text_columns:
        df[tokenized_column] = df[source_column].apply(
            lambda x: tokenizacion_de_texto(x, padding_texto)
        )
    df['keyword_tokenizado'] = df['keyword'].apply(lambda x: tokenizacion_de_texto(x, 5))
    df['location_tokenizado'] = df['location'].apply(lambda x: tokenizacion_de_texto(x, 7))

    # Wrap each numeric value in a one-element list so it can be concatenated
    # with the id lists. Values that are already lists are left alone, so
    # calling this function twice on the same frame does not nest them.
    numeric_columns = (
        'words_count',
        'character_count',
        'mean_word_length',
        'punctuation_count',
        'stop_words_count',
    )
    for column in numeric_columns:
        df[column] = df[column].apply(lambda x: x if isinstance(x, list) else [x])

    # Model input: raw-text ids followed by the keyword and location ids.
    # The other *_tokenizado text variants and the numeric columns can be
    # swapped or appended here to experiment with different inputs.
    df['features_tokenizados'] = (
        df['texto_tokenizado']
        + df['keyword_tokenizado']
        + df['location_tokenizado']
    )

In [92]:
# Tokenize both frames; tokenizar_df modifies the frame it receives and
# returns nothing.
tokenizar_df(train_df)
tokenizar_df(test_df)
train_df.head()

Unnamed: 0,id,keyword,location,text,target,clean_text,lemma_text,porter_stemmed_text,snowball_stemmed_text,words,...,punctuation_count,stop_words_count,texto_tokenizado,lemma_text_tokenizado,porter_stemmed_text_tokenizado,snowball_stemmed_text_tokenizado,clean_text_tokenizado,keyword_tokenizado,location_tokenizado,features_tokenizados
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds Reason earthquake May ALLAH Forgive us,Our Deeds Reason earthquake May ALLAH Forgive us,our deed reason earthquak may allah forgiv us,our deed reason earthquak may allah forgiv us,"[Our, Deeds, are, the, Reason, of, this, #eart...",...,[1],[6],"[101, 2256, 15616, 2024, 1996, 3114, 1997, 202...","[101, 2256, 15616, 3114, 8372, 2089, 16455, 96...","[101, 2256, 15046, 3114, 3011, 16211, 2243, 20...","[101, 2256, 15046, 3114, 3011, 16211, 2243, 20...","[101, 2256, 15616, 3114, 8372, 2089, 16455, 96...","[101, 102, 0, 0, 0]","[101, 102, 0, 0, 0, 0, 0]","[101, 2256, 15616, 2024, 1996, 3114, 1997, 202..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,Forest fire near La Ronge Sask Canada,forest fire near La rong sask canada,forest fire near la rong sask canada,"[Forest, fire, near, La, Ronge, Sask., Canada]",...,[1],[0],"[101, 3224, 2543, 2379, 2474, 6902, 3351, 2187...","[101, 3224, 2543, 2379, 2474, 6902, 3351, 2187...","[101, 3224, 2543, 2379, 2474, 6902, 2290, 2187...","[101, 3224, 2543, 2379, 2474, 6902, 2290, 2187...","[101, 3224, 2543, 2379, 2474, 6902, 3351, 2187...","[101, 102, 0, 0, 0]","[101, 102, 0, 0, 0, 0, 0]","[101, 3224, 2543, 2379, 2474, 6902, 3351, 2187..."
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked shelter place notified off...,All residents asked shelter place notified off...,all resid ask shelter place notifi offic No ev...,all resid ask shelter place notifi offic no ev...,"[All, residents, asked, to, 'shelter, in, plac...",...,[3],[11],"[101, 2035, 3901, 2356, 2000, 1005, 7713, 1999...","[101, 2035, 3901, 2356, 7713, 2173, 19488, 373...","[101, 2035, 24501, 3593, 3198, 7713, 2173, 202...","[101, 2035, 24501, 3593, 3198, 7713, 2173, 202...","[101, 2035, 3901, 2356, 7713, 2173, 19488, 373...","[101, 102, 0, 0, 0]","[101, 102, 0, 0, 0, 0, 0]","[101, 2035, 3901, 2356, 2000, 1005, 7713, 1999..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,13000 people receive wildfires evacuation orde...,13000 peopl receiv wildfir evacu order califor...,13000 peopl receiv wildfir evacu order califor...,"[13,000, people, receive, #wildfires, evacuati...",...,[2],[1],"[101, 2410, 1010, 2199, 2111, 4374, 1001, 3748...","[101, 19527, 2692, 2111, 4374, 3748, 26332, 13...","[101, 19527, 2692, 21877, 7361, 2140, 28667, 7...","[101, 19527, 2692, 21877, 7361, 2140, 28667, 7...","[101, 19527, 2692, 2111, 4374, 3748, 26332, 13...","[101, 102, 0, 0, 0]","[101, 102, 0, 0, 0, 0, 0]","[101, 2410, 1010, 2199, 2111, 4374, 1001, 3748..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent photo Ruby Alaska smoke wildfire...,Just got sent photo Ruby Alaska smoke wildfire...,just got sent photo rubi alaska smoke wildfir ...,just got sent photo rubi alaska smoke wildfir ...,"[Just, got, sent, this, photo, from, Ruby, #Al...",...,[2],[7],"[101, 2074, 2288, 2741, 2023, 6302, 2013, 1009...","[101, 2074, 2288, 2741, 6302, 10090, 7397, 561...","[101, 2074, 2288, 2741, 6302, 14548, 2072, 739...","[101, 2074, 2288, 2741, 6302, 14548, 2072, 739...","[101, 2074, 2288, 2741, 6302, 10090, 7397, 561...","[101, 102, 0, 0, 0]","[101, 102, 0, 0, 0, 0, 0]","[101, 2074, 2288, 2741, 2023, 6302, 2013, 1009..."


In [93]:
# Hold out a validation split from the labelled data (fixed random_state for a
# repeatable split), and keep the test-set features and ids for the submission.
X_train, X_test, y_train, y_test = train_test_split(train_df['features_tokenizados'], train_df['target'], test_size=test_size,
                                                  random_state=42)
FTest = test_df['features_tokenizados']
test_ids = test_df['id'] 

def tokenizar(textos):
    """Convert each text in ``textos`` into a list of token ids.

    Every text is wrapped as "[CLS] <text> [SEP]", tokenized with the
    module-level ``tokenizer`` and mapped to vocabulary ids. No padding or
    truncation is applied here, so the returned lists may differ in length.

    Parameters:
        textos: iterable of strings.

    Returns:
        list of lists of int, one inner list per input text.
    """
    secuencia = []
    for text in textos:
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = tokenizer.tokenize(marked_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        # Restored the `append` call: an error message had been pasted into
        # the middle of the method name, making the cell a syntax error.
        secuencia.append(indexed_tokens)
    return secuencia


In [94]:
# Stack the per-row id lists into 2-D arrays.
# X_test is deliberately NOT reassigned: it used to be overwritten with the
# Kaggle test features, so re-running this cell built padded_test from the
# wrong rows and it no longer matched y_test.
X_train = np.array([np.array(lista) for lista in X_train])
X_a = np.array([np.array(lista) for lista in X_test])
X_FTest = np.array([np.array(lista) for lista in FTest])

# Force all three inputs to exactly `maxlen` columns, including the Kaggle
# test features, so they always match the model's input length.
padded_train = pad_sequences(X_train, maxlen=maxlen, truncating='post')
padded_test = pad_sequences(X_a, maxlen=maxlen, truncating='post')
padded_FTest = pad_sequences(X_FTest, maxlen=maxlen, truncating='post')


In [112]:
#secuencia_train = tokenizar(X_train.values)
#secuencia_test = tokenizar(X_test.values)
#secuencia_FTest = tokenizar(FTest.values)

#padded_train = pad_sequences(secuencia_train, maxlen = maxlen, truncating = 'post')
#padded_test = pad_sequences(secuencia_test, maxlen = maxlen)
#padded_FTest = pad_sequences(secuencia_FTest, maxlen = maxlen)

In [95]:
# Conv1D model
model = tf.keras.Sequential([
    # 128-dimensional embedding per token id; inputs are id sequences of length maxlen.
    tf.keras.layers.Embedding(vocab_size, 128, input_length= maxlen),
    tf.keras.layers.Dropout(0.4),
    # 256 filters, each spanning 3 consecutive positions.
    tf.keras.layers.Conv1D(256, 3, activation='relu'),
    # Keep each filter's maximum over the sequence: one 256-vector per sample.
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    # Single sigmoid unit for the binary target.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 72, 128)           3906816   
_________________________________________________________________
dropout_28 (Dropout)         (None, 72, 128)           0         
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 70, 256)           98560     
_________________________________________________________________
global_max_pooling1d_11 (Glo (None, 256)               0         
_________________________________________________________________
dense_31 (Dense)             (None, 32)                8224      
_________________________________________________________________
dropout_29 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_32 (Dense)             (None, 1)               

In [96]:
# Train with the epoch count from the parameters cell (instead of a repeated
# literal), validating on the held-out split.
model.fit(padded_train, y_train, epochs=epochs, validation_data=(padded_test, y_test))

Train on 5709 samples, validate on 1904 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f1a74a1f6d0>

In [97]:
# `Sequential.predict_classes` is no longer available in recent TensorFlow
# releases. For a single sigmoid output the equivalent is thresholding the
# predicted probability at 0.5.
preds_locales = (model.predict(padded_test) > 0.5).astype('int32')
# predict returns one column per output unit; keep the only column as a flat Series.
preds_locales = pd.Series(preds_locales[:, 0])
#preds_locales

In [98]:
# F1 score of the validation predictions against the held-out labels.
print("F1 score:", f1_score(y_test, preds_locales))

F1 score: 0.7553816046966734


In [99]:
# Predict classes for the Kaggle test set. `predict_classes` is no longer
# available in recent TensorFlow releases; for a single sigmoid output the
# equivalent is thresholding the predicted probability at 0.5.
preds = (model.predict(padded_FTest) > 0.5).astype('int32')
preds = pd.Series(preds[:, 0])

# Pair every prediction with its tweet id. Both Series carry the default
# 0..n-1 index, so concat lines them up row by row. The unnamed prediction
# Series becomes column 0, which is renamed to 'target'.
df_preds = (
    pd.concat([test_ids, preds], axis=1)
    .rename(columns={0: 'target'})
    .set_index('id')
)
df_preds.to_csv('BERT-Conv1D-features.csv')
df_preds.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1
