In [1]:
!pip install transformers



In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
#import plotly.express as px
import tensorflow as tf
import pandas as pd
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
import string
from nltk.corpus import stopwords
from sklearn.metrics import f1_score
import numpy as np

In [3]:
# Load the training and test splits from the parent directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('../train.csv', '../test.csv')
)

In [4]:
#helper functions

def getHashtags(words):
    """Return the hashtags found in ``words`` as one space-separated string.

    Args:
        words: iterable of string tokens (e.g. a tweet split on spaces).

    Returns:
        The tokens that start with '#', with every leading '#' removed,
        joined by single spaces. Tokens made only of '#' characters are
        skipped so they cannot contribute empty entries (and stray spaces).
    """
    stripped = (item.lstrip('#') for item in words if item.startswith('#'))
    return ' '.join(tag for tag in stripped if tag)

def getMentions(words):
    """Return the mentions found in ``words`` as one space-separated string.

    Args:
        words: iterable of string tokens (e.g. a tweet split on spaces).

    Returns:
        The tokens that start with '@', with every leading '@' removed,
        joined by single spaces. Tokens made only of '@' characters are
        skipped so they cannot contribute empty entries (and stray spaces).
    """
    stripped = (item.lstrip('@') for item in words if item.startswith('@'))
    return ' '.join(user for user in stripped if user)

def getURLs(words):
    """Return the tokens of ``words`` that start with 'http', space-separated.

    Args:
        words: iterable of string tokens.

    Returns:
        A single string with the matching tokens in their original order,
        or '' when there are none.
    """
    urls = []
    for token in words:
        if token.startswith('http'):
            urls.append(token)
    return ' '.join(urls)

def clean(text):
    """Strip punctuation characters and English stop words from ``text``.

    Args:
        text: the raw string to clean.

    Returns:
        ``text`` without any character of ``string.punctuation`` and without
        the space-separated words that are English stop words. Stop words
        are matched case-insensitively; surviving words keep their case.
    """
    # Build both lookup sets once and keep them on the function object:
    # re-reading the stop-word corpus for every row is wasted work.
    if not hasattr(clean, '_exclusions'):
        clean._exclusions = (
            frozenset(string.punctuation),
            frozenset(stopwords.words('english')),
        )
    punctuation, stop_words = clean._exclusions

    without_punctuation = ''.join(ch for ch in text if ch not in punctuation)

    # Lower-case only for the membership test, so capitalised stop words
    # such as "The" are removed as well.
    kept_words = [
        word for word in without_punctuation.split(' ')
        if word.lower() not in stop_words
    ]
    return ' '.join(kept_words)

In [5]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


#If an error is raised because WordNet is not recognized, run the following code
import nltk
nltk.download('wordnet')

#If an error is raised because 'averaged_perceptron_tagger' cannot be found, run the following code
#nltk.download('averaged_perceptron_tagger')

def getWordNetPOSTag(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the upper-cased
    first character of the resulting tag selects the constant:
    'J' -> adjective, 'V' -> verb, 'R' -> adverb; anything else -> noun.
    """
    pos_label = nltk.pos_tag([word])[0][1]
    initial = pos_label[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag fall back to noun.
    return wordnet.NOUN

def lemmatizer(text):
    """Lemmatize every space-separated token of ``text`` and re-join them.

    Args:
        text: string whose tokens are separated by single spaces.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wnl = WordNetLemmatizer()
    # NOTE(review): every token is lemmatized as an adjective (pos='a').
    # A POS-aware call, wnl.lemmatize(token, getWordNetPOSTag(token)), was
    # left disabled by the author -- confirm which behaviour is intended.
    return " ".join(wnl.lemmatize(token, pos='a') for token in text.split(" "))

def porterStemmer(text):
    """Stem every space-separated token of ``text`` with NLTK's PorterStemmer.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in text.split(" "))

def snowballStemmer(text):
    """Stem every space-separated token of ``text`` with NLTK's English SnowballStemmer.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stemmer = SnowballStemmer("english")
    return " ".join(stemmer.stem(token) for token in text.split(" "))

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/santiagoaj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
#FEATURES
stop = stopwords.words('english')
# Set copy of the same words: membership tests run once per word of every row.
_stop_set = frozenset(stop)


def _stop_words_in(text):
    """Return the lower-cased, whitespace-separated words of ``text`` that are stop words."""
    return [w for w in str(text).lower().split() if w in _stop_set]


def _mean_word_length(text):
    """Return the average length of the whitespace-separated words of ``text`` (0.0 if none)."""
    tokens = str(text).split()
    if not tokens:
        # Empty or whitespace-only text used to raise ZeroDivisionError.
        return 0.0
    return sum(len(w) for w in tokens) / len(tokens)


def crear_features(df):
    """Add text, categorical and numeric feature columns to ``df`` in place.

    Args:
        df: DataFrame with a string column 'text'.

    Returns:
        None. The columns are assigned directly on ``df``:
        clean_text, lemma_text, porter_stemmed_text, snowball_stemmed_text,
        words, hashtags, mentions, urls, stop_words, words_count,
        character_count, mean_word_length, punctuation_count,
        stop_words_count.
    """
    #text
    df['clean_text'] = df['text'].apply(clean)
    df['lemma_text'] = df['clean_text'].apply(lemmatizer)
    df['porter_stemmed_text'] = df['clean_text'].apply(porterStemmer)
    df['snowball_stemmed_text'] = df['clean_text'].apply(snowballStemmer)

    #categorical
    df['words'] = df['text'].apply(lambda x: x.split(' '))
    df['hashtags'] = df['words'].apply(getHashtags) #hashtags as "ht1 ht2 ht3 ..."
    df['mentions'] = df['words'].apply(getMentions) #mentions as "men1 men2 men3 ..."
    df['urls'] = df['words'].apply(getURLs) #urls as "url1 url2 url3 ..."
    df['stop_words'] = df['text'].apply(_stop_words_in)

    #numeric
    df['words_count'] = df['words'].apply(len)
    df['character_count'] = df['text'].str.len()
    df['mean_word_length'] = df['text'].apply(_mean_word_length)
    df['punctuation_count'] = df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
    # Same value as re-scanning the text: it is the size of the list computed above.
    df['stop_words_count'] = df['stop_words'].apply(len)
    

In [7]:
# Add the text, categorical and numeric feature columns to both frames (in place).
crear_features(train_df)
crear_features(test_df)

In [8]:
def trp(l, n):
    """Truncate list ``l`` to ``n`` items, or right-pad it with zeros up to ``n``.

    Args:
        l: list to adjust.
        n: target length (non-negative).

    Returns:
        A new list: the first ``n`` items of ``l`` followed by as many zeros
        as are needed to reach ``n`` items.
    """
    kept = l[:n]
    missing = n - len(l)  # <= 0 when nothing has to be padded
    return kept + [0] * missing

In [9]:
def tokenizacion_de_texto(texto,longitud):
    """Convert ``texto`` into a list of exactly ``longitud`` token ids.

    The text is wrapped as "[CLS] <texto> [SEP]", split into tokens by the
    module-level ``tokenizer``, mapped to vocabulary ids and then truncated
    or zero-padded by ``trp``.

    Args:
        texto: string to tokenize.
        longitud: number of ids in the returned list.

    Returns:
        List of token ids of length ``longitud``.
    """
    tokens = tokenizer.tokenize("[CLS] " + texto + " [SEP]")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    # NOTE(review): trp cuts from the end, so when there are more than
    # `longitud` ids the trailing ids (including the one for [SEP]) are lost.
    return trp(ids, longitud)

In [10]:
# Replace every missing value with '' so the columns tokenized later
# (tokenization concatenates strings) never hold NaN.
train_df, test_df = train_df.fillna(''), test_df.fillna('')

In [11]:
#tokenization
def tokenizar_df(df, longitud_texto=None, longitud_keyword=5, longitud_location=7):
    """Add token-id columns and the combined 'features_tokenizados' column to ``df`` in place.

    Args:
        df: DataFrame holding the string columns 'text', 'lemma_text',
            'porter_stemmed_text', 'snowball_stemmed_text', 'clean_text',
            'keyword', 'location' and the numeric feature columns
            'words_count', 'character_count', 'mean_word_length',
            'punctuation_count', 'stop_words_count'.
        longitud_texto: ids kept per text column; ``None`` (default) uses
            the notebook-level ``padding_texto``.
        longitud_keyword: ids kept for 'keyword' (default 5).
        longitud_location: ids kept for 'location' (default 7).

    Returns:
        None. With the defaults each row of 'features_tokenizados' holds
        padding_texto + 5 + 7 ids; keep ``maxlen`` in sync if these change.
    """
    if longitud_texto is None:
        longitud_texto = padding_texto

    # source column -> tokenized column, in the original creation order
    text_columns = {
        'text': 'texto_tokenizado',
        'lemma_text': 'lemma_text_tokenizado',
        'porter_stemmed_text': 'porter_stemmed_text_tokenizado',
        'snowball_stemmed_text': 'snowball_stemmed_text_tokenizado',
        'clean_text': 'clean_text_tokenizado',
    }
    for source, target in text_columns.items():
        df[target] = df[source].apply(lambda x: tokenizacion_de_texto(x, longitud_texto))
    df['keyword_tokenizado'] = df['keyword'].apply(lambda x: tokenizacion_de_texto(x, longitud_keyword))
    df['location_tokenizado'] = df['location'].apply(lambda x: tokenizacion_de_texto(x, longitud_location))

    # Wrap each numeric value in a one-item list so it can be concatenated
    # with the id lists. Values that are already lists are left alone, so
    # re-running this on the same frame does not nest them further.
    for column in ('words_count', 'character_count', 'mean_word_length',
                   'punctuation_count', 'stop_words_count'):
        df[column] = df[column].apply(lambda x: x if isinstance(x, list) else [x])

    # "+" on columns of lists concatenates the lists row by row.
    df['features_tokenizados'] = (  #TEXT (alternatives to swap in are listed below)
                                    df['texto_tokenizado']
                                    # df['clean_text_tokenizado']
                                    #df['lemma_text_tokenizado']
                                    #df['porter_stemmed_text_tokenizado']
                                    #df['snowball_stemmed_text_tokenizado']

                                    #CATEGORICAL
                                    + df['keyword_tokenizado']
                                    + df['location_tokenizado']

                                    #NUMERIC (currently disabled)
                                    #+ df['mean_word_length']
                                    #+ df['stop_words_count']
                                    )

In [12]:
#Parameters
# Tokenization: ids kept per text column by tokenizar_df.
padding_texto = 60
# Model input: sequence length (input_length / pad_sequences maxlen) and
# input dimension of the Embedding layer.
maxlen = 72
vocab_size = 30522
# Training.
epochs = 2
# Not referenced by the cells visible in this notebook.
n_words = 500
test_size = 0.25

In [13]:
# Add the token-id columns and the combined 'features_tokenizados' column to both frames (in place).
tokenizar_df(train_df)
tokenizar_df(test_df)

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import Sequential

In [15]:
def create_model(optimizer="adam", dropout=0.2, init='uniform', nbr_features=2500, dense_nparams=256):
    """Build and compile the Embedding -> Conv1D -> Dense binary classifier.

    Args:
        optimizer: optimizer passed to ``model.compile``.
        dropout: rate of both Dropout layers.
        init: kernel initializer of the hidden Dense layer.
        nbr_features: forwarded as ``input_shape=(nbr_features,)`` to the
            hidden Dense layer.
        dense_nparams: number of units of the hidden Dense layer.

    Returns:
        A compiled ``tf.keras.Sequential`` model (binary cross-entropy loss,
        accuracy metric). Reads the notebook-level ``vocab_size`` and ``maxlen``.
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential()
    model.add(layers.Embedding(vocab_size, 128, input_length= maxlen))
    model.add(layers.Dropout(dropout))
    model.add(layers.Conv1D(256, 3, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    # NOTE(review): input_shape on this non-first layer looks like it has no
    # effect (the recorded summary shows it fed by the 256-wide pooled
    # output) -- confirm and consider dropping it.
    model.add(layers.Dense(dense_nparams, activation='relu', input_shape=(nbr_features,), kernel_initializer=init))
    model.add(layers.Dropout(dropout))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer=optimizer,metrics=["accuracy"])
    return model

In [16]:
# scikit-learn compatible wrapper that builds a fresh network through create_model.
keras_estimator = KerasClassifier(verbose=1, build_fn=create_model)

In [31]:
# Grid-search parameters; the "kc__" prefix routes each entry to the
# pipeline step named "kc".
param_grid = dict(
    kc__epochs=[10],
    kc__dense_nparams=[32],
    kc__init=['uniform'],
    kc__batch_size=[16],
    kc__optimizer=['Adam'],
    kc__dropout=[0.2],
)

In [34]:
# Single-step pipeline; the step name "kc" is the prefix used in param_grid.
estimator = Pipeline(steps=[("kc", keras_estimator)])

In [35]:
kfold_splits = 5
# NOTE(review): the recorded output of the fit cell ends in a
# KeyboardInterrupt right after the parallel workers start; if the parallel
# Keras fits hang, try n_jobs=1 -- to be confirmed.
grid = GridSearchCV(
    estimator,
    param_grid,
    cv=kfold_splits,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)

In [36]:
# NOTE(review): X is reassigned from 'features_tokenizados' a couple of cells
# below, before this value is used by any visible cell.
X = train_df.drop(columns=['target'])

In [37]:
# Labels to predict.
y = train_df['target']

In [38]:
# Stack the per-row id lists into one array, then pad/truncate every row to maxlen.
X = np.array(list(map(np.array, train_df['features_tokenizados'])))
padded_train = pad_sequences(X, truncating='post', maxlen=maxlen)

In [39]:
# Run the cross-validated grid search on the padded training matrix.
grid_result = grid.fit(padded_train, y) #callbacks=[tbCallBack]

# Report the best configuration, then mean (std) test score of each candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.


KeyboardInterrupt: 

In [42]:
# Hyperparameters of the model trained without cross-validation
# (same values as the defaults of create_model).
dense_nparams = 256
nbr_features = 2500
init = 'uniform'
dropout = 0.2
optimizer = "adam"

In [51]:
# Build the network through create_model, so this cell and the grid search
# share a single architecture definition instead of a copied layer list.
model = create_model(
    optimizer=optimizer,
    dropout=dropout,
    init=init,
    nbr_features=nbr_features,
    dense_nparams=dense_nparams,
)

# Train on the padded matrix (the prediction cell feeds the equally padded
# padded_test) and for the number of epochs set in the parameters cell;
# model.fit otherwise runs a single epoch.
model.fit(padded_train, y, epochs=epochs)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 72, 128)           3906816   
_________________________________________________________________
dropout_6 (Dropout)          (None, 72, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 70, 256)           98560     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_7 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                

In [49]:
# Stack the per-row id lists into one array, then pad/truncate every row to maxlen.
X_test = np.array(list(map(np.array, test_df['features_tokenizados'])))
padded_test = pad_sequences(X_test, truncating='post', maxlen=maxlen)

In [50]:
# Sequential.predict_classes is no longer available in recent tf.keras
# releases; for a single sigmoid output the equivalent is thresholding the
# predicted probability at 0.5.
probabilities = model.predict(padded_test)
preds = pd.Series((probabilities[:, 0] > 0.5).astype('int32'))

test_ids = test_df['id']

# One row per test id, indexed by 'id', with the predicted class in 'target'.
df_preds = pd.concat([test_ids, preds.rename('target')], axis=1).set_index('id')
df_preds.to_csv('Conv1d-sin-cv.csv')
df_preds.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1
