## [75.06 / 95.58] Organización de Datos
## Trabajo Práctico 2: Competencia de Machine Learning
### Grupo 18: DATAVID-20

* 102732 - Bilbao, Manuel
* 101933 - Karagoz, Filyan
* 98684 - Markarian, Darío
* 100901 - Stroia, Lautaro

## 1. Importación general de librerias y set-up de datos.

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re as re

#Para instalar NLTK
#! pip3 install nltk
import nltk
#from nltk.corpus import stopwords
#from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
#from nltk.stem.wordnet import WordNetLemmatizer

#Para instalar gensim
#! pip3 install gensim
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

#Instalar tensorflow
#!pip3 install tensorflow
import tensorflow
from tensorflow import keras 
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_rows = None #mostrar todas las filas del df
%matplotlib inline
plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
plt.rcParams['figure.figsize'] = (20, 10)
sns.set(style="whitegrid") # seteando tipo de grid en seaborn
pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs


In [2]:
test_set = pd.read_csv('test.csv')
train_set = pd.read_csv('train.csv')
train_set.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Limpieza de datos
**Definicion de funciones auxiliares**

In [3]:
#Eliminar numeros de un texto
def eliminar_numeros(text):
    return re.sub("\d+", "",text)

#Eliminar puntuacion
def eliminar_puntuacion(text):
    return re.sub(r'[^\w\s]','',text)

#Pasar letras a minusculas
def minusculas(text):
    return text.lower()

#Eliminar caracteres especiales
def eliminar_caracteres(text):
    return re.sub('[^a-zA-Z0-9 \n\.]', '',text)

#Eliminar urls
def eliminar_url(text):
    url_reg = re.compile(r'https?://\S+|www\.\S+')
    return url_reg.sub(r'',text)

## 2. Word Embeddings para NLP

### Obtenemos un set de vectores de palabras pre-entrenados de:
#### https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

In [5]:
#! wget -P ~ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-07-08 18:58:15--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.82.22
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.82.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1,5G) [application/x-gzip]
Saving to: ‘/home/lauti/GoogleNews-vectors-negative300.bin.gz’


2020-07-08 19:04:56 (3,93 MB/s) - ‘/home/lauti/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [4]:
word2vec = KeyedVectors.load_word2vec_format('~/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [5]:
#Separamos la variable a predecir
Y = train_set['target']
train_set = train_set.drop('target',axis=1)

#Concateno sets de test y train
data = pd.concat([train_set,test_set],axis=0).reset_index(drop=True)

**Procesado de datos -> aplicar funciones de limpieza**

In [35]:
data['text'] = data['text'].apply(lambda x: eliminar_puntuacion(x))
data['text'] = data['text'].apply(lambda x: minusculas(x))
data['text'] = data['text'].apply(lambda x: eliminar_numeros(x))
data['text'] = data['text'].apply(lambda x: eliminar_caracteres(x))
data['text'] = data['text'].apply(lambda x: remove_stopwords(x))
data['text'] = data['text'].apply(lambda x: eliminar_url(x))

text_data = data['text']

In [36]:
#Separo los tweets en palabras (tokens)
tokenizer = Tokenizer()
#Actualiza vocab interno basado en una lista de textos (text_data)
tokenizer.fit_on_texts(text_data) 
word2index = tokenizer.word_index

In [37]:
#Cantidad de palabras distintas
len(word2index)

27497

In [38]:
#Armo secuencias de enteros a partir de los tokens
secuencias = tokenizer.texts_to_sequences(text_data)
secuencias[:5]

[[5334, 627, 116, 1653, 3357],
 [56, 88, 590, 7979, 7980, 1172],
 [1321, 1245, 1945, 546, 7981, 1515, 131, 1945, 546, 1408, 1017],
 [7, 4076, 1055, 131, 1408, 22],
 [20, 1246, 150, 5335, 2172, 137, 1055, 7982, 84]]

In [39]:
#Podemos observar que los vectores formados son de distinta longitud, por lo que habia
#que llevarlos a todos a la misma dimension agregando un padding
data_same_dim = sequence.pad_sequences(secuencias)
data_same_dim[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 5334,  627,  116, 1653, 3357],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,   56,   88,  590, 7979, 7980, 1172],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1321,
        1245, 1945,  546, 7981, 1515,  131, 1945,  546, 1408, 1017],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    7, 4076, 1055,  131, 1408,   22],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,   20, 1246,  150, 5335, 2172,  137, 1055, 7982,   84]],
      dtype=int32)

In [40]:
#dimension de la matriz. Acordar que tenemos ambos datasets (test y train) concatenados, por eso
#tantas filas
data_same_dim.shape

(10876, 21)

In [41]:
padded_test = data_same_dim[test_set.shape[0]:]
padded_train = data_same_dim[:test_set.shape[0]]

In [54]:
#Armo matriz de embeddings
matrix = np.zeros((len(word2index)+1,300))

for word, i in word2index.items():
    if word in list(word2vec.vocab):
        embedding = list(word2vec[word])
        if embedding is not None:
            matrix[i] = embedding
            
#Entra en loop infinito

KeyboardInterrupt: 