# Importación de librerías

In [502]:
import pandas as pd
import numpy as np
import re as re
import string
import nltk
nltk.download('wordnet')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics 
from sklearn.svm import LinearSVC

import gensim
from gensim.parsing.preprocessing import remove_stopwords

[nltk_data] Downloading package wordnet to /home/dario/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Lectura de los datos

Leemos los datos que usaremos para entrenar nuestro modelo.

## Lectura

In [503]:
data = pd.read_csv('train.csv', usecols = ['text','target'])

### Primer vistazo

In [504]:
data

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


# Limpieza de los datos

Realizamos una limpieza del texto que se corresponde a los tweets para obtener un mejor resultado a la hora de procesar los datos

## Definicion de funciones auxiliares

In [505]:
#Eliminar numeros de un texto
def eliminar_numeros(text):
    return re.sub("\d+", "",text)

#Eliminar puntuacion
def eliminar_puntuacion(text):
    return re.sub(r'[^\w\s]','',text)

#Pasar letras a minusculas
def minusculas(text):
    return text.lower()

#Eliminar caracteres especiales
def eliminar_caracteres(text):
    return re.sub('[^a-zA-Z0-9 \n\.]', '',text)

#Eliminar urls
def eliminar_url(text):
    url_reg = re.compile(r'https?://\S+|www\.\S+')
    return url_reg.sub(r'',text)

#Eliminar stopwords
def eliminar_stopwords(text):
    return remove_stopwords(text)

## Aplicamos las funciones a nuestros datos

In [506]:
data['text'] = data['text'].apply(eliminar_numeros)
data['text'] = data['text'].apply(eliminar_puntuacion)
data['text'] = data['text'].apply(minusculas)
data['text'] = data['text'].apply(eliminar_caracteres)
data['text'] = data['text'].apply(eliminar_url)
data['text'] = data['text'].apply(eliminar_stopwords)

### Salida

In [507]:
data

Unnamed: 0,text,target
0,deeds reason earthquake allah forgive,1
1,forest near la ronge sask canada,1
2,residents asked shelter place notified officer...,1
3,people receive wildfires evacuation orders cal...,1
4,got sent photo ruby alaska smoke wildfires pou...,1
...,...,...
7608,giant cranes holding bridge collapse nearby ho...,1
7609,ariaahrary thetawniest control wild fires cali...,1
7610,m utckm s volcano hawaii httptcozdtoydebj,1
7611,police investigating ebike collided car little...,1


## Elimino duplicados

In [508]:
data[data.duplicated()]

Unnamed: 0,text,target
48,check httptcoroinsmejj httptcotjzjin httptcoyd...,0
115,ir icemoon aftershock httptcovampodgyw djicemo...,0
119,ir icemoon aftershock httptcothyzomvwu djicemo...,0
164,experts france begin examining airplane debris...,1
228,luka die annihilated alois trancy,0
...,...,...
7604,worldnews fallen powerlines glink tram update ...,1
7607,stormchase violent record breaking ef el reno ...,1
7609,ariaahrary thetawniest control wild fires cali...,1
7610,m utckm s volcano hawaii httptcozdtoydebj,1


Luego de una primera limpieza notamos que aparecen algunos registros duplicados, esto se debe a que el campo de texto variaba en algo en específico como por ejemplo el url o si la letra está en mayúscula o minúscula, etc.

In [509]:
data.drop_duplicates(inplace = True)

# Remuevo contracciones

In [510]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"thx"   : "thanks"
}


def eliminar_contracciones(text):
    resultado = ""
    for word in text.split():
        if word in contractions.keys():
            resultado += contractions[word]
        else:
            resultado += word
        resultado += " "
    return resultado

### Ejemplo

In [511]:
text = "what's your name?"
eliminar_contracciones(text)

'what is your name? '

In [512]:
data['text'] = data['text'].apply(eliminar_contracciones)

In [513]:
data

Unnamed: 0,text,target
0,deeds reason earthquake allah forgive,1
1,forest near la ronge sask canada,1
2,residents asked shelter place notified officer...,1
3,people receive wildfires evacuation orders cal...,1
4,got sent photo ruby alaska smoke wildfires pou...,1
...,...,...
7603,officials quarantine place alabama home possib...,1
7605,flip im walmart bomb evacuate stay tuned blow,1
7606,suicide bomber kills saudi security site mosqu...,1
7608,giant cranes holding bridge collapse nearby ho...,1


# Stemming

Stemming es un método para reducir una palabra a su stem. Aumenta el recall ya que, por ejemplo, ejemplo una consulta sobre "bibliotecas" también encuentra documentos en los que solo aparezca "bibliotecario" porque el stem de las dos palabras es el mismo.

In [514]:
def stemmer(text):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text)
    
    stemmer = nltk.stem.PorterStemmer()
    
    return " ".join(stemmer.stem(token) for token in tokens) 

In [515]:
data['text'] = data['text'].apply(stemmer)

In [516]:
data

Unnamed: 0,text,target
0,deed reason earthquak allah forgiv,1
1,forest near la rong sask canada,1
2,resid ask shelter place notifi offic evacu she...,1
3,peopl receiv wildfir evacu order california,1
4,got sent photo rubi alaska smoke wildfir pour ...,1
...,...,...
7603,offici quarantin place alabama home possibl eb...,1
7605,flip im walmart bomb evacu stay tune blow,1
7606,suicid bomber kill saudi secur site mosqu reut...,1
7608,giant crane hold bridg collaps nearbi home htt...,1


# Lemmatization

La lematización es un proceso lingüístico que consiste en, dada una forma flexionada (es decir, en plural, en femenino, conjugada, etc), hallar el lema correspondiente. Hay varias palabras diferentes en representación de una misma palabra, por ejemplo sabemos que canto, cantas, canta, cantamos, cantáis, cantan son distintas formas (conjugaciones) de un mismo verbo (cantar)

In [517]:
def lemmatizer(text):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text)
    
    lemmatizer=nltk.stem.WordNetLemmatizer()
    
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

In [518]:
data['text'] = data['text'].apply(lemmatizer)

In [519]:
data

Unnamed: 0,text,target
0,deed reason earthquak allah forgiv,1
1,forest near la rong sask canada,1
2,resid ask shelter place notifi offic evacu she...,1
3,peopl receiv wildfir evacu order california,1
4,got sent photo rubi alaska smoke wildfir pour ...,1
...,...,...
7603,offici quarantin place alabama home possibl eb...,1
7605,flip im walmart bomb evacu stay tune blow,1
7606,suicid bomber kill saudi secur site mosqu reut...,1
7608,giant crane hold bridg collaps nearbi home htt...,1


# Particionando los datos

Particionamos nuestros datos en dos ya que una parte de ellos será utilizada para entrenar a nuestro algorítmo y la otra será utilizada para probarlo. En nuestro caso las variables X respresentan a el texto de los tweets e Y el target asociado a los tweets. La intención de nuestro algorítmo es dado un determinado texto de un tweet(x) poder predecir su target(y)

In [520]:
X_train,X_test,y_train,y_test= train_test_split(data.text, data.target, test_size=0.1,random_state=1)

In [521]:
len(X_train)

6742

In [522]:
len(X_test)

750

# Un primer modelo usando One-Hot-encoding + LinearSVC

Los clasificadores y algoritmos de aprendizaje no pueden procesar directamente los documentos de texto en su forma original, ya que la mayoría de ellos espera vectores de características numéricas con un tamaño fijo en lugar de documentos de texto sin procesar con longitud variable.

Llamaremos token a las palabras que conforman el texto de los tweets. Tendremos una matriz cuyas columnas son los tokens que aparecen en todos los tweets y las filas cada uno de los tweets. Sean aij los elementos de la matriz, el elemento aij tomará el valor 1 si el token aparece en el tweet o 0 si no lo hace.

In [523]:
def convertir_en_tokens(texto):
    tokens = []
    for palabra in texto.split():
        if not palabra in tokens:
            tokens.append(palabra)
    return tokens

In [524]:
vectorizer = CountVectorizer( tokenizer = convertir_en_tokens, binary = True)
X_train_dtm = vectorizer.fit_transform(X_train)

In [525]:
X_test_dtm = vectorizer.transform(X_test)

In [526]:
clasificador = LinearSVC()
clasificador.fit(X_train_dtm,y_train)

LinearSVC()

### Predicciones sobre el set de training

In [527]:
predicciones = clasificador.predict(X_train_dtm)

In [528]:
metrics.accuracy_score(y_train,predicciones)

0.9961435775734203

### Predicciones sobre el set de test

In [529]:
predicciones = clasificador.predict(X_test_dtm)

In [530]:
metrics.accuracy_score(y_test,predicciones)

0.7533333333333333

# Naive Bayes

## Representando el texto de los tweets como una matriz de datos numéricos

Como una mejora del modelo anterior podemos crear una matriz cuyas columnas sean los tokens que aparecen en todos los tweets y las filas cada uno de los tweets. Sean aij los elementos de la matriz. El elemento aij representa la cantidad de veces que aparece el token j en el tweet i.

### Ejemplo

Tomemos tres tweets cualesquiera.

In [531]:
ejemplo = ['hola mi nombre mi federico', 'me gusta mi nombre', 'mi nombre nombre es federico']

In [532]:
vect = CountVectorizer(analyzer='word',binary=False)

In [533]:
vect.fit(ejemplo)

CountVectorizer()

In [534]:
vect.get_feature_names()

['es', 'federico', 'gusta', 'hola', 'me', 'mi', 'nombre']

In [535]:
ejemplo_dtm = vect.transform(ejemplo)
ejemplo_dtm

<3x7 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [536]:
ejemplo_array = ejemplo_dtm.toarray()

In [537]:
pd.DataFrame(ejemplo_array,columns = vect.get_feature_names())

Unnamed: 0,es,federico,gusta,hola,me,mi,nombre
0,0,1,0,1,0,2,1
1,0,0,1,0,1,1,1
2,1,1,0,0,0,1,2


### Observación

Como la mayoría de los tweets utilizan un conjunto reducido de tokens con respecto al total de tokens con el que trabajaremos, se espera que la mayoría de los elementos de la matriz sean cero.

## Armando la matriz para nuestros datos

### Instanciando el vectorizador

In [538]:
vect = CountVectorizer(analyzer='word',binary=False)

### Obtenemos una lista con todos los tokens obtenidos de los tweets

In [539]:
vect.fit(X_train)

CountVectorizer()

In [540]:
vect.get_feature_names()[:10]

['aa',
 'aaaa',
 'aaaaaaallll',
 'aaaaaand',
 'aaarrrgghhh',
 'aaceorg',
 'aampb',
 'aampw',
 'aan',
 'aannnnd']

### Armamos la matriz

In [541]:
X_train_dtm = vect.transform(X_train)
X_train_dtm

<6742x16839 sparse matrix of type '<class 'numpy.int64'>'
	with 59055 stored elements in Compressed Sparse Row format>

In [542]:
X_train_array = X_train_dtm.toarray()

In [543]:
pd.DataFrame(X_train_array,columns = vect.get_feature_names())

Unnamed: 0,aa,aaaa,aaaaaaallll,aaaaaand,aaarrrgghhh,aaceorg,aampb,aampw,aan,aannnnd,...,zone,zonesthank,zonewolf,zoom,zotar,zouma,zourryart,zrnf,zxatheti,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6738,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6739,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6740,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


*** OBS: Notemos que la matriz tiene tantas filas como cantidad de tweets y tantas columnas como tokens. En este caso se encontraron 17413 tokens. Podemos notar como la mayoría de los elementos de la matriz son cero como mencionamos anteriormente ***

## Entrenando el modelo

Usaremos el clasificador multinomial Naive Bayes para realizar  la clasificación

In [544]:
nb = MultinomialNB()

Entrenamos el modelo

In [545]:
%time nb.fit(X_train_dtm, y_train)

CPU times: user 5.45 ms, sys: 73 µs, total: 5.52 ms
Wall time: 4.82 ms


MultinomialNB()

## Probamos el modelo

### Sobre el set de train

In [546]:
predicciones_train = nb.predict(X_train_dtm)

In [547]:
metrics.accuracy_score(y_train,predicciones_train)

0.915752002373183

### Sobre el set de test

In [548]:
X_test_dtm = vect.transform(X_test)
X_test_dtm

<750x16839 sparse matrix of type '<class 'numpy.int64'>'
	with 5212 stored elements in Compressed Sparse Row format>

In [549]:
predicciones_test = nb.predict(X_test_dtm)

In [550]:
metrics.accuracy_score(y_test,predicciones_test)

0.804

### Preparando el submit de Kaggle

### Leemos los datos

In [557]:
submit= pd.read_csv('test.csv')

In [558]:
submit

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


### Limpiamos los datos

In [559]:
submit['text'] = submit['text'].apply(eliminar_numeros)
submit['text'] = submit['text'].apply(eliminar_puntuacion)
submit['text'] = submit['text'].apply(minusculas)
submit['text'] = submit['text'].apply(eliminar_url)
submit['text'] = submit['text'].apply(remove_stopwords)
submit['text'] = submit['text'].apply(stemmer)
submit['text'] = submit['text'].apply(lemmatizer)
submit['text'] = submit['text'].apply(eliminar_contracciones)

In [560]:
submit

Unnamed: 0,id,keyword,location,text
0,0,,,happen terribl car crash
1,2,,,heard earthquak differ citi stay safe
2,3,,,forest spot pond gee flee street save
3,9,,,apocalyps light spokan wildfir
4,11,,,typhoon soudelor kill china taiwan
...,...,...,...,...
3258,10861,,,earthquak safeti lo angel ûò safeti fasten xrwn
3259,10865,,,storm ri wors hurrican cityampoth hardest hit ...
3260,10868,,,green line derail chicago httptcoutbxlcbiuy
3261,10874,,,meg issu hazard weather outlook hwo httptcoxrb...


### Predecimos

In [561]:
texts = vect.transform(submit['text'])
predicciones_kaggle = nb.predict(texts)
submit = pd.DataFrame(submit['id'])
submit['target'] = predicciones_kaggle
#submit.to_csv('SUBMITS/submission-bayes.csv',index=False)