In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load clean_train.csv and test.csv and tag every row with its origin, so both
# sets can go through the same feature engineering and be separated again later.
train = pd.read_csv('clean_train.csv')
test = pd.read_csv('test.csv')
train['source'] = 'train'
test['source'] = 'test'
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the
# row labels of `train` and `test` repeat, which makes label-based lookups on
# `merged` ambiguous.
merged = pd.concat([train, test], ignore_index=True)

In [3]:
def getTrainSet():
    """Return the rows of the global `merged` frame whose `source` is 'train'."""
    is_train = merged['source'] == 'train'
    return merged.loc[is_train]

def getTestSet():
    """Return the rows of the global `merged` frame whose `source` is 'test'."""
    is_test = merged['source'] == 'test'
    return merged.loc[is_test]

### Agregando características del texto

In [4]:
def getHashtags(words):
    """Collect the hashtags found in a list of tokens.

    Keeps the tokens that start with '#' and are longer than one character,
    strips every leading '#' from them and returns them joined by single spaces
    (an empty string when there are none).
    """
    hashtags = []
    for token in words:
        if token.startswith('#') and len(token) > 1:
            hashtags.append(token.lstrip('#'))
    return ' '.join(hashtags)

def getMentions(words):
    """Collect the user mentions found in a list of tokens.

    Keeps the tokens that start with '@' and are longer than one character,
    strips every leading '@' from them and returns them joined by single spaces
    (an empty string when there are none).
    """
    mentions = []
    for token in words:
        if token.startswith('@') and len(token) > 1:
            mentions.append(token.lstrip('@'))
    return ' '.join(mentions)

def getURLs(words):
    """Return, joined by single spaces, every token that starts with 'http'."""
    urls = filter(lambda token: token.startswith('http'), words)
    return ' '.join(urls)

# If the stopwords corpus is missing, run once:
#   import nltk
#   nltk.download('stopwords')

# Stopword sets already loaded from NLTK, keyed by language, so the corpus is
# read once instead of on every call.
_STOPWORDS_CACHE = {}

def removeSpecialWords(words, stop_words=None):
    """Remove stopwords, URLs, hashtags and mentions from a list of tokens.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter.
    stop_words : collection of str, optional
        Words to discard. Defaults to NLTK's English stopword list, which is
        loaded on first use and cached as a frozenset for fast membership tests.

    Returns
    -------
    str
        The surviving tokens joined by single spaces: those that are not in
        `stop_words` and do not start with '#', '@' or 'http'.
    """
    if stop_words is None:
        if 'english' not in _STOPWORDS_CACHE:
            _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORDS_CACHE['english']

    # Tokens starting with '#' or '@' are dropped here, so there is no leading
    # '#'/'@' left to strip from the tokens that are kept.
    return ' '.join(
        item for item in words
        if item not in stop_words and not item.startswith(('#', '@', 'http'))
    )

import re
# Matches every character that is not an ASCII letter.
regex = re.compile('[^a-zA-Z]')
def getPunctuationSigns(words):
    """Return, joined by single spaces, the tokens that contain no ASCII letter.

    A token qualifies when nothing is left after removing its non-letter
    characters, so purely numeric tokens such as '13,000' are included too.
    """
    signs = []
    for token in words:
        if not regex.sub('', token):
            signs.append(token)
    return ' '.join(signs)

def removePunctuationSigns(words):
    """Return the list of tokens that contain at least one ASCII letter.

    Uses the module-level `regex` to strip non-letter characters and keeps the
    token (unchanged) when something remains.
    """
    kept = []
    for token in words:
        if regex.sub('', token):
            kept.append(token)
    return kept

def arrayToLower(arr):
    """Return a new list with every string of `arr` converted to lowercase."""
    lowered = []
    for item in arr:
        lowered.append(item.lower())
    return lowered

def getWordsLengthAVG(text):
    """Return the average length of the whitespace-separated words in `text`.

    Returns 0 when `text` contains no words.
    """
    tokens = text.split()
    if not tokens:
        return 0

    return sum(map(len, tokens)) / len(tokens)

In [5]:
# NLTK tweet tokenizer with its default options.
tweet_tokenizer = TweetTokenizer()

# Tokenize every tweet and lowercase each token.
merged['words'] = merged['text'].map(lambda text: arrayToLower(tweet_tokenizer.tokenize(text)))
# Tokens that contain at least one letter (punctuation-only tokens removed).
merged['real_words'] = merged['words'].map(removePunctuationSigns)
# Remaining text once stopwords, hashtags, mentions and URLs are dropped.
merged['clean_text'] = merged['real_words'].map(removeSpecialWords)

# The columns below are space-joined strings extracted from the full token list.
# Punctuation signs: "s1 s2 s3 ..."
merged['punctuation_signs'] = merged['words'].map(getPunctuationSigns)
# Hashtags without the leading '#': "ht1 ht2 ht3 ..."
merged['hashtags'] = merged['words'].map(getHashtags)
# Mentions without the leading '@': "men1 men2 men3 ..."
merged['mentions'] = merged['words'].map(getMentions)
# URLs: "url1 url2 url3 ..."
merged['urls'] = merged['words'].map(getURLs)

In [6]:
# Total number of tokens in the tweet (every kind of token is included).
merged['entities_count'] = merged['words'].map(len)

# The remaining sources are space-joined strings, so each count is the number
# of whitespace-separated pieces.
merged['words_count'] = merged['clean_text'].str.split().str.len()  # words
merged['punctuations_signs_count'] = merged['punctuation_signs'].str.split().str.len()  # punctuation signs
merged['hashtags_count'] = merged['hashtags'].str.split().str.len()  # hashtags
merged['mentions_count'] = merged['mentions'].str.split().str.len()  # mentions
merged['urls_count'] = merged['urls'].str.split().str.len()  # URLs

# Stopwords are whatever is left after subtracting every other category.
# NOTE(review): a token without letters that starts with '#' or '@' (e.g. '#2015')
# is counted both as punctuation and as hashtag/mention, so this difference can
# come out too low for such tweets - confirm whether that matters.
merged['stopwords_count'] = (
    merged['entities_count']
    - merged['words_count']
    - merged['punctuations_signs_count']
    - merged['hashtags_count']
    - merged['mentions_count']
    - merged['urls_count']
)

In [7]:
# Average length of the words kept in clean_text.
merged['words_length_avg'] = merged['clean_text'].map(getWordsLengthAVG)

# Share of each token category over the total number of tokens in the tweet.
# NOTE(review): a tweet with zero tokens would divide by zero here - confirm none exist.
merged['punctuations_ratio'] = merged['punctuations_signs_count'].div(merged['entities_count'])
merged['hashtags_ratio'] = merged['hashtags_count'].div(merged['entities_count'])
merged['mentions_ratio'] = merged['mentions_count'].div(merged['entities_count'])
merged['urls_ratio'] = merged['urls_count'].div(merged['entities_count'])
merged['stopwords_ratio'] = merged['stopwords_count'].div(merged['entities_count'])

# Share of real words, and its complement (everything that is not a real word).
merged['real_words_ratio'] = merged['words_count'].div(merged['entities_count'])
merged['special_entities_ratio'] = merged['real_words_ratio'].rsub(1)

#### Gráficos de distribución para estas columnas agregadas

In [8]:
import plotly.figure_factory as ff

In [11]:
# Engineered numeric features whose distribution is compared between classes.
columns = ['entities_count', 'words_count', 'punctuations_signs_count', 'hashtags_count', 'mentions_count', 'urls_count',
          'stopwords_count', 'words_length_avg', 'punctuations_ratio', 'hashtags_ratio', 'mentions_ratio', 'urls_ratio',
          'stopwords_ratio', 'real_words_ratio', 'special_entities_ratio']
train = getTrainSet()

# Plot properties
colors = ["#57A773", "#EE6352"]
# NOTE(review): the same bin size is used for counts and for the *_ratio columns,
# which are fractions of entities_count - 0.5 is probably too coarse for those.
bin_size = 0.5

# Split the training rows by class once instead of re-filtering for every column.
not_disaster = train.loc[train['target'] == 0]
disaster = train.loc[train['target'] == 1]

# One distribution plot per feature, one trace per class.
for c in columns:
    data = [not_disaster[c].values, disaster[c].values]
    fig = ff.create_distplot(data, ["No desastre", "Desastre"], bin_size=bin_size, show_rug=False, colors=colors)
    fig.update_layout(title=c, title_x=0.5, autosize=False, width=700, height=350)

    fig.show()

## Aplicando mean encoding a keyword

In [10]:
# Fill missing keywords with a placeholder category.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object and is
# not guaranteed to update `merged` (pandas warns about it, and with
# Copy-on-Write it has no effect on the frame).
merged['keyword'] = merged['keyword'].fillna('undefined')

# Mean target per keyword; rows without a target (NaN) are ignored by mean().
# NOTE(review): despite the "cv" in the column name this is a plain per-keyword
# mean over all labelled rows, so each row's own target contributes to its
# encoding (possible target leakage) - confirm this is intended.
kw_group = merged.groupby('keyword')['target'].mean().reset_index()
kw_group.columns = ['keyword', 'keyword_cv_mean_enc']

# Add the keyword_cv_mean_enc column. A copy left by a previous run of this cell
# is dropped first so that re-running does not create _x/_y duplicate columns.
merged = pd.merge(
    merged.drop(columns=['keyword_cv_mean_enc'], errors='ignore'),
    kw_group,
    how='left',
    on='keyword',
)

In [11]:
# Sanity check: column dtypes and non-null counts of the merged frame.
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10766 entries, 0 to 10765
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        10766 non-null  int64  
 1   keyword                   10766 non-null  object 
 2   location                  7186 non-null   object 
 3   text                      10766 non-null  object 
 4   target                    7503 non-null   float64
 5   country                   7503 non-null   object 
 6   city                      7503 non-null   object 
 7   lat                       4273 non-null   float64
 8   lon                       4273 non-null   float64
 9   source                    10766 non-null  object 
 10  words                     10766 non-null  object 
 11  real_words                10766 non-null  object 
 12  clean_text                10766 non-null  object 
 13  punctuation_signs         10766 non-null  object 
 14  hashta

In [12]:
# Preview the first rows of the merged frame, including the engineered columns.
merged.head()

Unnamed: 0,id,keyword,location,text,target,country,city,lat,lon,source,...,stopwords_count,words_length_avg,punctuations_ratio,hashtags_ratio,mentions_ratio,urls_ratio,stopwords_ratio,real_words_ratio,special_entities_ratio,keyword_cv_mean_enc
0,1,undefined,,Our Deeds are the Reason of this #earthquake M...,1.0,undefined,undefined,,,train,...,6,4.666667,0.0,0.076923,0.0,0.0,0.461538,0.461538,0.538462,0.660714
1,4,undefined,,Forest fire near La Ronge Sask. Canada,1.0,undefined,undefined,,,train,...,0,4.428571,0.125,0.0,0.0,0.0,0.0,0.875,0.125,0.660714
2,5,undefined,,All residents asked to 'shelter in place' are ...,1.0,undefined,undefined,,,train,...,11,7.090909,0.12,0.0,0.0,0.0,0.44,0.44,0.56,0.660714
3,6,undefined,,"13,000 people receive #wildfires evacuation or...",1.0,undefined,undefined,,,train,...,1,7.8,0.125,0.125,0.0,0.0,0.125,0.625,0.375,0.660714
4,7,undefined,,Just got sent this photo from Ruby #Alaska as ...,1.0,undefined,undefined,,,train,...,7,4.571429,0.0,0.125,0.0,0.0,0.4375,0.4375,0.5625,0.660714


In [13]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [14]:
# If WordNet is not found, run once:
#   import nltk
#   nltk.download('wordnet')
#
# If 'averaged_perceptron_tagger' is not found, run once:
#   nltk.download('averaged_perceptron_tagger')

def getWordNetPOSTag(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged with `nltk.pos_tag` and the first letter of the tag is
    mapped to wordnet.ADJ / NOUN / VERB / ADV. Any other tag falls back to
    wordnet.NOUN.
    """
    # The notebook only imports nltk submodules, never the top-level package,
    # so `nltk.pos_tag` raised NameError. Import it locally where it is used.
    import nltk

    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatizer(text):
    """Lemmatize every token of `text` with the WordNet lemmatizer.

    The text is split on single spaces, each piece is lemmatized as an
    adjective (pos='a') and the results are joined back with single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    # A POS-aware variant would pass getWordNetPOSTag(token) as the second
    # argument instead of the fixed pos='a'.
    return " ".join(
        wordnet_lemmatizer.lemmatize(token, pos='a') for token in text.split(" ")
    )

def porterStemmer(text):
    """Stem every token of `text` with NLTK's PorterStemmer.

    The text is split on single spaces and the stems are joined back with
    single spaces.
    """
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in text.split(" "))

def snowballStemmer(text):
    """Stem every token of `text` with NLTK's English SnowballStemmer.

    The text is split on single spaces and the stems are joined back with
    single spaces.
    """
    stemmer = SnowballStemmer("english")
    return " ".join(stemmer.stem(token) for token in text.split(" "))

In [15]:
# Three normalised variants of clean_text: lemmatized, Porter-stemmed and Snowball-stemmed.
merged['lemma_text'] = merged['clean_text'].map(lemmatizer)
merged['porter_stemmed_text'] = merged['clean_text'].map(porterStemmer)
merged['snowball_stemmed_text'] = merged['clean_text'].map(snowballStemmer)

In [16]:
# Persist the full feature table without the row index.
# NOTE: despite the file name, `merged` holds both the train and the test rows;
# they can be told apart through the `source` column.
merged.to_csv('final_train.csv', index=False)