In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import metrics
import matplotlib.pyplot as plt
import string, re, emoji

# Limpieza de textos
from pattern.text.en import singularize, lemma
from nltk.corpus import stopwords

# Show every row when a Series/DataFrame is displayed (disables truncation)
pd.set_option('display.max_rows', None)

# Carga de datos

In [None]:
# Column reference for the dataset loaded below (kept as a bare string literal;
# it is not the last expression of the cell, so it produces no output).
'''
  id - a unique identifier for each tweet
  text - the text of the tweet
  location - the location the tweet was sent from (may be blank)
  keyword - a particular keyword from the tweet (may be blank)
  target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)
'''
# Read the training set from the notebook's working directory
train = pd.read_csv('./train.csv', encoding='utf8')

# English stop-word list, fetched once and reused by removeStepWords.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
cachedStopWords = stopwords.words('english')

# Análisis Exploratorio I

In [None]:
# Bar chart of the ten most frequent tweet locations
train['location'].value_counts().iloc[:10].plot(kind='bar')

In [None]:
# Bar chart of the ten most frequent keywords (before keyword cleaning)
train['keyword'].value_counts().iloc[:10].plot(kind='bar')

In [None]:
# Class balance: number of tweets per target value (1 = real disaster, 0 = not)
train['target'].value_counts().plot(kind='bar')

In [None]:
# Display the tweet text column as loaded, before any cleaning.
# NOTE(review): display.max_rows is set to None in the first cell, so this renders
# every row of the column — consider .head() / .sample() to keep the output small.
train['text']

# Limpieza de datos

In [None]:
# List the columns available in the training frame
train.columns

## Keyword

In [None]:
def toSingular(value):
  """Return the singular form of ``value``, falling back to ``value`` itself.

  Args:
    value: A word to singularize. Non-string inputs (e.g. the NaN pandas
      uses for a missing keyword) are returned untouched.

  Returns:
    The result of ``singularize(value)``, or ``value`` unchanged when it is
    not a string or when ``singularize`` raises.
  """
  if not isinstance(value, str):
    return value
  try:
    return singularize(value)
  except Exception:
    # Best effort: keep the original word when it cannot be singularized.
    # Only Exception is caught so KeyboardInterrupt/SystemExit still propagate
    # and a long-running .apply() can be interrupted.
    return value

def parseLemma(value):
  """Return the lemma (base form) of ``value``, falling back to ``value`` itself.

  Args:
    value: A word to lemmatize. Non-string inputs (e.g. the NaN pandas uses
      for a missing keyword) are returned untouched.

  Returns:
    The result of ``lemma(value)``, or ``value`` unchanged when it is not a
    string or when ``lemma`` raises.
  """
  if not isinstance(value, str):
    return value
  try:
    return lemma(value)
  except Exception:
    # Best effort: keep the original word when it cannot be lemmatized.
    # Only Exception is caught so KeyboardInterrupt/SystemExit still propagate
    # and a long-running .apply() can be interrupted.
    return value

def replaceSpace(value):
  """Decode URL-encoded spaces (``%20``) in ``value`` into real spaces.

  ``value`` is converted with ``str()`` first, so the result is always a
  string. NOTE(review): this means a missing value (NaN) comes back as the
  literal text 'nan' — confirm that is intended for blank keywords.
  """
  text = str(value)
  pieces = text.split('%20')
  return ' '.join(pieces)

# Normalise the keyword column: singular form -> lemma -> decode '%20' into spaces
train['keyword'] = (
  train['keyword']
  .apply(toSingular)
  .apply(parseLemma)
  .apply(replaceSpace)
)

# Frequency of every cleaned keyword
train['keyword'].value_counts()

## Text

In [None]:
def removeHastags(value):
  """Remove every hashtag (``#`` plus the following non-whitespace run) from ``value``.

  Each hashtag is removed at its own position in a single regex pass. The
  previous find-then-``str.replace`` approach corrupted a longer hashtag that
  starts with a shorter one (e.g. "#a #ab" left a stray "b").

  Args:
    value: The text to clean.

  Returns:
    ``value`` with all hashtags deleted; surrounding whitespace is kept.
  """
  return re.sub(r"#[^\s]*", '', value)

def removeLinks(value):
  """Remove every link (``http`` plus the following non-whitespace run) from ``value``.

  Each link is removed at its own position in a single case-insensitive regex
  pass. The previous find-then-``str.replace`` approach left fragments behind
  when one link was a prefix of another, and its case-sensitive ``'http' in
  value`` guard skipped upper-case links despite the ``re.I`` flag.

  Args:
    value: The text to clean.

  Returns:
    ``value`` with all links deleted; surrounding whitespace is kept.
  """
  return re.sub(r"http[^\s]*", '', value, flags=re.I)

def removeStepWords(value):
  """Drop stop words from ``value``.

  The text is split on whitespace, every token found in ``cachedStopWords``
  is discarded, and the remaining tokens are re-joined with single spaces.
  """
  kept = []
  for token in value.split():
    if token in cachedStopWords:
      continue
    kept.append(token)
  return ' '.join(kept)

def extractEmojis(value):
  """Print ``value`` when it contains an emoji token; return ``value`` unchanged.

  The text is split on single spaces and each token is looked up in
  ``emoji.EMOJI_DATA``. Nothing is removed from the text: this is an
  inspection aid that only reports tweets containing stand-alone emojis.
  """
  tokens = value.split(' ')
  found = [token for token in tokens if token in emoji.EMOJI_DATA]

  if ''.join(found):
    print(value)

  return value

def sentenceToSingular(value):
  """Singularize every space-separated word of ``value``.

  Words are transformed one token at a time and re-joined, so a word is never
  altered because a different, shorter word happens to be a substring of it
  (the previous ``str.replace`` on the whole sentence did exactly that).

  Args:
    value: The sentence to process; words are delimited by single spaces.

  Returns:
    The sentence with each word passed through ``toSingular``. Empty tokens
    (produced by consecutive spaces) are kept as-is so spacing is preserved.
  """
  return ' '.join(toSingular(item) if item else item for item in value.split(' '))

def sentenceToPresent(value):
  """Replace every space-separated word of ``value`` with its lemma.

  Words are transformed one token at a time and re-joined, so a word is never
  altered because a different, shorter word happens to be a substring of it
  (the previous ``str.replace`` on the whole sentence did exactly that).

  Args:
    value: The sentence to process; words are delimited by single spaces.

  Returns:
    The sentence with each word passed through ``parseLemma``. Empty tokens
    (produced by consecutive spaces) are kept as-is so spacing is preserved.
  """
  return ' '.join(parseLemma(item) if item else item for item in value.split(' '))

# Maps every ASCII punctuation character to a space so neighbouring words stay separated
translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Digit runs that must survive number stripping
keptNumbers = {'911'}

def _cleanTweet(text):
  """Return the cleaned version of a single tweet.

  Steps, in order: lower-case, strip '#' and '@', remove links, turn
  punctuation into spaces, drop stop words, report emojis, collapse spaces,
  singularize and lemmatize each word, strip numbers (except those in
  ``keptNumbers``), drop the stand-alone token 'utc', then drop stop words
  and collapse spaces once more.
  """
  text = text.lower()                         # Lower-case everything
  text = text.replace('#', '')                # Drop '#' (the tag text is kept)
  text = text.replace('@', '')                # Drop '@' (the user name is kept)
  text = removeLinks(text)                    # Remove links
  text = text.translate(translator)           # Replace punctuation with spaces
  text = removeStepWords(text)                # Remove stop words
  text = extractEmojis(text)                  # Only prints tweets containing emojis; text is unchanged
  text = re.sub(' +', ' ', text)              # Collapse repeated spaces
  text = sentenceToSingular(text)             # Convert words to singular
  text = sentenceToPresent(text)              # Convert words to their lemma

  # Remove whole digit runs in place. Substituting by position (instead of
  # str.replace per number found) keeps a protected number such as '911'
  # intact even when a shorter number like '11' appears in the same tweet.
  text = re.sub(
    r"[0-9]+",
    lambda match: match.group() if match.group() in keptNumbers else '',
    text,
  )

  # Drop 'utc' only when it is not embedded in a longer word, so that words
  # such as 'outcome' are left untouched.
  text = re.sub(r"(?<![a-z])utc(?![a-z])", '', text)
  text = removeStepWords(text)                # Remove stop words exposed by the steps above
  text = re.sub(' +', ' ', text)              # Collapse repeated spaces
  return text

# Only the text column is needed, so iterate over it directly instead of iterrows()
cleanText = [_cleanTweet(text) for text in train['text']]

train['text'] = cleanText

In [None]:
# Display the tweet text column after the cleaning cell above has overwritten it.
# NOTE(review): display.max_rows is set to None in the first cell, so this renders
# every row of the column — consider .head() / .sample() to keep the output small.
train['text']

In [None]:
# Separate the label from the features.
# Running this cell a second time raises KeyError because 'target' is already removed.
target = train['target']
train = train.drop(columns=['target'])

# Análisis Exploratorio II

In [None]:
# Bar chart of the ten most frequent keywords after keyword cleaning
train['keyword'].value_counts().iloc[:10].plot(kind='bar')