In [None]:
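# One-time setup: uncomment to download the NLTK data
# (the stop-word corpus loaded below requires it).
# import nltk
# nltk.download('all')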

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import metrics
import matplotlib.pyplot as plt
import string, re, emoji

# Limpieza de textos
from pattern.text.en import singularize, lemma
from nltk.corpus import stopwords
from wordcloud import WordCloud

from transformers import AutoTokenizer,TFBertModel
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy,BinaryCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy,BinaryAccuracy
from tensorflow.keras.utils import plot_model
import tensorflow
from sklearn.metrics import accuracy_score

# Disable row truncation: Series/DataFrame displays will show every row.
pd.set_option('display.max_rows', None)

# Carga de datos

In [None]:
# Column reference for the dataset (kept as a bare string so it shows in the cell):
'''
  id - a unique identifier for each tweet
  text - the text of the tweet
  location - the location the tweet was sent from (may be blank)
  keyword - a particular keyword from the tweet (may be blank)
  target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)
'''
# Read both splits from CSV files located in the working directory.
train = pd.read_csv('./train.csv', encoding='utf8')
test = pd.read_csv('./test.csv', encoding='utf8')

# English stop-word list from NLTK, loaded once and reused by removeStepWords.
cachedStopWords = stopwords.words('english')

# Análisis Exploratorio I

In [None]:
# Bar chart of the ten most frequent raw location values.
train['location'].value_counts().iloc[:10].plot(kind='bar')

In [None]:
# Bar chart of the ten most frequent raw keyword values.
train['keyword'].value_counts().iloc[:10].plot(kind='bar')

In [None]:
# Bar chart of how many tweets fall in each target class.
train['target'].value_counts().plot(kind='bar')

In [None]:
# Display the tweet texts as loaded, before any cleaning is applied.
train['text']

# Limpieza de datos

In [None]:
# List the column names available in the training frame.
train.columns

## Location

In [None]:
# Markers meaning "no usable country" wherever they appear inside the value.
_UNKNOWN_SUBSTRINGS = ('world', 'global', 'everywhere', 'earth', '?')

# Short markers that only count as whole words. As plain substrings they hid
# real places: 'ss' matched 'massachusetts' and 'tennessee', 'nan' matched 'hainan'.
_UNKNOWN_WORDS = ('nan', 'ss')

# Ordered rules, first match wins:
#   (label, substrings, whole words, trailing codes)
# - substrings: matched anywhere in the value.
# - whole words: short tokens that must stand alone ('us' must not match 'australia').
# - trailing codes: state/province/country abbreviations that must end the value
#   AND start at a word boundary ('ny' matches 'buffalo, ny' but not 'germany').
_COUNTRY_RULES = (
  ('italy', ('italy',), (), ()),
  ('india', ('mumbai',), ('india',), ()),
  ('switzerland', ('switzerland', 'geneva'), (), ()),
  ('china', ('china', 'hong kong'), (), ()),
  ('nigeria', ('nigeria', 'lagos'), (), ()),
  ('japan', ('japan', 'tokyo'), (), ()),
  (
    'canada',
    ('ontario', 'canada', 'toronto', 'calgary', 'alberta'),
    (),
    ('ab', 'bc'),
  ),
  (
    'uk',
    (
      'united kingdom', 'kingdom', 'british', 'scotland',
      'newcastle', 'england', 'london',
    ),
    (),
    ('uk',),
  ),
  (
    'usa',
    (
      # NOTE(review): 'manchester', 'liverpool' and 'philippines' are kept here
      # only to preserve the existing mapping; they look misfiled - confirm.
      'united states', 'new york', 'san francisco', 'los angeles',
      'new jersey', 'north carolina', 'st. louis', 'kansas city',
      'san diego', 'las vegas', 'sacramento', 'oregon', 'michigan',
      'manchester', 'portland', 'texas', 'u.s.', 'philippines', 'nevada',
      'arizona', 'lincoln', 'wisconsin', 'pennsylvania', 'seattle',
      'washington', 'florida', 'chicago', 'california', 'nashville',
      'colorado', 'denver', 'cleveland', 'atlanta', 'massachusetts',
      'boston', 'oklahoma', 'tennessee', 'liverpool', 'phoenix', 'baltimore',
    ),
    ('us', 'usa'),
    (
      'nyc', 'hi', 'va', 'ks', 'la', 'ak', 'md', 'mo', 'wi', 'az', 'ga',
      'ok', 'nj', 'wa', 'pa', 'ma', 'co', 'oh', 'il', 'tn', 'dc', 'ca',
      'tx', 'nc', 'fl', 'ny',
    ),
  ),
)


def _hasWord(value, words):
  """Return True when any of `words` occurs in `value` as a whole word."""
  return any(
    re.search(r"\b" + re.escape(word) + r"\b", value) for word in words
  )


def _endsWithCode(value, codes):
  """Return True when `value` ends with one of `codes` as a separate token."""
  if not codes:
    return False
  pattern = r"\b(?:" + "|".join(codes) + r")$"
  return re.search(pattern, value, re.I) is not None


def cleanCountry(value):
  """Collapse a free-text location into a coarse country label.

  Args:
    value: location text; the caller lower-cases it beforehand.

  Returns:
    'unknow' for non-string input, placeholder locations (missing values,
    'worldwide', values containing digits or '?', ...); a country label such
    as 'usa', 'uk' or 'canada' when a rule matches; otherwise `value`
    unchanged.
  """
  # The previous bare `except:` turned any failure into 'unknow'; the only
  # expected failure is non-string input, so handle exactly that.
  if not isinstance(value, str):
    return 'unknow'

  if (
    any(marker in value for marker in _UNKNOWN_SUBSTRINGS) or
    _hasWord(value, _UNKNOWN_WORDS) or
    re.search(r"[0-9]", value)
  ):
    return 'unknow'

  for country, substrings, words, codes in _COUNTRY_RULES:
    if (
      any(term in value for term in substrings) or
      _hasWord(value, words) or
      _endsWithCode(value, codes)
    ):
      return country

  return value

# Lower-case every location (missing values become the string 'nan') and
# collapse it to a country label in a single pass.
train['location'] = train['location'].apply(
  lambda raw: cleanCountry(str(raw).lower())
)

## Keyword

In [None]:
def toSingular(value):
  """Return the singular form of `value`, or `value` itself if that fails.

  Best-effort wrapper around `singularize`: any ordinary error (for example a
  non-string input) leaves the value untouched.
  """
  try:
    return singularize(value)
  except Exception:
    # `Exception` rather than a bare `except:` so that interrupting the kernel
    # (KeyboardInterrupt) or SystemExit is not silently swallowed.
    return value

def parseLemma(value):
  """Return the lemma of `value`, or `value` itself if that fails.

  Best-effort wrapper around `lemma`: any ordinary error (for example a
  non-string input) leaves the value untouched.
  """
  try:
    return lemma(value)
  except Exception:
    # `Exception` rather than a bare `except:` so that interrupting the kernel
    # (KeyboardInterrupt) or SystemExit is not silently swallowed.
    return value

def replaceSpace(value):
  """Return `value` as a string with every URL-encoded space ('%20') decoded.

  Non-string input is converted with `str()` first, so a missing value (NaN)
  comes back as the text 'nan'.
  """
  pieces = str(value).split('%20')
  return ' '.join(pieces)

# Normalise each keyword in one pass: singular form -> lemma -> decode '%20'.
# NOTE(review): replaceSpace stringifies its input, so missing keywords end up
# as the literal text 'nan' - confirm that is intended.
train['keyword'] = train['keyword'].apply(
  lambda keyword: replaceSpace(parseLemma(toSingular(keyword)))
)
train['keyword'].value_counts()

## Text

In [None]:
def removeHastags(value):
  """Remove every hashtag ('#' plus the non-space characters after it).

  Each match is deleted at its own position. The previous find-then-
  `str.replace` loop deleted the text of a short hashtag everywhere, so
  '#fire #firefighter' left the stray fragment 'fighter' behind.

  Args:
    value: text to clean.

  Returns:
    `value` without hashtags; surrounding whitespace is left as is.
  """
  return re.sub(r"#[^\s]*", '', value)

def removeLinks(value):
  """Remove every link ('http' plus the non-space characters after it).

  Matching is case-insensitive. Each match is deleted at its own position; the
  previous find-then-`str.replace` loop left fragments behind when one link
  was a prefix of another, and its case-sensitive 'http' guard skipped
  upper-case links even though the pattern itself ignored case.

  Args:
    value: text to clean.

  Returns:
    `value` without links; surrounding whitespace is left as is.
  """
  return re.sub(r"http[^\s]*", '', value, flags=re.I)

def removeStepWords(value):
  """Drop every whitespace-separated word found in `cachedStopWords`.

  The remaining words are re-joined with single spaces.
  """
  kept = []
  for word in value.split():
    if word in cachedStopWords:
      continue
    kept.append(word)
  return ' '.join(kept)

def extractEmojis(value):
  """Print `value` when one of its space-separated tokens is an emoji.

  The text itself is returned unchanged: this helper only reports tweets that
  contain emojis, it does not remove them.
  """
  found = [token for token in value.split(' ') if token in emoji.EMOJI_DATA]

  if ''.join(found):
    print(value)

  return value

def sentenceToSingular(value):
  """Singularize each space-separated word of `value` independently.

  Words are converted one by one and re-joined, so a word is never rewritten
  as a side effect of another. The previous `value.replace(word, singular)`
  also rewrote that word wherever it appeared inside longer words.

  Args:
    value: sentence whose words are separated by single spaces.

  Returns:
    The sentence with every word passed through `toSingular`.
  """
  # Empty tokens (from repeated spaces) are kept as is so spacing is preserved.
  return ' '.join(
    toSingular(word) if word else word for word in value.split(' ')
  )

def sentenceToPresent(value):
  """Lemmatize each space-separated word of `value` independently.

  Words are converted one by one and re-joined, so a word is never rewritten
  as a side effect of another. The previous `value.replace(word, lemma)` also
  rewrote that word wherever it appeared inside longer words.

  Args:
    value: sentence whose words are separated by single spaces.

  Returns:
    The sentence with every word passed through `parseLemma`.
  """
  # Empty tokens (from repeated spaces) are kept as is so spacing is preserved.
  return ' '.join(
    parseLemma(word) if word else word for word in value.split(' ')
  )

# Maps every punctuation character to a space.
translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def _stripNumbers(text):
  """Delete every run of digits except the emergency number '911'.

  Whole digit runs are matched. The previous single-digit pattern could never
  equal '911', so that exemption was dead code and '911' was always removed.
  """
  return re.sub(
    r"[0-9]+",
    lambda found: found.group() if found.group() == '911' else '',
    text,
  )


def cleanTweet(text):
  """Normalise one tweet for modelling.

  Args:
    text: raw tweet text.

  Returns:
    Lower-cased text without links, punctuation, stop words or numbers (other
    than '911'), with words reduced to singular / lemma form and single spaces
    between them.
  """
  text = text.lower()                         # lower-case everything
  text = re.sub(r"\butc\b", '', text)         # drop the 'utc' token only (not 'outcome', 'dutch', ...)
  text = text.replace('#', '')                # drop '#', keep the tag word
  text = text.replace('@', '')                # drop '@', keep the handle
  text = removeLinks(text)                    # drop links
  text = extractEmojis(text)                  # only reports tweets with emojis; text is unchanged
  text = text.translate(translator)           # punctuation -> spaces
  text = removeStepWords(text)                # drop stop words
  text = re.sub('  +', ' ', text)             # collapse repeated spaces
  text = sentenceToSingular(text)             # words to singular
  text = sentenceToPresent(text)              # words to lemma
  text = _stripNumbers(text)                  # drop numbers except '911'
  text = removeStepWords(text)                # stop words again (stemming may create new ones)
  text = re.sub('  +', ' ', text)             # collapse repeated spaces
  return text


cleanText = [cleanTweet(raw) for raw in train['text']]
train['text'] = cleanText[:]

# Clean the test tweets with the same function so that the text fed to the
# model at prediction time matches what it was trained on.
test['text'] = [cleanTweet(raw) for raw in test['text']]

In [None]:
# Print every cleaned tweet, one per line.
for tweet in train['text']:
  print(tweet)

# Análisis Exploratorio II

In [None]:
# Bar chart of the ten most frequent keywords after cleaning.
train['keyword'].value_counts().iloc[:10].plot(kind='bar')

In [None]:
# Bar chart of the ten most frequent locations after cleaning.
train['location'].value_counts().iloc[:10].plot(kind='bar')

In [None]:
# Fifteen most frequent keywords among tweets with target == 0.
train.loc[train['target'] == 0, 'keyword'].value_counts().iloc[:15].plot(kind='bar')

In [None]:
# Fifteen most frequent keywords among tweets with target == 1.
train.loc[train['target'] == 1, 'keyword'].value_counts().iloc[:15].plot(kind='bar')

In [None]:
# Word cloud built from all tweets labelled as a disaster (target == 1).
disaster_tweets = train.loc[train['target'] == 1]
# Join every tweet into one space-separated string for the word cloud.
disaster_string = pd.Series(list(disaster_tweets['text'])).str.cat(sep=' ')
cloud_options = dict(width=1600, height=800, max_font_size=100, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(disaster_string)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Word cloud built from all tweets labelled as non-disaster (target == 0).
formal_tweets = train.loc[train["target"] == 0]
# Join every tweet into one space-separated string for the word cloud.
formal_string = pd.Series(list(formal_tweets['text'])).str.cat(sep=' ')
cloud_options = dict(width=1600, height=800, max_font_size=100, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(formal_string)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# BERT 

In [None]:
# Load the tokenizer and the TensorFlow BERT model from the same checkpoint.
BERT_CHECKPOINT = 'bert-large-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert = TFBertModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
# Quick check: show what the tokenizer returns for a short sample sentence.
tokenizer('Prediciendo tweets')

In [None]:
# Longest tweet measured in whitespace-separated words
# (NOTE(review): this is a word count, not a tokenizer token count).
longest_tweet_words = max(len(tweet.split()) for tweet in train['text'])
print("max len of tweets", longest_tweet_words)
# Fixed sequence length used when building the model inputs.
max_length = 36

In [None]:
# Tokenize the training tweets into fixed-width tensors.
# padding='max_length' pads every sequence to exactly `max_length` tokens, the
# width the model's Input layers are declared with. padding=True only pads to
# the longest sequence in the batch, so the width was not guaranteed.
x_train = tokenizer(
    text=train['text'].tolist(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [None]:
# Shape of the tokenized training ids.
x_train['input_ids'].shape

In [None]:
# Shape of the matching attention mask.
x_train['attention_mask'].shape

In [None]:
# Training labels as a NumPy array.
y_train = train['target'].to_numpy()

In [None]:
# Display the label array.
y_train

In [None]:
# Number of training tweets per target class.
train['target'].value_counts()

## Construyendo el modelo

In [None]:
# Two integer inputs of fixed width `max_length`: token ids and attention mask.
input_ids = Input(shape=(max_length,), dtype=tensorflow.int32, name="input_ids")
input_mask = Input(shape=(max_length,), dtype=tensorflow.int32, name="attention_mask")

# Element [1] of the BERT outputs is used as the sentence representation
# (presumably the pooled output - TODO confirm against the transformers docs).
embeddings = bert(input_ids,attention_mask = input_mask)[1] 
out = tensorflow.keras.layers.Dropout(0.1)(embeddings)

# Classification head: Dense(128) -> Dropout -> Dense(32).
out = Dense(128, activation='relu')(out)
out = tensorflow.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)

# One sigmoid unit: the model outputs a single value in (0, 1) per tweet.
y = Dense(1,activation = 'sigmoid')(out)
    
model = tensorflow.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# NOTE(review): index 2 is presumably the BERT layer (after the two inputs);
# this depends on layer construction order - confirm with model.summary().
model.layers[2].trainable = True


In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# NOTE(review): `decay` is a legacy Keras optimizer argument; newer TensorFlow
# releases may reject it - confirm against the installed version.
optimizer = Adam(
    learning_rate=6e-06, # this learning rate is for bert model.
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics.
# The model's last layer already applies a sigmoid, so the loss receives
# probabilities, not logits. from_logits=True would apply a second sigmoid.
loss = BinaryCrossentropy(from_logits = False)
# A real list (the previous trailing comma created an accidental 1-tuple).
metric = [BinaryAccuracy('accuracy')]

# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

## Entrenamiento del modelo

In [None]:
# Train on the full training set: 4 epochs, batches of 10 tweets.
train_inputs = {
    'input_ids': x_train['input_ids'],
    'attention_mask': x_train['attention_mask'],
}
final = model.fit(x=train_inputs, y=y_train, epochs=4, batch_size=10)

## Precisión y pérdida

In [None]:
def visual_accuracy_and_loss(final):
    """Plot the per-epoch 'accuracy' and 'loss' curves stored in `final.history`.

    Args:
        final: object with a `history` mapping holding 'accuracy' and 'loss'
            sequences (here, the value returned by `model.fit`).
    """
    history = final.history
    curves = (
        (history['accuracy'], 'r', 'Accuracy'),
        (history['loss'], 'b:', 'Loss'),
    )
    # Epochs are numbered from 1 on the x axis.
    epoch_axis = np.arange(1, len(history['loss']) + 1)

    plt.clf()
    for values, line_format, label in curves:
        plt.plot(epoch_axis, values, line_format, label=label)
    plt.title('VISUALIZATION OF LOSS AND ACCURACY CURVE')
    plt.xlabel('Epochs')
    plt.legend()
    plt.show()

visual_accuracy_and_loss(final)

In [None]:
# Tokenize the test tweets into fixed-width tensors.
# padding='max_length' pads every sequence to exactly `max_length` tokens, the
# width the model's Input layers are declared with. padding=True only pads to
# the longest sequence in the batch, so the width was not guaranteed.
x_test = tokenizer(
    text=test.text.tolist(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [None]:
# Display the tokenized test inputs.
x_test

## Predicción

In [None]:
# Run the model on the tokenized test tweets.
test_inputs = {name: x_test[name] for name in ('input_ids', 'attention_mask')}
predicted = model.predict(test_inputs)

In [None]:
# Turn the model outputs into 0/1 labels using a 0.5 threshold.
y_predicted = (predicted > 0.5).astype(int)

In [None]:
# Flatten the predictions to one dimension. reshape(-1) works for any number
# of test rows instead of relying on the hard-coded count 3263.
y_predicted = y_predicted.reshape(-1)

In [None]:
# Display the flattened 0/1 predictions.
y_predicted

In [None]:
# Collect id, text and predicted label of every test tweet in one frame.
result = pd.DataFrame({
    'id': test.id,
    'text': test.text,
    'target': y_predicted,
})

In [None]:
# Preview the first rows of the prediction table.
result.head()

## Comprobando las predicciones (sobre el conjunto de entrenamiento)

In [None]:
# Re-tokenize the TRAINING tweets (despite the variable name) so the model can
# be scored against the known labels.
# padding='max_length' pads every sequence to exactly `max_length` tokens, the
# width the model's Input layers are declared with. padding=True only pads to
# the longest sequence in the batch, so the width was not guaranteed.
X_test_prediction = tokenizer(
    text=train['text'].tolist(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [None]:
# Run the model on the inputs tokenized in X_test_prediction.
check_inputs = {
    name: X_test_prediction[name] for name in ('input_ids', 'attention_mask')
}
test_prediction = model.predict(check_inputs)

In [None]:
# Turn the model outputs into 0/1 labels using a 0.5 threshold.
y_predicted_test = (test_prediction > 0.5).astype(int)

In [None]:
# Flatten the column of predictions into a one-dimensional array.
y_predicted_test = y_predicted_test.reshape(-1)

In [None]:
# Store the predicted labels next to the true ones in the training frame.
train['predicted_val'] = y_predicted_test

In [None]:
# Compares train['target'] with the predictions stored on the same frame, i.e.
# this is accuracy on the training data, not on held-out data.
print("Accuracy:", accuracy_score(train['target'], train['predicted_val']))

# Función de predicción

In [None]:
def functionToPredict(quant):
    """Classify the texts in the global `test_data` and report the first `quant`.

    Prints the first `quant` texts, then one verdict line per text, in order.

    Args:
        quant: how many leading entries of `test_data` to report on.

    Returns:
        One-dimensional array of 0/1 predictions, one per entry of `test_data`
        (all entries, not only the first `quant`).
    """
    print('testing...', test_data[0:quant])

    # padding='max_length' keeps the tensor width equal to the `max_length`
    # the model inputs were declared with, whatever the longest text is.
    text_to_predict = tokenizer(
        text=test_data,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids = False,
        return_attention_mask = True,
        verbose = True
    )

    test_prediction = model.predict({'input_ids':text_to_predict['input_ids'],'attention_mask':text_to_predict['attention_mask']})

    # 0.5 threshold on the model output, flattened to one label per text.
    y_predicted_test = np.where(test_prediction>0.5,1,0).reshape(-1)

    # Report each of the first `quant` predictions. The loop previously indexed
    # with `quant` itself, printing the same (wrong) entry on every iteration.
    for i in range(min(quant, len(y_predicted_test))):
        if (y_predicted_test[i] == 0):
            print('Identificado como no-desastre')
        else:
            print('Identificado como desastre')
    
    return y_predicted_test

In [None]:
# Start from the test tweets and overwrite the first entries with hand-written
# sentences to sanity-check the classifier.
sample_sentences = [
    'Yesterday I went to the dentist',
    'A car crashed in front of me',
    'My kid is a bomb!',
    'The airplane had an accident',
    'Too much traffic!!!',
    'An earthquake killed 5 persons',
    'Typhoon destroyed houses',
    'A girl who died in an airplane accident fifteen years ago',
    'I am going into a panic attack',
]

test_data = test['text'].tolist()
for position, sentence in enumerate(sample_sentences):
    test_data[position] = sentence

functionToPredict(9)