<a href="https://colab.research.google.com/github/darasliwinski/nlp_disaster/blob/update_lower/nlpDisaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries and load data

In [1]:
import pandas as pd
import numpy as np

# For cleaning the text
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import regex as re
import string

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_colwidth', None)

# For building our model
import tensorflow.keras
import sklearn
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D

In [2]:
import tensorflow as tf
import tensorflow_hub as hub

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# move to directory where original notebook is stored
%cd "/content/drive/MyDrive/Colab Notebooks"

/content/drive/MyDrive/Colab Notebooks


In [5]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv') 
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


## Clean Data

In [6]:
nlp = spacy.load('en_core_web_sm')

nltk.download('stopwords')
nltk.download('punkt')

# spacy (362 words)
spacy_st = nlp.Defaults.stop_words
# nltk(179 words)
nltk_st = stopwords.words('english')

def clean(tweet, http = True, punc = True, lem = True, stop_w = True):
    
    if http is True:
        tweet = re.sub("https?:\/\/t.co\/[A-Za-z0-9]*", '', tweet)

    # stop words
    if stop_w == 'nltk':
        tweet = [word for word in word_tokenize(tweet) if not word.lower() in nltk_st]
        tweet = ' '.join(tweet)

    elif stop_w == 'spacy':
        tweet = [word for word in word_tokenize(tweet) if not word.lower() in spacy_st]
        tweet = ' '.join(tweet)

    # lemmitizing
    if lem == True:
        lemmatized = [word.lemma_ for word in nlp(tweet)]
        tweet = ' '.join(lemmatized)

    # punctuation removal
    if punc is True:
        tweet = tweet.translate(str.maketrans('', '', string.punctuation))
        
    # removing extra space
    tweet = re.sub("\s+", ' ', tweet)
    
    return tweet

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
# apply function to clean data
df['cleaned_text'] = df.text.apply(lambda x: clean(x, lem = False, stop_w = 'nltk', http = True, punc = True))
#test['cleaned_text'] = test.text.apply(lambda x: clean(x, lem = False, stop_w = 'nltk', http = True, punc = True))

In [8]:
# convert to lower
df['cleaned_text'] = df['cleaned_text'].apply(lambda x:x.lower())
#test['cleaned_text'] = test['cleaned_text'].apply(lambda x:x.lower())


In [9]:
df = df[["cleaned_text", "target"]]
df.columns = ["text", "labels"]

In [10]:
df

Unnamed: 0,text,labels
0,deeds reason earthquake may allah forgive us,1
1,forest fire near la ronge sask canada,1
2,residents asked shelter place notified officers evacuation shelter place orders expected,1
3,13000 people receive wildfires evacuation orders california,1
4,got sent photo ruby alaska smoke wildfires pours school,1
...,...,...
7608,two giant cranes holding bridge collapse nearby homes,1
7609,ariaahrary thetawniest control wild fires california even northern part state troubling,1
7610,m194 0104 utc 5km volcano hawaii,1
7611,police investigating ebike collided car little portugal ebike rider suffered serious nonlife threatening injuries,1


In [11]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
eval_df, test_df = train_test_split(test_df, test_size=0.50, random_state=42)

In [15]:
len(test_df)

571

In [None]:
import csv
import os
import torch
!pip install simpletransformers
from transformers import pipeline
import gc
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.special import softmax
from simpletransformers.classification import (ClassificationModel, ClassificationArgs)
import sklearn
from sklearn.model_selection import train_test_split

In [17]:

# Create a ClassificationModel
model_args = ClassificationArgs(num_train_epochs=2, 
                                overwrite_output_dir=True)
model_args.manual_seed = 42
model_args.best_model_dir = "/kaggle/working/best_model"
model_args.output_dir = "/kaggle/temp/output"
model_args.normalization = True #this enables the built-in Bertweet custom tokenizer

model_args.reprocess_input_data = True
model_args.evaluate_during_training = True
#model_args.evaluate_during_training_verbose = True
model_args.train_batch_size = 80
model_args.eval_batch_size = 80

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

model = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            use_cuda = True,
                            args = model_args, 
                            num_labels = 2)

Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

Downloading:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
model.train_model(train_df, eval_df=eval_df,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score)

  0%|          | 0/6471 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

(162,
 defaultdict(list,
             {'global_step': [81, 162],
              'train_loss': [0.4604591131210327, 0.28260868787765503],
              'mcc': [0.6295174348884925, 0.6698970367836923],
              'tp': [196, 194],
              'tn': [271, 285],
              'fp': [55, 41],
              'fn': [49, 51],
              'auroc': [0.8811881807937898, 0.8986290221610116],
              'auprc': [0.8734192398579267, 0.8962640076543981],
              'acc': [0.8178633975481612, 0.8388791593695272],
              'f1': [0.7903225806451613, 0.8083333333333333],
              'eval_loss': [0.44867344573140144, 0.422227218747139]}))

In [None]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)

  0%|          | 0/571 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
result

{'mcc': 0.6714696685428393,
 'tp': 200,
 'tn': 279,
 'fp': 46,
 'fn': 46,
 'auroc': 0.8984177611006879,
 'auprc': 0.9018956145123849,
 'acc': 0.8388791593695272,
 'f1': 0.8130081300813008,
 'eval_loss': 0.4157301262021065}

In [19]:
predictions, raw_outputs = model.predict(test_df["text"].to_list())

  0%|          | 0/571 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

In [20]:
mypreds = pd.DataFrame(test_df)
mypreds["preds"] = predictions

In [21]:
mypreds

Unnamed: 0,text,labels,preds
5113,half poll respondents worry nuclear disaster fading public consciousness fukushima,0,1
6119,horrible sinking feeling youûªve home phone realise 3g whole time,1,0
6076,davidcovucci ca nt sinkhole swallowed every taco place neighborhood,0,0
1129,wasp dive bombed face,0,0
3794,133 n past 5 l lane reopened lanes closed lanes open 133 s trash truck fire cleanup knx1070,1,1
...,...,...,...
5023,someone split mudslide w get work,0,0
2418,breakfast links work home derailed empty train derailed smithsonian morning suspending ser,1,1
3357,rocky fire northern california swells 60000 acres 12000 evacuated portland phoenix miami atlanta casper,1,1
1606,get feeling society collapse implode nt hero play part,0,0


In [22]:
from sklearn.metrics import confusion_matrix
confusion_matrix(mypreds['labels'], mypreds['preds'])

array([[272,  53],
       [ 53, 193]])

In [24]:
from sklearn import metrics
metrics.roc_auc_score(mypreds['labels'], mypreds['preds'])

0.8107379612257661

In [26]:
metrics.f1_score(mypreds['labels'], mypreds['preds'])

0.7845528455284553

In [27]:
metrics.roc_auc_score(mypreds['labels'], mypreds['preds'])

0.8107379612257661