# Natural Language Processing with Disaster Tweets

*Predict which Tweets are about real disasters and which ones are not.*

The goal of this notebook is to apply lesson 3 of the FastAI deep learning tutorial's [notebook](https://www.kaggle.com/code/jhoward/getting-started-with-nlp-for-absolute-beginners) to another dataset.

Additional resource used: [link](https://www.kaggle.com/code/mohamedabdullah/disaster-tweets-solution).

## Setup

In [None]:
!pip install kaggle

In [None]:
!mv .kaggle /root/

In [None]:
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c nlp-getting-started

In [None]:
!unzip nlp-getting-started.zip

## Data exploration

### Data loading and structure

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('./train.csv')

In [None]:
df.head()

In [None]:
df.describe(include='object')

In [None]:
df.isnull().sum().plot(kind='bar')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

color = [sns.xkcd_rgb['medium blue'], sns.xkcd_rgb['pale red']]
sns.countplot(x='target',data = df, palette = color)
plt.gca().set_ylabel('Samples')

### Distribution of character, word, and sentence counts per tweet

In [None]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize, sent_tokenize

# Per-tweet length features: characters, NLTK word tokens, NLTK sentences.
df['char_len'] = df.text.str.len()

# number of word tokens in each tweet
word_tokens = df.text.map(lambda tweet: len(word_tokenize(tweet))).tolist()
df['word_len'] = word_tokens

# number of sentences in each tweet
sent_tokens = df.text.map(lambda tweet: len(sent_tokenize(tweet))).tolist()
df['sent_len'] = sent_tokens

plot_cols = ['char_len','word_len','sent_len']
plot_titles = ['Character Length','Word Length','Sentence Length']

# One subplot per feature, overlaying the two classes.
plt.figure(figsize=(20,4))
for position, (col, title) in enumerate(zip(plot_cols, plot_titles), start=1):
    plt.subplot(1, 3, position)
    disaster_values = df.loc[df.target == 1, col]
    non_disaster_values = df.loc[df.target == 0, col]
    sns.distplot(disaster_values, label='Disaster', color=color[1]).set_title(title)
    sns.distplot(non_disaster_values, label='Non-Disaster', color=color[0])
    plt.legend()

In [None]:
# Investigate the Outliers
# A notebook cell renders only its last bare expression, so both tables are
# shown explicitly; otherwise the sentence-length table is computed and
# silently discarded. `display` is the function IPython/Jupyter injects into
# the notebook namespace.

display(df[df.sent_len > 8])
display(df[df.word_len > 50])

# => make sure to deal with the punctuation

### Plot most common stopwords

In [None]:
## Plot most common stopwords
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

# Get all the word tokens in dataframe for Disaster and Non-Disaster.
# The lists are built directly: a comprehension used only for its .append()
# side effect also allocates a throwaway list of None values.
corpus0 = [word.lower() for tweet in df[df.target == 0].text for word in word_tokenize(tweet)]  # Non-Disaster
corpus1 = [word.lower() for tweet in df[df.target == 1].text for word in word_tokenize(tweet)]  # Disaster

# Function for counting top stopwords in a corpus
def count_top_stopwords(corpus):
    """Return the ten most frequent stopwords in ``corpus`` with their counts.

    Only tokens contained in the module-level ``stop`` collection are counted.

    Args:
        corpus: Iterable of word tokens.

    Returns:
        A pair ``(words, counts)`` of equal-length tuples ordered from most to
        least frequent (ties keep first-seen order). Both tuples are empty
        when ``corpus`` contains no stopwords.
    """
    # Local import: the notebook only imports Counter in a later cell.
    from collections import Counter

    topwords = Counter(word for word in corpus if word in stop).most_common(10)
    if not topwords:
        # zip(*[]) yields nothing, so unpacking into two names would raise
        # ValueError; return two empty tuples instead.
        return (), ()
    x, y = zip(*topwords)  # split (word, count) pairs into parallel tuples
    return x, y

x0,y0 = count_top_stopwords(corpus0)
x1,y1 = count_top_stopwords(corpus1)

# Plot bar plot of top stopwords for each class
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
plt.bar(x0,y0, color=color[0])
plt.title('Top Stopwords for Non-Disaster Tweets')
plt.subplot(1,2,2)
plt.bar(x1,y1, color=color[1])
plt.title('Top Stopwords for  Disaster Tweets')

### Plot most common punctuation

In [None]:
## Plot most common punctuations

from string import punctuation

# Get all the punctuations in dataframe for Disaster and Non-Disaster.
# Each tweet is scanned character by character and only characters found in
# string.punctuation are kept; the filter is folded into the comprehension
# instead of appending everything and filtering afterwards.
corpus0 = [c for tweet in df[df.target == 0].text for c in tweet if c in punctuation]  # Non-Disaster
corpus1 = [c for tweet in df[df.target == 1].text for c in tweet if c in punctuation]  # Disaster

from collections import Counter
x0,y0 = zip(*Counter(corpus0).most_common())
x1,y1 = zip(*Counter(corpus1).most_common())

# Plot bar plot of top punctuations for each class
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
plt.bar(x0,y0, color=color[0])
plt.title('Top Punctuations for Non-Disaster Tweets')
plt.subplot(1,2,2)
plt.bar(x1,y1, color=color[1])
plt.title('Top Punctuations for Disaster Tweets')

### Plot most common words

In [None]:
## Plot most common words
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

stop = ENGLISH_STOP_WORDS.union(stop) # combine stop words from different sources

# function for removing url from text
def remove_url(txt):
    """Strip URLs and all non-alphanumeric characters from ``txt``.

    Despite its name this removes more than URLs: every character that is not
    an ASCII letter, digit, space or tab is deleted too (punctuation, '#',
    '@', '!' and '?', non-ASCII symbols). Whitespace runs are then collapsed
    to single spaces and leading/trailing whitespace is dropped.

    Args:
        txt: Input string.

    Returns:
        The cleaned string.
    """
    # Raw string: "\w", "\/" and "\S" are invalid escape sequences in a normal
    # literal and trigger a warning on recent Python versions.
    stripped = re.sub(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt)
    return " ".join(stripped.split())

# Get all the word tokens in dataframe for Disaster and Non-Disaster
# - remove url, tokenize tweet into words, lowercase words, drop stopwords
# The lists are built directly rather than through comprehensions that are
# used only for their .append() side effect.
corpus0 = [  # Non-Disaster
    word.lower()
    for tweet in df[df.target == 0].text
    for word in word_tokenize(remove_url(tweet))
    if word.lower() not in stop
]

corpus1 = [  # Disaster
    word.lower()
    for tweet in df[df.target == 1].text
    for word in word_tokenize(remove_url(tweet))
    if word.lower() not in stop
]

# Create df for word counts to use sns plots
df0 = pd.DataFrame(Counter(corpus0).most_common(), columns=['Word','Count'])
df1 = pd.DataFrame(Counter(corpus1).most_common(), columns=['Word','Count'])

# Plot for Disaster and Non-Disaster.
# Colours follow the convention of the other plots in this notebook:
# color[0] for Non-Disaster, color[1] for Disaster (they were swapped here).
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
sns.barplot(x='Word',y='Count',data=df0.head(10), color=color[0]).set_title('Most Common Words for Non-Disasters')
plt.xticks(rotation=45)
plt.subplot(1,2,2)
sns.barplot(x='Word',y='Count',data=df1.head(10), color=color[1]).set_title('Most Common Words for Disasters')
plt.xticks(rotation=45)

### Wordcloud for hashtags

In [None]:
def clean(word):
    """Return ``word`` with every ``string.punctuation`` character removed."""
    # A single translate pass deletes all punctuation characters at once.
    return word.translate(str.maketrans('', '', punctuation))

from wordcloud import WordCloud

def wc_hash(target):
    """Draw a word cloud of the hashtags in tweets whose label equals ``target``.

    Whitespace-separated tokens starting with '#' are taken as hashtags; the
    leading '#' is dropped, the rest is lowercased and passed through
    ``clean``. The cloud is drawn in subplot ``target + 1`` of a 1x2 grid on
    the current figure.
    """
    tags = []
    for tweet in df[df.target == target].text:
        for token in tweet.split():
            if token.startswith('#'):
                tags.append(clean(token[1:].lower()))
    cloud = WordCloud(background_color='white', stopwords=stop).generate(' '.join(tags))

    plt.subplot(1, 2, target + 1)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")

plt.figure(figsize=(15,4))
wc_hash(0)
plt.title('Non-Disaster')
wc_hash(1)
plt.title('Disaster')

## Meta-feature engineering

In [None]:
from textblob import TextBlob

### Polarity and subjectivity

In [None]:
# Analyse each tweet once and read both sentiment fields from the same
# result, instead of building a TextBlob twice per tweet.
sentiments = [TextBlob(tweet).sentiment for tweet in df.text]
df['polarity'] = [sentiment.polarity for sentiment in sentiments]
df['subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

### Exclamation and question marks

In [None]:
df['exclaimation_num'] = [tweet.count('!') for tweet in df.text]
df['questionmark_num'] = [tweet.count('?') for tweet in df.text]

### Counting number of hashtags and mentions

In [None]:
def count_url_hashtag_mention(text):
    """Count URLs, hashtags and mentions in ``text``.

    A hashtag (mention) is a whitespace-separated token that starts with '#'
    ('@') and contains that character exactly once. URLs are http/https
    matches of the regular expression below.

    Args:
        text: Raw tweet text.

    Returns:
        Tuple ``(urls_num, hash_num, mention_num)``.
    """
    # Raw string: "\(" and "\)" are invalid escape sequences in a normal
    # literal and trigger a warning on recent Python versions.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    urls_num = len(re.findall(url_pattern, text))
    word_tokens = text.split()
    # the marker must appear only once, at the front of the token
    hash_num = sum(1 for word in word_tokens if word.startswith('#') and word.count('#') == 1)
    mention_num = sum(1 for word in word_tokens if word.startswith('@') and word.count('@') == 1)
    return urls_num, hash_num, mention_num

In [None]:
url_num, hash_num, mention_num = zip(*[count_url_hashtag_mention(tweet) for tweet in df.text])
df = df.assign(url_num = url_num, hash_num = hash_num, mention_num = mention_num)

### Number of contractions (e.g I'm, we're, we've)

In [None]:
contractions = ["'t", "'re", "'s", "'d", "'ll", "'ve", "'m"]
df['contraction_num'] = [sum([tweet.count(cont) for cont in contractions]) for tweet in df.text]

## Text data cleaning

### Remove or replace data

In [None]:
## Replace NaNs with 'None'
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# `df.keyword` is chained assignment on an intermediate Series: pandas 2.x
# warns about it, and with Copy-on-Write enabled `df` is left unchanged.
df['keyword'] = df['keyword'].fillna('None')

In [None]:
## Expand Contractions
def decontraction(phrase):
    """Expand common English contractions in ``phrase``.

    Rules are applied in order. Matching is case-insensitive so that
    capitalised forms such as "Won't" or "Can't" hit the specific rules
    instead of falling through to the generic "n't" rule (which would produce
    "Wo not" / "Ca not"). Replacements are always lowercase. Every "'s" is
    expanded to " is", including possessives.

    Args:
        phrase: Input string.

    Returns:
        The string with contractions expanded.
    """
    rules = (
        # specific: irregular stems must be handled before the generic "n't"
        (r"won't", "will not"),
        (r"can't", "can not"),
        # general
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase, flags=re.IGNORECASE)
    return phrase

df.text = [decontraction(tweet) for tweet in df.text]

In [None]:
# Remove Emojis
def remove_emoji(text):
    """Delete every character of ``text`` that falls in the ranges listed below.

    Note that the last range (U+24C2 to U+1F251) is very wide: besides emoji
    it covers most code points above U+24C2, e.g. CJK characters.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub('', text)

df.text = df.text.apply(lambda x: remove_emoji(x))

In [None]:
# Remove URLs
df.text = df.text.apply(lambda x: remove_url(x))

In [None]:
# Remove Punctuations except '!?'

def remove_punct(text):
    """Delete every ``string.punctuation`` character from ``text`` except '!' and '?'.

    Args:
        text: Input string.

    Returns:
        ``text`` without the removed characters.
    """
    # Build the deletion set with plain string operations; the previous regex
    # literal ('\ |\!|\?') relied on invalid escape sequences.
    keep = '!?'
    new_punct = ''.join(ch for ch in punctuation if ch not in keep)
    table = str.maketrans('', '', new_punct)
    return text.translate(table)

df.text = df.text.apply(lambda x: remove_punct(x))

In [None]:
# Replace amp
def replace_amp(text):
    """Replace each space-delimited " amp " in ``text`` with " and ".

    Only occurrences with a space on both sides are replaced, so "amp" at the
    very start or end of the string is left as is.
    """
    return text.replace(" amp ", " and ")

df.text = df.text.apply(lambda x: replace_amp(x))

### Word segmentation

In [None]:
!pip install wordsegment

In [None]:
from wordsegment import load, segment
load()

df.text = df.text.apply(lambda x: ' '.join(segment(x)))

### Lemmatization

In [None]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemma(text):
    """Tokenize ``text``, lowercase each token, lemmatize it as a verb and rejoin with spaces."""
    return ' '.join(
        lemmatizer.lemmatize(token.lower(), pos='v')
        for token in word_tokenize(text)
    )

df.text = df.text.apply(lambda x: lemma(x))

### Ngrams

In [None]:
# Ngrams
from nltk.util import ngrams

def generate_ngrams(text, n):
    """Return the ``n``-grams of ``text`` as space-joined strings.

    ``text`` is tokenized with ``word_tokenize``. An n-gram is dropped when
    every one of its tokens is in the module-level ``stop`` collection.
    """
    tokens = word_tokenize(text)
    kept = []
    for gram in get_data(ngrams(tokens, n)):
        if all(token in stop for token in gram):
            continue  # exclude if all are stopwords
        kept.append(' '.join(gram))
    return kept

In [None]:
def get_data(gen):
    """Yield the items of ``gen``, ending quietly on RuntimeError or StopIteration.

    If iterating ``gen`` raises ``RuntimeError`` or ``StopIteration``, the
    items produced so far stand and iteration simply stops.
    """
    try:
        yield from gen
    except (RuntimeError, StopIteration):
        return

In [None]:
# Bigrams
bigrams_disaster = df[df.target==1].text.apply(lambda x: generate_ngrams(x, 2))
bigrams_ndisaster = df[df.target==0].text.apply(lambda x: generate_ngrams(x, 2))

# Counter replaces the hand-written "if key in dict" tally; dict() converts
# the result back to a plain dict in first-seen order.
bigrams_d_dict = dict(Counter(bg for bgs in bigrams_disaster for bg in bgs))

bigrams_d_df = pd.DataFrame(bigrams_d_dict.items(), columns=['Bigrams','Count'])

bigrams_nd_dict = dict(Counter(bg for bgs in bigrams_ndisaster for bg in bgs))

bigrams_nd_df = pd.DataFrame(bigrams_nd_dict.items(), columns=['Bigrams','Count'])

In [None]:
# Barplots for bigrams

plt.figure(figsize=(15,10))
plt.subplot(1,2,1)
sns.barplot(x='Count',y='Bigrams',data=bigrams_nd_df.sort_values('Count', ascending=False).head(40), color=color[0]).set_title('Most Common Bigrams for Non-Disasters')
ax = plt.gca()
ax.set_ylabel('')
plt.subplot(1,2,2)
sns.barplot(x='Count',y='Bigrams',data=bigrams_d_df.sort_values('Count', ascending=False).head(40), color=color[1]).set_title('Most Common Bigrams for Disasters')
ax = plt.gca()
ax.set_ylabel('')
plt.tight_layout()
plt.show()


In [None]:
# Wordcloud for bigrams

plt.figure(figsize=(15,10))
plt.subplot(1,2,1)
my_cloud = WordCloud(background_color='white', stopwords=stop).generate_from_frequencies(bigrams_nd_dict)
plt.imshow(my_cloud, interpolation='bilinear')
plt.axis('off')

plt.subplot(1,2,2)
my_cloud = WordCloud(background_color='white', stopwords=stop).generate_from_frequencies(bigrams_d_dict)
plt.imshow(my_cloud, interpolation='bilinear')
plt.axis('off')

plt.show()

In [None]:
# Trigrams

trigrams_disaster = df[df.target==1].text.apply(lambda x: generate_ngrams(x, 3))
trigrams_ndisaster = df[df.target==0].text.apply(lambda x: generate_ngrams(x, 3))

# Counter replaces the hand-written "if key in dict" tally; dict() converts
# the result back to a plain dict in first-seen order.
trigrams_d_dict = dict(Counter(tg for tgs in trigrams_disaster for tg in tgs))

trigrams_d_df = pd.DataFrame(trigrams_d_dict.items(), columns=['Trigrams','Count'])

trigrams_nd_dict = dict(Counter(tg for tgs in trigrams_ndisaster for tg in tgs))

trigrams_nd_df = pd.DataFrame(trigrams_nd_dict.items(), columns=['Trigrams','Count'])

In [None]:
# Barplots for trigrams

plt.figure(figsize=(15,10))
plt.subplot(1,2,1)
sns.barplot(x='Count',y='Trigrams',data=trigrams_nd_df.sort_values('Count', ascending=False).head(40), color=color[0]).set_title('Most Common Trigrams for Non-Disasters')
ax = plt.gca()
ax.set_ylabel('')
plt.subplot(1,2,2)
sns.barplot(x='Count',y='Trigrams',data=trigrams_d_df.sort_values('Count', ascending=False).head(40), color=color[1]).set_title('Most Common Trigrams for Disasters')
ax = plt.gca()
ax.set_ylabel('')
plt.tight_layout()
plt.show()

### Remove stopwords

In [None]:
## Remove Stopwords
def remove_stopwords(text):
    """Tokenize ``text``, lowercase the tokens and drop those found in ``stop``.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = (token.lower() for token in word_tokenize(text))
    return ' '.join(token for token in lowered if token not in stop)

#tweets_tmp = tweets.copy()
df['text_nostopwords'] = df.text.apply(lambda x: remove_stopwords(x))

### Plot word cloud for most common words after cleaning

In [None]:
from PIL import Image
mask = np.array(Image.open('twitter.png'))
reverse = mask[...,::-1,:]

In [None]:
def wc_words(target, mask=mask):
    """Draw a masked word cloud of word frequencies for one class.

    Words come from the ``text_nostopwords`` column of the rows whose label
    equals ``target``; 'like', 'new' and 'people' are excluded. The cloud is
    drawn in subplot ``target + 1`` of a 1x2 grid on the current figure.

    Args:
        target: Class label to select (also selects the subplot position).
        mask: Image array handed to ``WordCloud`` as its ``mask``.
    """
    excluded = ('like', 'new', 'people')
    frequencies = Counter(
        word.lower()
        for tweet in df[df.target == target].text_nostopwords
        for word in tweet.split()
    )
    for word in excluded:
        frequencies.pop(word, None)
    # plot using frequencies (named so the builtin `dict` is not shadowed)
    my_cloud = WordCloud(background_color='white', stopwords=stop, mask=mask, random_state=0).generate_from_frequencies(dict(frequencies))

    plt.subplot(1,2,target+1)
    plt.imshow(my_cloud, interpolation='bilinear')
    plt.axis("off")

plt.figure(figsize=(15,10))
wc_words(0)
plt.title('Non-Disaster')
wc_words(1, reverse)
plt.title('Disaster')
plt.show()

In [None]:
pd.options.display.max_colwidth = 200
for t in df['text'].sample(n=20, random_state=0):
    print(t)
pd.reset_option('max_colwidth')

In [None]:
pd.reset_option('max_colwidth')
df.drop('text_nostopwords', axis=1, inplace=True)
df.head()

## Tokenization

### Finalizing dataframe setup

In [None]:
!pip install datasets

In [None]:
df = df.drop(['location'], axis=1)

In [None]:
df.head()

In [None]:
# Build the model input from the cleaned `text` column. The
# `text_nostopwords` column was dropped in an earlier cell, so referencing it
# here raises AttributeError.
df['input'] = 'TEXT: ' + df.text + '; KEYWORD: ' + df.keyword

In [None]:
from datasets import Dataset,DatasetDict

ds = Dataset.from_pandas(df)

In [None]:
model_nm = 'bert-base-uncased'

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
tokz = AutoTokenizer.from_pretrained(model_nm)

In [None]:
def tok_func(x):
    """Run the module-level tokenizer ``tokz`` on the "input" field of ``x``."""
    return tokz(x["input"])

In [None]:
tok_ds = ds.map(tok_func, batched=True)

In [None]:
row = tok_ds[0]
row['input'], row['input_ids']

In [None]:
tokz.tokenize(ds["input"][0])

In [None]:
tokz.vocab['earthquake']

In [None]:
tok_ds = tok_ds.rename_columns({'target':'labels'})

In [None]:
tok_ds

## Test and validation sets

In [None]:
eval_df = pd.read_csv('./test.csv')
eval_df.describe()

### Validation set

In [None]:
dds = tok_ds.train_test_split(0.25, seed=42)
dds

### Test set

In [None]:
# Build the test inputs from the test frame itself. Using the training frame
# `df` here would pair every test row with the text/keyword of whichever
# training row shares its index. Missing keywords are filled with 'None' to
# mirror the training data.
# NOTE(review): the training text went through the cleaning steps earlier in
# this notebook; consider applying the same steps to `eval_df.text`.
eval_df['input'] = 'TEXT: ' + eval_df.text + '; KEYWORD: ' + eval_df.keyword.fillna('None')
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

### Evaluation metric (F1 score)

In [None]:
import numpy as np
from sklearn.metrics import f1_score

In [None]:
def f1_val(eval_pred):
    """Compute the F1 score for the Trainer's ``compute_metrics`` hook.

    Args:
        eval_pred: Pair ``(logits, labels)``; the logits hold one score per
            class for each example (the model is created with ``num_labels=2``).

    Returns:
        Dict with the single key ``'f1_val'``.
    """
    logits, labels = eval_pred
    # Turn per-class scores into predicted class ids. f1_score expects
    # (y_true, y_pred), so the labels go first and the predictions second.
    preds = np.argmax(logits, axis=-1)
    return {'f1_val': f1_score(labels, preds)}

## Training

### Training our model

In [None]:
!pip install transformers[torch]

In [None]:
from transformers import TrainingArguments,Trainer

In [None]:
bs = 128
epochs = 4

In [None]:
lr = 8e-5

In [None]:
df.dtypes

In [None]:
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=False,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=2)
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokz, compute_metrics=f1_val)

In [None]:
trainer.train();

In [None]:
df.dtypes