# Natural Language Processing with Disaster Tweets

### Team: Robyn Tomson and Otto-Cristofer Vanasaun

#### Downloading and importing essential dependencies and Python packages.

In [1]:
!pip install imblearn
!pip install transformers datasets torch scikit-learn
!pip install tf-keras
!pip install tensorflow
!pip install transformers
!pip install --upgrade tensorflow transformers
import os
import re
import string
from time import time
from collections import Counter

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
)

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers.schedules import ExponentialDecay

from transformers import (
    TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification,
    BertTokenizer, TFGPT2LMHeadModel, GPT2Tokenizer, TFAutoModelForSequenceClassification
)




#### Disabling warnings

In [2]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#### Reading data from csv files

In [3]:
dataTrain = pd.read_csv("data/train.csv")
dataTest = pd.read_csv("data/test.csv")
dataSampleSubmit = pd.read_csv("data/sample_submission.csv")

#### Data description and understanding

In [4]:
disasters = []
locations = [] 
total_words = []
for row in dataTrain.itertuples(index=True):
    if pd.notna(row.keyword): 
        if row.keyword not in disasters:
            disasters.append(row.keyword)
    if pd.notna(row.location): 
        if row.location not in locations:
            locations.append(row.location)
            
    words = row.text.split() 
    for word in words:
        total_words.append(word)
    
    
unique_word_count = set(total_words)

# Get the words occurring the most in the data
words_df = {'word':total_words}
words_df = pd.DataFrame(data=words_df)
populars = words_df.value_counts()


print(dataTrain.info())
print("________________________________________")
print("Total words:", len(total_words))
print("Unique words:", len(unique_word_count))
print("________________________________________")
print("Number of unique disasters:", len(disasters))
print("________________________________________")
print("Number of unique locations:", len(locations))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None
________________________________________
Total words: 113461
Unique words: 31924
________________________________________
Number of unique disasters: 221
________________________________________
Number of unique locations: 3341


#### Data preparation

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_len = 1400



nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocess_text(text):
     # Remove links from tweet texts
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    
    # Remove special characters, mentions and hashtages, as well as numbers from tweet texts
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords (words that don't add anything meaningful to the text)
    stopword = set(stopwords.words('english'))
    # Manually added stopwords
    extrastopwords = ["im", "like", "get", "dont", "wont", "via", "still", "would", "got", "rt", "cant", "theyre", "bb", "fyi", "hmu", "th", "st", "rd"]
    extrastopwords = set(extrastopwords)
    text = " ".join([word for word in text.split() if word not in stopword])
    text = " ".join([word for word in text.split() if word not in extrastopwords])


    # Create the lemmatizer and apply the tags
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    
    text_lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    text_lemmas = " ".join([word for word in text_lemmas])
    return text_lemmas

    # Perform oversampling with SMOTE
def oversampling(data, labels):
    X_SMOTE, y_SMOTE = SMOTE(k_neighbors=5).fit_resample(data, labels)
    return X_SMOTE, y_SMOTE
    
    # Call the text preprocess function and encode the labels
def preprocess_data(data):
    data['text'] = data['text'].apply(preprocess_text)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['target'])

    return data, label_encoder

    # Tokenize the data with BertTokenizer
def tokenize_and_prepare(data, max_length=512):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    data['tokens'] = data['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_length, truncation=True))
    token_sequences = pad_sequences(data['tokens'], maxlen=max_length, padding='post')
    return token_sequences


    # Call the preprocess_data function and split the training and validation data
def eeltöötlus(trainset):
    trainset, label_encoder = preprocess_data(trainset)
    train_data, val_data = train_test_split(trainset, test_size=0.2, random_state=42)
    train_labels = train_data['label']
    val_labels = val_data['label']
    
    # We will not be using these columns
    train_data = train_data.drop(columns="location")
    train_data = train_data.drop(columns="keyword")
    val_data = val_data.drop(columns="location")
    val_data = val_data.drop(columns="keyword")

    train_sequences = tokenize_and_prepare(train_data)
    val_sequences = tokenize_and_prepare(val_data)
    train_sequences = pad_sequences(
    train_data['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_len, truncation=True)),
    maxlen=max_len, padding='post'
    )
    train_sequences, train_labels = oversampling(train_sequences, train_labels)
    return train_sequences, val_sequences, train_labels, val_labels, label_encoder

# Driver to start data preparation
train_sequences, val_sequences, train_labels, val_labels, label_encoder = eeltöötlus(dataTrain)






[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vanasauo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vanasauo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vanasauo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\vanasauo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


#### Model building

In [6]:
vocab_size = 30522
embedding_dim = 256

# Dynamic learning rate to prevent overfitting
initial_learning_rate = 0.0001
lr_schedule = ExponentialDecay(
    initial_learning_rate,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True)

# Structure of the nlp model
modelLSTM = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(32, dropout=0.2)),
    BatchNormalization(),
    Dense(16, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    Dropout(0.1),
    Dense(16, activation='relu'),
    Dense(2, activation='softmax')
])

modelLSTM.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])

#### Model training

In [8]:
# Measure time spent on training
algus = time()

# If the performance stagnates, stop the training early
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
try:
    history = modelLSTM.fit(
        train_sequences,
        train_labels,
        epochs=4,
        batch_size=32,
        validation_data=(val_sequences, val_labels),
        callbacks=[early_stopping],
        verbose=1
    )
except KeyError as e:
    print("KeyError encountered:", e)

aeg = time()-algus
print("Aega treenimiseks läks {} minutit ja {} sekundit.".format(round(aeg/60), round(aeg%60,2)))

Epoch 1/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 1s/step - accuracy: 0.5138 - loss: 0.7395 - val_accuracy: 0.5995 - val_loss: 0.7107
Epoch 2/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 1s/step - accuracy: 0.6006 - loss: 0.6865 - val_accuracy: 0.7269 - val_loss: 0.6577
Epoch 3/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 1s/step - accuracy: 0.7445 - loss: 0.5616 - val_accuracy: 0.7446 - val_loss: 0.5500
Epoch 4/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m297s[0m 1s/step - accuracy: 0.8330 - loss: 0.4170 - val_accuracy: 0.7702 - val_loss: 0.5161
Aega treenimiseks läks 19 minutit ja 9.37 sekundit.


In [11]:
# We also predict on training data to get an estimate of how well-fitted the model got to data it already knew
X_trainLSTM = train_sequences
y_trainLSTM = modelLSTM.predict(X_trainLSTM)

# Transform the predictions into 1s and 0s
y_trainLSTM = [np.argmax(vastus) for vastus in y_trainLSTM]
        
# Create confusion matrix for training data
conf_mat = confusion_matrix(train_labels, y_trainLSTM)
print("Training data confusion matrix:")
print(conf_mat)
"""
Confusion matrix structure:
TN FP
FN TP
"""
accuracy = (conf_mat[0][0] + conf_mat[1][1])/(conf_mat[0][0] + conf_mat[0][1] + conf_mat[1][0] + conf_mat[1][1]) #TP + TN / TP + FN + FP + TN
precision = conf_mat[1][1] / (conf_mat[1][1] + conf_mat[0][1]) # TP / TP + FP
recall = conf_mat[1][1] / (conf_mat[1][1] + conf_mat[1][0]) # TP / TP + FN
f1_measure = 2 / ((1/precision) + (1/recall))
print("Accuracy:", accuracy) 
print("Precision: ", precision)
print("Recall: ", recall)
print("F1-measure:", f1_measure)

[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 127ms/step
Training data confusion matrix:
[[3266  202]
 [ 456 3012]]
Accuracy: 0.9051326412918108
Precision:  0.9371499688861232
Recall:  0.8685121107266436
F1-measure: 0.9015264890751271


In [12]:
X_valLSTM = val_sequences
# Predict the validation data
y_valLSTM = modelLSTM.predict(X_valLSTM)
y_val_lower = []
y_1s = []
y_0s = []
# Transform the predictions into 0s and 1s with modifiable threshold
threshold = 0.45
for i in range(len(y_valLSTM)):
    if y_valLSTM[i][1] > threshold:
        y_1s.append(y_valLSTM[i][1])
        y_val_lower.append(1)
    else:
         y_val_lower.append(0)
         y_0s.append(y_valLSTM[i][1])

conf_mat = confusion_matrix(val_labels, y_val_lower)
print("Validation data confusion matrix:")
print(conf_mat)
accuracy = (conf_mat[0][0] + conf_mat[1][1])/(conf_mat[0][0] + conf_mat[0][1] + conf_mat[1][0] + conf_mat[1][1]) #TP + TN / TP + FN + FP + TN
precision = conf_mat[1][1] / (conf_mat[1][1] + conf_mat[0][1]) # TP / TP + FP
recall = conf_mat[1][1] / (conf_mat[1][1] + conf_mat[1][0]) # TP / TP + FN
f1_measure = 2 / ((1/precision) + (1/recall))
print("Accuracy:", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1-measure:", f1_measure)

# Here we can see the split of the predctions for different ranges (to see if the model is close to converging or not)
g00 = 0
g01 = 0
g02 = 0
g03 = 0
g04 = 0
g05 = 0
g06 = 0
g07 = 0
g08 = 0
g09 = 0
for i in y_0s:
    if i > threshold - 0.1:
        g04+=1
    elif i > threshold - 0.2:
        g03+=1
    elif i > threshold - 0.3:
        g02+=1
    elif i > threshold - 0.4:
        g01+=1
    else:
        g00+=1
for i in y_1s:
    if i < threshold + 0.1:
        g05+=1
    elif i < threshold + 0.2:
        g06+=1
    elif i < threshold + 0.3:
        g07+=1
    elif i < threshold + 0.4:
        g08+=1
    else:
        g09+=1

print()
print("Negatives: ")
print("Less than", round(threshold - 0.4, 2))
print(g00)
print(g01, round(threshold - 0.4, 2))
print(g02, round(threshold - 0.3, 2))
print(g03, round(threshold - 0.2, 2))
print(g04, round(threshold - 0.1, 2))
print("Positives: ")
print(g05, round(threshold + 0.1, 2))
print(g06, round(threshold + 0.2, 2))
print(g07, round(threshold + 0.3, 2))
print(g08, round(threshold + 0.4, 2))

print("More than", round(threshold + 0.4, 2))
print(g09)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step
Validation data confusion matrix:
[[689 185]
 [168 481]]
Accuracy: 0.768220617202889
Precision:  0.7222222222222222
Recall:  0.7411402157164869
F1-measure: 0.7315589353612167

Negatives: 
Less than 0.05
65
311 0.05
200 0.15
168 0.25
113 0.35
Positives: 
105 0.55
73 0.65
80 0.75
87 0.85
More than 0.85
321


#### Model testing

In [13]:
# Process the test data
dataTest['text'] = dataTest['text'].apply(preprocess_text)
dataTest = dataTest.drop(columns="location")
dataTest = dataTest.drop(columns="keyword")
test_sequences = dataTest
test_sequences = tokenize_and_prepare(test_sequences)
test_sequences = pad_sequences(
    dataTest['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_len, truncation=True)),
    maxlen=max_len, padding='post'
)

X_testLSTM = test_sequences
# Predict the labels of test data
y_predsLSTM = modelLSTM.predict(X_testLSTM)

# Transform the predictions into 0s and 1s with modifiable threshold
y_test_higher = []
for i in range(len(y_predsLSTM)):
    if y_predsLSTM[i][1] > 0.45:
         y_test_higher.append(1)
    else:
         y_test_higher.append(0)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 122ms/step


In [14]:
# Display the 0-1 split on training and validation data

print(train_labels.value_counts())
print(val_labels.value_counts())

# See ho many negatives and positives were predicted on training, validation and test data
print("train")
print(y_trainLSTM.count(0))
print(y_trainLSTM.count(1))
print()
print("val")
print(y_val_lower.count(0))
print(y_val_lower.count(1))
print()
print("test")
print(y_test_higher.count(0))
print(y_test_higher.count(1))

label
1    3468
0    3468
Name: count, dtype: int64
label
0    874
1    649
Name: count, dtype: int64
train
3722
3214

val
857
666

test
1864
1399


#### Submission, review, reporting and ...

In [15]:
# Convert results to a csv file to submit to Kaggle
test_results = pd.DataFrame({'id':dataTest['id'], 'target':y_test_higher})
test_results.to_csv("submissions/submission27.csv", index=False)

#TODO: Ühendada veebilehega et me saaks postrisessil demo näidata