# Natural Language Processing with Disaster Tweets

### Team: Robyn Tomson and Otto-Cristofer Vanasaun

#### Downloading and importing essential dependencies and Python packages.

In [1]:
!pip install imblearn
!pip install transformers datasets torch scikit-learn
!pip install tf-keras
!pip install tensorflow
!pip install transformers
!pip install --upgrade tensorflow transformers
import os
import re
import string
from time import time
from collections import Counter

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import accuracy_score, confusion_matrix

from imblearn.over_sampling import SMOTE

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers.schedules import ExponentialDecay

from transformers import (
    TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification,
    BertTokenizer, TFGPT2LMHeadModel, GPT2Tokenizer, TFAutoModelForSequenceClassification
)



2024-12-08 14:51:23.570333: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


#### Disabling warnings

In [2]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#### Reading data from csv files

In [3]:
dataTrain = pd.read_csv("data/train.csv")
dataTest = pd.read_csv("data/test.csv")
dataSampleSubmit = pd.read_csv("data/sample_submission.csv")

#### Data description and understanding

In [4]:
disasters = []
locations = [] 
total_words = []
for row in dataTrain.itertuples(index=True):
    if pd.notna(row.keyword): 
        if row.keyword not in disasters:
            disasters.append(row.keyword)
    if pd.notna(row.location): 
        if row.location not in locations:
            locations.append(row.location)
            
    words = row.text.split() 
    for word in words:
        total_words.append(word)
    
    
unique_word_count = set(total_words)

# Get the words occurring the most in the data
words_df = {'word':total_words}
words_df = pd.DataFrame(data=words_df)
populars = words_df.value_counts()


print(dataTrain.info())
print("________________________________________")
print("Total words:", len(total_words))
print("Unique words:", len(unique_word_count))
print("________________________________________")
print("Number of unique disasters:", len(disasters))
print("________________________________________")
print("Number of unique locations:", len(locations))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None
________________________________________
Total words: 113461
Unique words: 31924
________________________________________
Number of unique disasters: 221
________________________________________
Number of unique locations: 3341


In [5]:
import matplotlib as plt
dataTrain


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [6]:
import numpy as np
print(np.__version__)

1.26.4


#### Data preparation

In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
max_len = 1400



nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocess_text(text):
     # Remove links from tweet texts
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    
    # Remove special characters, mentions and hashtages, as well as numbers from tweet texts
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords (words that don't add anything meaningful to the text)
    stopword = set(stopwords.words('english'))
    # Manually added stopwords
    extrastopwords = ["im", "like", "get", "dont", "wont", "via", "still", "would", "got", "rt", "cant", "theyre", "bb", "fyi", "hmu", "th", "st", "rd"]
    extrastopwords = set(extrastopwords)
    text = " ".join([word for word in text.split() if word not in stopword])
    text = " ".join([word for word in text.split() if word not in extrastopwords])

    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    
    text_lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    text_lemmas = " ".join([word for word in text_lemmas])
    return text_lemmas

def oversampling(data, labels):
    X_SMOTE, y_SMOTE = SMOTE(k_neighbors=5).fit_resample(data, labels)
    return X_SMOTE, y_SMOTE
    
def preprocess_data(data):
    data['text'] = data['text'].apply(preprocess_text)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['target'])

    
    return data, label_encoder

def tokenize_and_prepare(data, max_length=512):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    data['tokens'] = data['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_length, truncation=True))
    token_sequences = pad_sequences(data['tokens'], maxlen=max_length, padding='post')
    return token_sequences


def eeltöötlus(trainset):
    trainset, label_encoder = preprocess_data(trainset)
    train_data, val_data = train_test_split(trainset, test_size=0.2, random_state=42)
    train_labels = train_data['label']
    val_labels = val_data['label']
   # print(train_data['text'])
    
    train_sequences = tokenize_and_prepare(train_data)
    val_sequences = tokenize_and_prepare(val_data)
    train_sequences = pad_sequences(
    train_data['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_len, truncation=True)),
    maxlen=max_len, padding='post'
    )
    train_sequences, train_labels = oversampling(train_sequences, train_labels)
    return train_sequences, val_sequences, train_labels, val_labels, label_encoder

train_sequences, val_sequences, train_labels, val_labels, label_encoder = eeltöötlus(dataTrain)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/robyniusmaximus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/robyniusmaximus/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/robyniusmaximus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/robyniusmaximus/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [8]:
print(train_sequences.shape)
print(train_labels.shape)
print(val_sequences.shape)
print(val_labels.shape)
train_labels.value_counts()

(6936, 1400)
(6936,)
(1523, 512)
(1523,)


label
1    3468
0    3468
Name: count, dtype: int64

#### Model building

In [9]:
vocab_size = 30522
embedding_dim = 256


initial_learning_rate = 0.00001
lr_schedule = ExponentialDecay(
    initial_learning_rate,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True)

modelLSTM = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(32, dropout=0.1)),
    #BatchNormalization(),
    Dense(24, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    #Dropout(0.1),
    Dense(12, activation='relu'),
    Dense(2, activation='softmax')
])

modelLSTM.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])

#### Model training

In [10]:
algus = time()

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
try:
    history = modelLSTM.fit(
        train_sequences,
        train_labels,
        epochs=4,
        batch_size=32,
        validation_data=(val_sequences, val_labels),
        callbacks=[early_stopping],
        verbose=1
    )
except KeyError as e:
    print("KeyError encountered:", e)

aeg = time()-algus
print("Aega treenimiseks läks {} minutit ja {} sekundit.".format(round(aeg/60), round(aeg%60,2)))

Epoch 1/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 895ms/step - accuracy: 0.5079 - loss: 1.0382 - val_accuracy: 0.5745 - val_loss: 1.0286
Epoch 2/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 875ms/step - accuracy: 0.5118 - loss: 1.0285 - val_accuracy: 0.5778 - val_loss: 1.0185
Epoch 3/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 916ms/step - accuracy: 0.5216 - loss: 1.0190 - val_accuracy: 0.6067 - val_loss: 1.0082
Epoch 4/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 901ms/step - accuracy: 0.5491 - loss: 1.0090 - val_accuracy: 0.6310 - val_loss: 0.9969
Aega treenimiseks läks 13 minutit ja 9.89 sekundit.


In [11]:
X_trainLSTM = train_sequences
y_trainLSTM = modelLSTM.predict(X_trainLSTM)
y_trainLSTM = [np.argmax(vastus) for vastus in y_trainLSTM]
        
conf_mat = confusion_matrix(train_labels, y_trainLSTM)
print("Training data confusion matrix:")
print(conf_mat)
"""
Confusion matrix structure:
TN FP
FN TP
"""
accuracy = (conf_mat[0][0] + conf_mat[1][1])/(conf_mat[0][0] + conf_mat[0][1] + conf_mat[1][0] + conf_mat[1][1])
precision = conf_mat[1][1] / (conf_mat[1][1] + conf_mat[0][1])
recall = conf_mat[1][1] / (conf_mat[1][1] + conf_mat[1][0])
f1_measure = 2 / ((1/precision) + (1/recall))
print("Accuracy:", accuracy) #TP + TN / TP + FN + FP + TN
print("Precision: ", precision) # TP / TP + FP
print("Recall: ", recall) # TP / TP + FN
print("F1-measure:", f1_measure)

[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 148ms/step
Training data confusion matrix:
[[3319  149]
 [2715  753]]
Accuracy: 0.5870818915801614
Precision:  0.8348115299334812
Recall:  0.2171280276816609
F1-measure: 0.34462242562929063


In [12]:
X_valLSTM = val_sequences

y_valLSTM = modelLSTM.predict(X_valLSTM)
y_val_lower = []
y_1s = []
threshold = 0.5
for i in range(len(y_valLSTM)):
    if y_valLSTM[i][1] > threshold:
        y_1s.append(y_valLSTM[i][1])
        y_val_lower.append(1)
    else:
         y_val_lower.append(0)

conf_mat = confusion_matrix(val_labels, y_val_lower)
print("Validation data confusion matrix:")
print(conf_mat)
accuracy = (conf_mat[0][0] + conf_mat[1][1])/(conf_mat[0][0] + conf_mat[0][1] + conf_mat[1][0] + conf_mat[1][1])
precision = conf_mat[1][1] / (conf_mat[1][1] + conf_mat[0][1])
recall = conf_mat[1][1] / (conf_mat[1][1] + conf_mat[1][0])
f1_measure = 2 / ((1/precision) + (1/recall))
print("Accuracy:", accuracy) #TP + TN / TP + FN + FP + TN
print("Precision: ", precision) # TP / TP + FP
print("Recall: ", recall) # TP / TP + FN
print("F1-measure:", f1_measure)
g05 = 0
g06 = 0
g07 = 0
g08 = 0
g09 = 0
for i in y_1s:
    if i < threshold + 0.1:
        g05+=1
    elif i < threshold + 0.2:
        g06+=1
    elif i < threshold + 0.3:
        g07+=1
    elif i < threshold + 0.4:
        g08+=1
    else:
        g09+=1

print(g05, threshold + 0.1)
print(g06, threshold + 0.2)
print(g07, threshold + 0.3)
print(g08, threshold + 0.4)
print("More than", threshold + 0.4)
print(g09)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 93ms/step
Validation data confusion matrix:
[[835  39]
 [523 126]]
Accuracy: 0.6309914642153645
Precision:  0.7636363636363637
Recall:  0.1941448382126348
F1-measure: 0.3095823095823096
165 0.6
0 0.7
0 0.8
0 0.9
More than 0.9
0


#### Model testing

In [13]:
dataTest['text'] = dataTest['text'].apply(preprocess_text)
test_sequences = dataTest
test_sequences = tokenize_and_prepare(test_sequences)
test_sequences = pad_sequences(
    dataTest['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_len, truncation=True)),
    maxlen=max_len, padding='post'
)

X_testLSTM = test_sequences

y_predsLSTM = modelLSTM.predict(X_testLSTM)


y_test_higher = []
for i in range(len(y_predsLSTM)):
    if y_predsLSTM[i][1] > 0.5:
         y_test_higher.append(1)
    else:
         y_test_higher.append(0)

#y_predsLSTM = [np.argmax(vastus) for vastus in y_predsLSTM]

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 182ms/step


In [14]:
print(train_labels.value_counts())
print(val_labels.value_counts())

print("train")
print(y_trainLSTM.count(0))
print(y_trainLSTM.count(1))
print()
print("val")
print(y_val_lower.count(0))
print(y_val_lower.count(1))
print()
print("test")
print(y_test_higher.count(0))
print(y_test_higher.count(1))

label
1    3468
0    3468
Name: count, dtype: int64
label
0    874
1    649
Name: count, dtype: int64
train
6034
902

val
1358
165

test
2913
350


#### Submission, review, reporting and ...

In [15]:
test_results = pd.DataFrame({'id':dataTest['id'], 'target':y_test_higher})
test_results.to_csv("submissions/submission19.csv", index=False)

#TODO: recall kõrgemaks. Ta ei converge ära(?)
#TODO: Saada f-measure 0.8 peale
#TODO: Ühendada veebilehega et me saaks postrisessil demo näidata
#TODO: See for-loop siia et ta pakuks mis tüüpi disaster on