# Natural Language Processing with Disaster Tweets

### Team: Robyn Tomson and Otto-Cristofer Vanasaun

#### Downloading and importing essential dependencies and Python packages.

In [17]:
!pip install transformers datasets torch scikit-learn
!pip install tf-keras
!pip install tensorflow
!pip install transformers
!pip install --upgrade tensorflow transformers
import os
import re
import string
from time import time
from collections import Counter

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import accuracy_score, confusion_matrix

import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import F1Score

from transformers import (
    TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification,
    BertTokenizer, TFGPT2LMHeadModel, GPT2Tokenizer, TFAutoModelForSequenceClassification
)



#### Disabling warnings

In [18]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#### Reading data from csv files

In [19]:
dataTrain = pd.read_csv("data/train.csv")
dataTest = pd.read_csv("data/test.csv")
dataSampleSubmit = pd.read_csv("data/sample_submission.csv")

#### Data description and understanding

In [20]:
disasters = []
locations = [] 
total_words = Counter()
for row in dataTrain.itertuples(index=True):
    if pd.notna(row.keyword): 
        if row.keyword not in disasters:
            disasters.append(row.keyword)
    if pd.notna(row.location): 
        if row.location not in locations:
            locations.append(row.location)
            
    words = row.text.split() 
    total_words.update(words)
    
unique_word_count = len(total_words)


print(dataTrain.info())
print("________________________________________")
print("Total words:", sum(total_words.values()))
print("Unique words:", unique_word_count)
print("________________________________________")
print("Number of unique disasters:", len(disasters))
print("________________________________________")
print("Number of unique locations:", len(locations))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None
________________________________________
Total words: 113461
Unique words: 31924
________________________________________
Number of unique disasters: 221
________________________________________
Number of unique locations: 3341


#### Data preparation

In [21]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
max_len = 1400

nltk.download('stopwords')

def preprocess_text(text):
     # Remove links, mentions, hashtags from tweet texts
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    
    # Remove special characters and numbers
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords (words that don't add anything meaningful to the text)
    stopword = set(stopwords.words('english'))
    text = " ".join([word for word in text.split() if word not in stopword])
    return text

def preprocess_data(data):
    data['text'].apply(preprocess_text)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['target'])
    data['text'] = data['text']
    return data, label_encoder

def tokenize_and_prepare(data, max_length=512):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    data['tokens'] = data['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_length, truncation=True))
    token_sequences = pad_sequences(data['tokens'], maxlen=max_length, padding='post')
    return token_sequences


def eeltöötlus(trainset):
    trainset, label_encoder = preprocess_data(trainset)
    train_data, val_data = train_test_split(trainset, test_size=0.2, random_state=42)
    train_sequences = tokenize_and_prepare(train_data)
    val_sequences = tokenize_and_prepare(val_data)

    return train_sequences, val_sequences, label_encoder

train_sequences, val_sequences, label_encoder = eeltöötlus(dataTrain)
train_sequences = pad_sequences(
    dataTrain['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_len, truncation=True)),
    maxlen=max_len, padding='post'
)


train_sequences, val_sequences, train_labels, val_labels = train_test_split(
    train_sequences, dataTrain['label'], test_size=0.2, random_state=42
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vanasauo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Model building

In [25]:
vocab_size = 30522
embedding_dim = 256

modelLSTM = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(48, dropout=0.2, recurrent_dropout=0.2)),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(2, activation='softmax')
])

modelLSTM.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

algus = time()

try:
    history = modelLSTM.fit(
        train_sequences,
        train_labels,
        epochs=6,
        batch_size=32,
        validation_data=(val_sequences, val_labels),
        verbose=1
    )
except KeyError as e:
    print("KeyError encountered:", e)

aeg = time()-algus
print("Aega treenimiseks läks {} minutit ja {} sekundit.".format(round(aeg/60), round(aeg%60,2)))

Epoch 1/6
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m446s[0m 2s/step - accuracy: 0.6553 - loss: 0.6137 - val_accuracy: 0.7932 - val_loss: 0.4619
Epoch 2/6
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m479s[0m 3s/step - accuracy: 0.8588 - loss: 0.3519 - val_accuracy: 0.7859 - val_loss: 0.4825
Epoch 3/6
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m473s[0m 2s/step - accuracy: 0.9172 - loss: 0.2327 - val_accuracy: 0.7754 - val_loss: 0.5686
Epoch 4/6
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m462s[0m 2s/step - accuracy: 0.9487 - loss: 0.1481 - val_accuracy: 0.7656 - val_loss: 0.6817
Epoch 5/6
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m461s[0m 2s/step - accuracy: 0.9696 - loss: 0.0881 - val_accuracy: 0.7511 - val_loss: 0.7829
Epoch 6/6
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m462s[0m 2s/step - accuracy: 0.9748 - loss: 0.0746 - val_accuracy: 0.7498 - val_loss: 0.9490
Aega treenimiseks läks 46 mi

In [None]:
"""
X_testLSTM = test_sequences

y_predsLSTM = modelLSTM.predict(X_testLSTM)

y_predsLSTM = [np.argmax(vastus) for vastus in y_predsLSTM]

y_test = list(testset['label'])
#y_test = [labels.index(cat) for cat in y_test]

print("Täpsus:", accuracy_score(y_test, y_predsLSTM))


print("Sildid:", labels)
print("Segadusmaatriks:")
print(confusion_matrix(y_test, y_predsLSTM))
"""

#### Model training

In [None]:
#TODO

#### Model testing

In [None]:
test_sequences = tokenize_and_prepare(dataTest)
test_sequences = pad_sequences(
    dataTest['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_len, truncation=True)),
    maxlen=max_len, padding='post'
)

X_testLSTM = test_sequences

y_predsLSTM = modelLSTM.predict(X_testLSTM)

y_predsLSTM = [np.argmax(vastus) for vastus in y_predsLSTM]

[1m 27/102[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m27s[0m 364ms/step

#### Submition, review, reporting and ...

In [None]:
test_results = pd.DataFrame({'id':dataTest['id'], 'target':y_predsLSTM})
test_results.to_csv("submission4.csv", index=False)

#TODO: Saada f-measure 0.8 peale
#TODO: Ühendada veebilehega et me saaks postrisessil demo näidata
#TODO: See for-loop siia et ta pakuks mis tüüpi disaster on