# [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/competitions/nlp-getting-started/overview)

## Imports

In [None]:
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install -U git+https://github.com/huggingface/accelerate.git

In [1]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import TextClassificationPipeline

from datasets import Dataset

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

nltk.download('stopwords')

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
stopWords = set(stopwords.words("english"))

## Dataset and Pre-Processing

In [3]:
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train_df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [5]:
train_df.drop(columns=["location"], inplace=True)
test_df.drop(columns=["location"], inplace=True)
train_df = train_df[train_df["keyword"].isnull() == False]

In [6]:
def cleanAndPreProcess(sentence):
    sentence = str(sentence).lower()
    sentence = ' '.join(re.findall("[A-Za-z]{1,}", str(re.sub(r"http\S+", "", str(sentence)))))
    words = list(word_tokenize(sentence))
    list_words = [w for w in words if w not in stopWords]
    return ' '.join(list_words)

In [7]:
train_df["clean"] = train_df["text"].apply(cleanAndPreProcess)
test_df["clean"] = test_df["text"].apply(cleanAndPreProcess)

## Pytorch Custom Model

In [8]:
tokenizer = Tokenizer(num_words = 10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["clean"])

In [9]:
train_sequences = tokenizer.texts_to_sequences(train_df["clean"])
train_pad = pad_sequences(train_sequences, maxlen=40, padding="post", truncating="post")

In [10]:
test_sequences = tokenizer.texts_to_sequences(test_df["clean"])
test_pad = pad_sequences(test_sequences, maxlen=40, padding="post", truncating="post")

In [11]:
X_train = torch.Tensor(train_pad).type(torch.int)
y_train = torch.Tensor(train_df["target"].values)

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [12]:
class DisasterClassifier(nn.Module):
    
    def __init__(self):
        super(DisasterClassifier, self).__init__()
        
        self.embedding_layer = nn.Embedding(10000, 64)
        self.lstm = nn.LSTM(64, 64, bidirectional=True, batch_first=True)
        self.conv1d = nn.Conv1d(40, 64, 10)
        self.relu = nn.PReLU()
        self.flatten = nn.Flatten()
        self.dense1 = nn.Linear(7616, 512)
        self.dropout1 = nn.Dropout(0.2)
        self.dense2 = nn.Linear(512, 32)
        self.dropout2 = nn.Dropout(0.2)
        self.dense3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        embedding_out = self.embedding_layer(x)
        lstm_out,_ = self.lstm(embedding_out)
        conv_out = self.relu(self.conv1d(lstm_out))
        flat_out = self.flatten(conv_out)
        dense_out1 = self.dropout1(self.dense1(flat_out))
        dense_out2 = self.dropout2(self.dense2(dense_out1))
        out = self.sigmoid(self.dense3(dense_out2))
        return out

In [13]:
model = DisasterClassifier()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())
n_epochs = 25

In [14]:
for epoch in range(n_epochs):
    training_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.reshape(-1,1))
        loss.backward()
        optimizer.step()
        training_loss += loss.item()
    
    epoch_loss = training_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{n_epochs} - Loss: {epoch_loss}")

Epoch 1/25 - Loss: 0.6348030301979033
Epoch 2/25 - Loss: 0.4681513832282212
Epoch 3/25 - Loss: 0.3428021266551341
Epoch 4/25 - Loss: 0.23256399417769605
Epoch 5/25 - Loss: 0.16168837237834804
Epoch 6/25 - Loss: 0.12710489483818538
Epoch 7/25 - Loss: 0.09955678210530619
Epoch 8/25 - Loss: 0.07472047223453016
Epoch 9/25 - Loss: 0.06384552311363346
Epoch 10/25 - Loss: 0.05188424452587812
Epoch 11/25 - Loss: 0.05127861446866734
Epoch 12/25 - Loss: 0.053476772213628974
Epoch 13/25 - Loss: 0.04622272006692306
Epoch 14/25 - Loss: 0.04172813624177637
Epoch 15/25 - Loss: 0.0381583273432995
Epoch 16/25 - Loss: 0.03471686386827491
Epoch 17/25 - Loss: 0.03391814491571834
Epoch 18/25 - Loss: 0.058143377265477574
Epoch 19/25 - Loss: 0.09956703473783247
Epoch 20/25 - Loss: 0.10753664115413736
Epoch 21/25 - Loss: 0.057035554869244
Epoch 22/25 - Loss: 0.03783325441068204
Epoch 23/25 - Loss: 0.03198438651671202
Epoch 24/25 - Loss: 0.03143758807362832
Epoch 25/25 - Loss: 0.030601340922519246


In [15]:
model.eval()
with torch.no_grad():
    y_pred = model(X_train)
    y_pred = torch.round(y_pred)
    accuracy = (y_pred[:,0] == y_train).sum().item()
    accuracy = accuracy / len(y_train)
    print(f"Train Accuracy: {accuracy}")

Train Accuracy: 0.9850370762711864


In [16]:
model.eval()
with torch.no_grad():
    X_test = torch.Tensor(test_pad).type(torch.int)
    y_test_pred = model(X_test)
    y_test_pred = torch.round(y_test_pred).type(torch.int)

In [None]:
submission_df = pd.DataFrame({'id': test_df['id'], 'target': y_test_pred[:, 0]})
submission_df.to_csv('/kaggle/working/submission1_NLPwDT.csv',index=False)

## HuggingFace Transformers

In [17]:
df = train_df[["clean", "target"]]
df.rename(columns={"clean": "text", "target": "label"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"clean": "text", "target": "label"}, inplace=True)


In [18]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [19]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [20]:
df_train, df_val = train_test_split(df, test_size=0.2)

train_ds = Dataset.from_pandas(df_train, split="train")
test_ds = Dataset.from_pandas(df_val, split="test")

tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [21]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [22]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.

In [23]:
training_args = TrainingArguments(
    output_dir="/kaggle/working",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    report_to="none",
    evaluation_strategy ='steps',
    metric_for_best_model='f1',
    logging_steps=100,
    load_best_model_at_end=True,

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.556,0.459838,0.799471,0.749794,0.806394,0.700617
200,0.4504,0.447577,0.80278,0.763492,0.785948,0.742284
300,0.4296,0.42149,0.819987,0.776683,0.829825,0.729938
400,0.4236,0.428688,0.81403,0.77247,0.812606,0.736111
500,0.3728,0.429908,0.82131,0.782258,0.819257,0.748457
600,0.3805,0.447335,0.812707,0.782809,0.778626,0.787037
700,0.381,0.443359,0.825281,0.763016,0.912017,0.655864
800,0.3351,0.432634,0.829914,0.785297,0.856102,0.725309
900,0.3201,0.443711,0.830576,0.794543,0.827759,0.763889
1000,0.3238,0.465473,0.823958,0.791536,0.80414,0.779321


TrainOutput(global_step=3780, training_loss=0.24280354232384413, metrics={'train_runtime': 5754.3708, 'train_samples_per_second': 10.498, 'train_steps_per_second': 0.657, 'total_flos': 378581217617940.0, 'train_loss': 0.24280354232384413, 'epoch': 10.0})

In [24]:
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
scores = classifier(test_df["clean"].to_list())
scores_ = [int(label['label'].split("_")[1]) for label in scores]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
submission_df = pd.DataFrame({'id': test_df['id'], 'target': scores_})
submission_df.to_csv('/kaggle/working/submission1_NLPwDT.csv',index=False)