### Import Libraries

In [28]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, DataLoader, TensorDataset


In [4]:
np.random.seed(0)
torch.manual_seed(0)

# Check if CUDA is available
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

torch.set_default_device(device)
print(f"Using device = {torch.get_default_device()}")

Using device = cpu


### Read in dataset

In [5]:
file_path_csv = "crisisnlp-disaster/cleaned_combined_dataset.csv"

df = pd.read_csv(file_path_csv)
df.head(10)

Unnamed: 0,text,label,text_clean,text_lists
0,tappanakii with happybirthday,0,tappanakii happybirthday,"['tappanakii', 'happybirthday']"
1,why you should start paying attention to pro s...,0,start paying attention pro softball sports earth,"['start', 'paying', 'attention', 'pro', 'softb..."
2,me before sending a risky text vs. me after it...,0,sending risky text vs pays,"['sending', 'risky', 'text', 'vs', 'pays']"
3,capt.erwin 👫💑,0,capterwin,['capterwin']
4,lot of blackberry lane winter theme cross stit...,0,lot blackberry lane winter theme cross stitch ...,"['lot', 'blackberry', 'lane', 'winter', 'theme..."
5,thinking about pulling up to philly tomorrow,0,thinking pulling philly tomorrow,"['thinking', 'pulling', 'philly', 'tomorrow']"
6,my nail hurts so bad.,0,nail hurts bad,"['nail', 'hurts', 'bad']"
7,that's good enough for me!,0,thats good enough,"['thats', 'good', 'enough']"
8,"usa bargains apple ipad air b retina t gen, wi...",0,usa bargains apple ipad air b retina gen wi-fi...,"['usa', 'bargains', 'apple', 'ipad', 'air', 'b..."
9,start buying aaa and indie games at ridiculous...,0,start buying aaa indie games ridiculous low pr...,"['start', 'buying', 'aaa', 'indie', 'games', '..."


In [None]:
def remove_punc_caps(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s-]|(?<=\s)-|-(?=\s)', '', text)
    return text

raw_text = df.copy()
raw_text = raw_text.dropna().reset_index(drop=True)
raw_text.head(10)

Unnamed: 0,text,label,text_clean,text_lists
0,tappanakii with happybirthday,0,tappanakii happybirthday,"['tappanakii', 'happybirthday']"
1,why you should start paying attention to pro s...,0,start paying attention pro softball sports earth,"['start', 'paying', 'attention', 'pro', 'softb..."
2,me before sending a risky text vs. me after it...,0,sending risky text vs pays,"['sending', 'risky', 'text', 'vs', 'pays']"
3,capt.erwin 👫💑,0,capterwin,['capterwin']
4,lot of blackberry lane winter theme cross stit...,0,lot blackberry lane winter theme cross stitch ...,"['lot', 'blackberry', 'lane', 'winter', 'theme..."
5,thinking about pulling up to philly tomorrow,0,thinking pulling philly tomorrow,"['thinking', 'pulling', 'philly', 'tomorrow']"
6,my nail hurts so bad.,0,nail hurts bad,"['nail', 'hurts', 'bad']"
7,that's good enough for me!,0,thats good enough,"['thats', 'good', 'enough']"
8,"usa bargains apple ipad air b retina t gen, wi...",0,usa bargains apple ipad air b retina gen wi-fi...,"['usa', 'bargains', 'apple', 'ipad', 'air', 'b..."
9,start buying aaa and indie games at ridiculous...,0,start buying aaa indie games ridiculous low pr...,"['start', 'buying', 'aaa', 'indie', 'games', '..."


### Split dataset into train and test

In [7]:
x, y = raw_text['text_clean'].values, raw_text['label'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
print(f'shape of train data is {x_train.shape}')
print(f'shape of test data is {x_test.shape}')

shape of train data is (143052,)
shape of test data is (47685,)


In [None]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 6
HIDDEN_LAYERS = 6

def sentence_to_indices(sentence, word_to_ix):
    return [word_to_ix[word] for word in sentence.split() if word in word_to_ix]

# Function to pad sequences
def pad_sequence(sequence, seq_length):
    if len(sequence) < seq_length:
        sequence = sequence + [0] * (seq_length - len(sequence))  # Padding with 0s
    else:
        sequence = sequence[:seq_length]  # Truncating if too long
    return sequence


In [None]:
cv = CountVectorizer(min_df=20)
cv_matrix = cv.fit_transform(x_train.tolist())
all_texts_word_to_ix = cv.vocabulary_

vocab_size = len(all_texts_word_to_ix)

print("Vocabulary Size:", vocab_size)


Vocabulary Size: 6375


In [11]:
def process_data(text, label, word_to_ix, seq_length):
    indices = sentence_to_indices(text, word_to_ix)  # Convert text to indices
    if not indices:
        return None, None  # Skip empty sequences
    
    padded_sequence = pad_sequence(indices, seq_length)
    input_tensor = torch.tensor(padded_sequence, dtype=torch.long)
    label_tensor = torch.tensor(label, dtype=torch.long)
    return input_tensor, label_tensor

In [12]:
seq_length = 50

# Process training data
train_data = [process_data(text, label, all_texts_word_to_ix, seq_length) for text, label in zip(x_train, y_train)]
train_inputs, train_labels = zip(*[d for d in train_data if d[0] is not None])
train_input_tensor = torch.stack(train_inputs)
train_label_tensor = torch.stack(train_labels)

# Process test data
test_data = [process_data(text, label, all_texts_word_to_ix, seq_length) for text, label in zip(x_test, y_test)]
test_inputs, test_labels = zip(*[d for d in test_data if d[0] is not None])
test_input_tensor = torch.stack(test_inputs)
test_label_tensor = torch.stack(test_labels)

# Print shapes to verify correctness
print("Train Input Tensor:", train_input_tensor.size())
print("Train Label Tensor:", train_label_tensor.size())
print("Test Input Tensor:", test_input_tensor.size())
print("Test Label Tensor:", test_label_tensor.size())

Train Input Tensor: torch.Size([140247, 50])
Train Label Tensor: torch.Size([140247])
Test Input Tensor: torch.Size([46732, 50])
Test Label Tensor: torch.Size([46732])


In [13]:
# Create DataLoader for batching
batch_size = 128
train_dataset = TensorDataset(train_input_tensor, train_label_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_input_tensor, test_label_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [16]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim=6):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.embedding = nn.Embedding(vocab_size, input_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, h0=None, c0=None):
        x = self.embedding(x)
        if h0 is None or c0 is None:
            h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(x.device)
            c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(x.device)
        
        out, (hn, cn) = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])

        probabilities = self.log_softmax(out)

        return probabilities, hn, cn

In [17]:
model = LSTMModel(input_dim=64, hidden_dim=128, layer_dim=2, output_dim=6).to(device)
#criterion = nn.CrossEntropyLoss()
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [18]:
train_label_tensor = train_label_tensor.to(torch.long).to(device)
train_input_tensor = train_input_tensor.to(torch.long).to(device)
train_dataset = TensorDataset(train_input_tensor, train_label_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

In [23]:
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch_inputs, batch_labels in train_loader:
        #batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
        optimizer.zero_grad()

        outputs, h0, c0 = model(batch_inputs)

        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Convert output probabilities to class indices
        preds = torch.argmax(outputs, dim=1)  # Get highest probability class
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    #precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}, '
              f'Accuracy: {accuracy:.4f}')


Epoch [10/50], Loss: 0.0054, Accuracy: 0.9983
Epoch [20/50], Loss: 0.0046, Accuracy: 0.9985
Epoch [30/50], Loss: 0.0039, Accuracy: 0.9986
Epoch [40/50], Loss: 0.0035, Accuracy: 0.9988
Epoch [50/50], Loss: 0.0036, Accuracy: 0.9988


### Save the model

In [24]:
checkpoint = {
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "input_dim": 64,
    "hidden_dim": 128,
    "layer_dim": 2,
    "output_dim": 6,
    "vocab_size": vocab_size,
    "epochs": num_epochs
}

torch.save(checkpoint, "lstm_checkpoint.pth")

### Load the model

In [None]:
checkpoint = torch.load("lstm_checkpoint.pth")

# Recreate the model using the saved metadata
model = LSTMModel(
    input_dim=checkpoint["input_dim"],
    hidden_dim=checkpoint["hidden_dim"],
    layer_dim=checkpoint["layer_dim"],
    output_dim=checkpoint["output_dim"]
).to(device)

# Load the trained weights
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()  # Set to evaluation mode

# Reload optimizer state if continuing training
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

### Test the Model

In [25]:
test_label_tensor = test_label_tensor.to(torch.long).to(device)
test_input_tensor = test_input_tensor.to(torch.long).to(device)
test_dataset = TensorDataset(test_input_tensor, test_label_tensor)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=True, pin_memory=True)

In [30]:
model.eval()  # Set model to evaluation mode (important!)

all_preds = []
all_labels = []

with torch.no_grad():  # Disable gradient calculation for faster evaluation
    for batch_inputs, batch_labels in test_loader:
        batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)

        outputs, _, _ = model(batch_inputs)  # Get model predictions
        preds = torch.argmax(outputs, dim=1)  # Convert probabilities to class predictions

        all_preds.extend(preds.cpu().numpy())  # Move to CPU for metrics
        all_labels.extend(batch_labels.cpu().numpy())

# Compute metrics
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
report = classification_report(all_labels, all_preds)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print("Classification Report:\n", report)


Test Accuracy: 0.9581
Test Precision: 0.9580
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97     23762
           1       0.96      0.95      0.95      5346
           2       0.94      0.91      0.92      3776
           3       0.96      0.93      0.95     10913
           4       0.94      0.92      0.93      1476
           5       0.94      0.94      0.94      1459

    accuracy                           0.96     46732
   macro avg       0.95      0.94      0.94     46732
weighted avg       0.96      0.96      0.96     46732

