Deep Learning for Disaster Tweet Classification - Machine learning Final 
Charisma Ricarte
Trieu Do
Jonathan Garcia	

Dataset source: https://www.kaggle.com/datasets/vstepanenko/disaster-tweets?select=tweets.csv 


In [13]:
import torch

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))

CUDA available: True
Device: NVIDIA A30


In [2]:
import pandas as pd
dataset = pd.read_csv("tweets.csv", # the location to the data file
                       sep=",", nrows = 10000
                       )
dataset

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0
...,...,...,...,...,...
9995,9995,terrorism,,3yrs after IPOB formed Biafra security Service...,0
9996,9996,terrorism,www,France agrees to send more troops to West Afri...,1
9997,9997,terrorism,USA,"While the press feasts off a tiny ""he-said, sh...",0
9998,9998,terrorism,North Pole,‚óè NEWS ‚óè #meduza #russia ‚òû Man who made Russia...,0


In [3]:
# clean text of symbols and non-letters, etc. 
# should help standardize words, for example: "FIRE!!!", "fire.", and "fireüî•" now all map to "fire"
# helps prevent un needed token usage 

import re        # for regular expressions (text cleaning)

def clean_text(t):
    t = t.lower()
    t = re.sub(r"http\S+", "", t)  # remove URLs
    t = re.sub(r"@\w+", "", t)     # remove mentions
    t = re.sub(r"#", "", t)        # remove hashtag symbols (keep the word)
    t = re.sub(r"[^a-z\s]", "", t) # remove non-letters
    return t.strip()
dataset["clean_text"] = dataset["text"].apply(clean_text)

In [4]:
# import libraries to clean and prepare our dataset for our models - Bag of words binary feature matrix 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
vectorizer = CountVectorizer(binary=True, stop_words="english") # 1 or 0 indicating if word appears in tweet and removes english words like "the", "and" and "is"
X = vectorizer.fit_transform(dataset["clean_text"]) # converts each tweet into a vector of 0s and 1s
df_tf = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out()) # takes every word and places it as a column name 
df_tf # predictors (independent variables)

Unnamed: 0,aa,aaaaaaaaacccccckkkkkkkk,aab,aadharcard,aalaathun,aampe,aampes,aap,aaron,aayega,...,zonal,zone,zoo,zoom,zoomedin,zorro,zuckerberg,zulaykhas,zuma,zw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Decided to try a different vectorizer - TF-IDF - weighted feature matrix 
This vectorizer will assign weights to words. Words with more importance get a higher weight while words with less importance get a smaller weight. 
It performs much better than the binary vectorizer above. There are more meaningful words found, and this will help in training models. 

In [5]:
vectorizer = TfidfVectorizer(
    norm="l2",
    stop_words="english",
    ngram_range=(1,2),      # include unigrams + bigrams
    max_features=20000,     # cap vocab size - helps remove characters or words that have no meaning or very rare occurence
    lowercase=True
)
X = vectorizer.fit_transform(dataset["clean_text"])  # use your cleaned text column

# take a look at the first 100 rows of new dataframe 
pd.DataFrame(X[:100].toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,aab battle,aadharcard,aadharcard ad,aalaathun,aalaathun balaa,aampe,aampe life,aampes,aampes queen,aap,...,zombies run,zombies time,zombievirus,zonal,zonal cooling,zone,zoo,zoom,zuma,zuma did
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Partition the data set
# create pipeline to prevent data leakage 
# Setup baseline - logistic regression 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_text = dataset["clean_text"]  
y = dataset["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X_text, y, test_size=0.20, random_state=123, stratify=y
)

# Logistic Regression model- baseline

pipe = make_pipeline(
    TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_features=20000),
    LogisticRegression(max_iter=1000, class_weight="balanced")
)

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.942     0.910     0.926      1647
           1      0.638     0.739     0.685       353

    accuracy                          0.880      2000
   macro avg      0.790     0.825     0.805      2000
weighted avg      0.889     0.880     0.883      2000



0 = Non-disaster tweets 
1 = Disaster Tweets 
Precision = out of all tweets predicted as ‚Äúdisaster,‚Äù how many actually were?
Recall = out of all real disaster tweets, how many did the model correctly identify?
F1-score = harmonic mean of precision & recall (balances both)
Support = number of true samples in that class

1st run 
The LR model is performing well overall for a 70/20 split 
It does really well at finding non-disaster tweets with an F1-score of 92% and ok at finding disaster tweets at 67% 
The support reveals that there were over 2400 samples of 

              precision    recall  f1-score   support

           0      0.936     0.913     0.924      2470
           1      0.635     0.708     0.669       530

    accuracy                          0.876      3000
   macro avg      0.785     0.810     0.797      3000
weighted avg      0.882     0.876     0.879      3000


2nd run - will use this one for baseline
LR model baseline improved with an 80/20 split. 
The F1 score improved for disaster tweet detection by almost 1 point. Accuracy, macro avg, and weighted average also improved slightly. 

 precision    recall  f1-score   support

           0      0.940     0.909     0.924      1647
           1      0.631     0.728     0.676       353

    accuracy                          0.877      2000
   macro avg      0.786     0.818     0.800      2000
weighted avg      0.885     0.877     0.880      2000

Setup environment for Transformer Model 

install transformer library

open terminal 


conda activate # shouldnt need this - > <your-env-name>
    
conda install -c huggingface transformers datasets accelerate

In [31]:
import os, textwrap, sys

print("Repo root contents:", os.listdir("."))
if "src" in os.listdir("."):
    print("src contents:", os.listdir("src"))
else:
    print("No src folder found")

# Quick search for anything with 'transformer' in the code
!grep -R "transformer" -n src || echo "No 'transformer' string found in src"

Repo root contents: ['tweets.csv', 'src', 'Disaster Tweets ML Final.ipynb', 'Untitled.ipynb', 'main.py', 'requirements.txt', 'README.md', '.ipynb_checkpoints', 'training_loss_curve.png', 'data', '__pycache__', 'eda_target_distribution.png', 'metrics_summary.png']
src contents: ['utils.py', 'train_eval.py', 'model.py', '__pycache__', 'preprocess.py']

Binary file src/__pycache__/preprocess.cpython-310.pyc matches
Binary file src/__pycache__/preprocess.cpython-312.pyc matches
Binary file src/__pycache__/model.cpython-310.pyc matches
Binary file src/__pycache__/model.cpython-312.pyc matches



# install transformer library
open terminal 
cd ml_final
conda create --prefix ./env python=3.10 -y
conda activate ./env 
conda install -c conda-forge transformers -y
conda install scikit-learn -y
conda install matplotlib -y

In [37]:
import sys
print("Python executable for this notebook:", sys.executable)

# Install transformers into THIS exact Python env
!{sys.executable} -m pip install transformers

Python executable for this notebook: /scratch/user/u.jg335414/.conda/envs/pytorch_env/bin/python


In [36]:
!{sys.executable} -m pip install datasets accelerate



In [44]:

!conda update -n base -c defaults conda -y


Collecting package metadata (current_repodata.json): done
Solving environment: - 
  - defaults/linux-64::markdown-3.8-py311h06a4308_0, defaults/linux-64::markdown-it-py-2.2.0-py311h06a4308_1
  - defaults/linux-64::markdown-3.4.1-py311h06a4308_0, defaults/linux-64::markdown-it-py-4.0.0-py311h06a4308done


  current version: 23.7.4
  latest version: 25.9.1

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=25.9.1



## Package Plan ##

  environment location: /sw/eb/sw/Anaconda3/2023.09-0

  added / updated specs:
    - conda


The following NEW packages will be INSTALLED:

  aiodns             pkgs/main/linux-64::aiodns-3.5.0-py311h06a4308_1 
  aiohappyeyeballs   pkgs/main/linux-64::aiohappyeyeballs-2.6.1-py311h06a4308_0 
  astropy-iers-data  pkgs/main/linux-64::astropy-iers-data-0.2025.11.10.0.38.31-py311h06a4308_0 
  async-lru          pkgs/main/linux-64::async-l

In [39]:
!conda install -c huggingface transformers -y

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.7.4
  latest version: 25.9.1

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=25.9.1



## Package Plan ##

  environment location: /sw/eb/sw/Anaconda3/2023.09-0

  added / updated specs:
    - transformers


The following packages will be UPDATED:

  ca-certificates                     2023.08.22-h06a4308_0 --> 2025.11.4-h06a4308_0 
  certifi                         2023.7.22-py311h06a4308_0 --> 2025.11.12-py311h06a4308_0 
  openssl                                 3.0.10-h7f8727e_2 --> 3.0.18-hd6dcaed_0 



Downloading and Extracting Packages

Preparing transaction: done
Verifying transaction: failed

EnvironmentNotWritableError: The current user does not have write permissions to the target environment.
  environment location: /sw/eb/sw/Anaconda3/2023.09

In [45]:
# import libraries needed for transformer model 
import transformers
import preprocess
import model
import train_eval

print("All imports succeeded ‚úÖ")

ModuleNotFoundError: No module named 'preprocess'

In [None]:
import sys, os

# Make sure we're in the repo root 
print("CWD:", os.getcwd())
print("Contents:", os.listdir("."))

# So that `src.*` imports work
sys.path.append(os.path.abspath("."))

from src.preprocess import load_and_preprocess
from src.model import create_model
from src.train_eval import train_model
from src.utils import plot_metrics

In [37]:
# Run transformer model 
# 1. Load and preprocess the data
train_enc, val_enc, train_labels, val_labels, num_labels = load_and_preprocess("data/tweets.csv")

# 2. Create the BERT model
model = create_model(num_labels)

# (Optional) explicitly move to GPU, if train_model doesn't already do it inside
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Using device:", device)

# 3. Train the model
train_losses, val_accs = train_model(
    model,
    train_enc,
    val_enc,
    train_labels,
    val_labels,
)

# 4. Plot metrics (this likely saves a PNG and/or shows a plot)
plot_metrics(train_losses, val_accs)

print("Training complete. Metrics and plots saved.")

Reading file from: /ccstar/scratch/user/u.jg335414/ml_final/ML_DisasterResponse-main/data/tweets.csv
Dataset shape: (11370, 5)
   id keyword        location  \
0   0  ablaze             NaN   
1   1  ablaze             NaN   
2   2  ablaze   New York City   
3   3  ablaze  Morgantown, WV   
4   4  ablaze             NaN   

                                                text  target  
0  Communal violence in Bhainsa, Telangana. "Ston...       1  
1  Telangana: Section 144 has been imposed in Bha...       1  
2  Arsonist sets cars ablaze at dealership https:...       1  
3  Arsonist sets cars ablaze at dealership https:...       1  
4  "Lord Jesus, your love brings freedom and pard...       0  


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


Epoch 1/2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 569/569 [00:46<00:00, 12.14it/s]


Epoch 1 - Loss: 0.2796, Val Acc: 0.8993


Epoch 2/2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 569/569 [00:47<00:00, 12.10it/s]


Epoch 2 - Loss: 0.1498, Val Acc: 0.9103
Training complete. Metrics and plots saved.


In [38]:
import inspect
import src.train_eval as train_eval

print(inspect.signature(train_eval.train_model))

(model, train_enc, val_enc, train_labels, val_labels, epochs=2, batch_size=16, lr=2e-05)


In [39]:
train_losses, val_accs = train_model(
    model,
    train_enc,
    val_enc,
    train_labels,
    val_labels,
    epochs=3,
    batch_size=16,
)

Epoch 1/3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 569/569 [00:46<00:00, 12.24it/s]


Epoch 1 - Loss: 0.0675, Val Acc: 0.9099


Epoch 2/3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 569/569 [00:46<00:00, 12.15it/s]


Epoch 2 - Loss: 0.0353, Val Acc: 0.8936


Epoch 3/3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 569/569 [00:46<00:00, 12.11it/s]


Epoch 3 - Loss: 0.0229, Val Acc: 0.9028


In [18]:
import preprocess
import model
import train_eval

print("preprocess:", dir(preprocess)[:20])
print("model:", dir(model)[:20])
print("train_eval:", dir(train_eval)[:20])

preprocess: ['BertTokenizer', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'load_and_preprocess', 'os', 'pd', 'plt', 'train_test_split']
model: ['BertForSequenceClassification', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'create_model', 'nn', 'torch']
train_eval: ['DataLoader', 'TensorDataset', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'plt', 'torch', 'tqdm', 'train_model']


In [1]:
import inspect
import src.train_eval as tv

print(inspect.getsource(tv.train_model))

def train_model(model, train_enc, val_enc, train_labels, val_labels, epochs=2, batch_size=16, lr=2e-5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fn = torch.nn.CrossEntropyLoss()

    # Prepare data loaders
    train_data = TensorDataset(train_enc["input_ids"], train_enc["attention_mask"], torch.tensor(train_labels))
    val_data = TensorDataset(val_enc["input_ids"], val_enc["attention_mask"], torch.tensor(val_labels))

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size)

    train_losses, val_accs = [], []

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            b_input_ids, b_attn_mask, b_labels = [t.to(device) for t in batch]

            optimizer.zero_grad()
        

In [25]:
import preprocess

result = preprocess.load_and_preprocess("tweets.csv")
print("Number of returned objects:", len(result))
for i, item in enumerate(result):
    print(i, type(item))

ModuleNotFoundError: No module named 'preprocess'

In [5]:
tokenizer = tokenizer  # from your earlier code
print(tokenizer)

NameError: name 'tokenizer' is not defined