# Disaster Tweets Classification using TinyBERT

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

!pip install evaluate
!pip install transformers datasets evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.5.1
    Uninstalling fsspec-2025.5.1:
      Successfully uninstalled fsspec-2025.5.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud

In [2]:
# Load the disaster-tweets CSV from GitHub, keeping only the tweet text and
# its target column.
df = pd.read_csv("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/twitter_disaster_tweets.csv", usecols=['text', 'target'])

# Preview the first rows.
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Count missing values per column. Only the last expression of a cell is
# echoed, so the result is passed to display() to make it visible.
display(df.isnull().sum())

# Shuffle the rows; the fixed random_state makes the resulting order
# reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# Class balance: number of tweets for each target value.
df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [5]:
from datasets import Dataset

# Rename the target column to 'label' before converting to a Dataset.
df = df.rename(columns={'target': 'label'})

dataset = Dataset.from_pandas(df)

# Hold out 20% of the rows for evaluation. The seed fixes the partition so
# the reported metrics can be reproduced on a fresh kernel.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Inspect one training example.
dataset['train'][0]

{'text': "A look at state actions a year after Ferguson's upheaval http://t.co/TBQsqtmqV4",
 'label': 0}

In [6]:
# Integer class id -> human-readable class name.
id2label = dict(enumerate(['general', 'disaster']))
# Inverse mapping, derived from id2label so the two cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

# Data Tokenization

In [7]:
from transformers import AutoTokenizer
import torch

# Hub checkpoint to fine-tune, and its matching fast tokenizer.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenize a single training example as a sanity check.
# NOTE(review): this is not the cell's last expression, so the result is
# computed but not displayed.
tokenizer(dataset['train'][0]['text'])

def tokenize(batch):
    """Encode the ``'text'`` entry of ``batch`` with the module-level ``tokenizer``.

    Args:
        batch: Mapping whose ``'text'`` entry holds the strings to encode.

    Returns:
        The tokenizer's output for those strings, with padding enabled and
        sequences truncated to at most 100 tokens.
    """
    return tokenizer(
        batch['text'],
        max_length=100,
        truncation=True,
        padding=True,
    )

# Tokenize both splits. With batched=True and batch_size=None each split is
# handed to tokenize() as one single batch.
dataset = dataset.map(tokenize, batched=True, batch_size=None)

# Show the resulting splits and their columns.
dataset

config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1523
    })
})

# Building Model Evaluation Functions

In [8]:
import evaluate
import numpy as np

# Accuracy metric loaded through `evaluate`; used by compute_metrics below.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the highest-scoring class against the labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predicted class for
            each row is the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids and
        the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

2025-09-23 20:55:36.378711: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758660936.693038      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758660936.783315      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading builder script: 0.00B [00:00, ?B/s]

# Model Building

In [9]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


# Load the checkpoint with a sequence-classification head sized to the label
# set, and attach the id<->name mappings to the model config.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=len(label2id), label2id=label2id, id2label=id2label)


# Training hyper-parameters and bookkeeping options.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy='epoch',  # run evaluation once per epoch
    logging_dir='./logs',
    logging_steps=10,  # log the training loss every 10 steps
    disable_tqdm=False,  # keep progress bars enabled
    report_to="none"  # do not report to any external experiment tracker
)

# NOTE(review): the held-out 'test' split is also the evaluation set used
# during training, so there is no separate untouched test set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    processing_class=tokenizer
)

# Fine-tune the model; the returned TrainOutput is echoed as the cell result.
trainer.train()

pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5264,0.490016,0.791858
2,0.4624,0.437219,0.823375
3,0.4479,0.428702,0.820749


TrainOutput(global_step=288, training_loss=0.5017589521076944, metrics={'train_runtime': 20.2862, 'train_samples_per_second': 900.613, 'train_steps_per_second': 14.197, 'total_flos': 42980020947360.0, 'train_loss': 0.5017589521076944, 'epoch': 3.0})

In [10]:
# Evaluate the fine-tuned model on the Trainer's eval_dataset (dataset['test']).
trainer.evaluate()

{'eval_loss': 0.42870232462882996,
 'eval_accuracy': 0.8207485226526592,
 'eval_runtime': 0.572,
 'eval_samples_per_second': 2662.476,
 'eval_steps_per_second': 41.956,
 'epoch': 3.0}

# Model Saving and Testing

In [13]:
# Output directory for the fine-tuned model. The variable is passed to
# save_model() instead of repeating the literal, so the path is defined once.
save_dir = 'tinybert-disaster-tweet'
trainer.save_model(save_dir)

In [14]:
from transformers import pipeline
import torch

#=== Testing ===#

# Hand-written sentences used as a quick smoke test of the saved model.
data = [
    'There is a fire in the building',
    'I am happy today',
    'I am sad today',
    'I am not feeling well',
    'There is a flood in the city, go to higher ground',
]

# Run inference on the GPU when available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build a text-classification pipeline from the saved model directory.
classifier = pipeline(
    'text-classification',
    model='tinybert-disaster-tweet',
    device=device,
)

classifier(data)

Device set to use cuda


[{'label': 'disaster', 'score': 0.8339528441429138},
 {'label': 'general', 'score': 0.8557567000389099},
 {'label': 'general', 'score': 0.849732518196106},
 {'label': 'general', 'score': 0.8563405275344849},
 {'label': 'disaster', 'score': 0.8347362279891968}]

In [15]:
import shutil

# === Save model + tokenizer ===
# Write the model and the tokenizer files into the same directory.
save_dir = 'tinybert-disaster-tweet'
trainer.save_model(save_dir)           
tokenizer.save_pretrained(save_dir)    

print(f"✅ Model and tokenizer saved in: {save_dir}")

# === Zip the folder ===
# make_archive appends '.zip' to its first argument, so the archive it
# writes has the same name as zip_filename.
zip_filename = f"{save_dir}.zip"
shutil.make_archive(save_dir, 'zip', save_dir)
print(f"✅ Model zipped at: {zip_filename}")

# === Download ===
# Render a clickable link to the archive in the notebook output.
from IPython.display import FileLink
display(FileLink(zip_filename))

✅ Model and tokenizer saved in: tinybert-disaster-tweet
✅ Model zipped at: tinybert-disaster-tweet.zip


# Upload to AWS S3

In [1]:
import boto3

# S3 client used as a module-level global by create_bucket and upload_folder.
s3 = boto3.client('s3')

# Name of the bucket that will hold the model files.
bucket_name = 'mlops-test-abdullah'

def create_bucket(bucket_name: str):
    """Create ``bucket_name`` in ``eu-north-1`` unless it is already listed.

    The bucket names returned by ``s3.list_buckets()`` are checked first; the
    bucket is created only when ``bucket_name`` is not among them. The outcome
    is printed. Any exception raised along the way is caught and printed
    rather than propagated.

    Args:
        bucket_name: Name of the bucket to look up or create.
    """
    try:
        existing_names = {entry['Name'] for entry in s3.list_buckets()['Buckets']}
        if bucket_name not in existing_names:
            s3.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': 'eu-north-1'},
            )
            print(f"Bucket \" {bucket_name} \" is created")
        else:
            print(f"Bucket \" {bucket_name} \" already exists")
    except Exception as e:
        print(f"Error in creating bucket: {e}")


# Make sure the target bucket exists (creates it when it is not listed yet).
create_bucket(bucket_name)

Bucket " mlops-test-abdullah " already exists


# Upload Model to S3 Bucket

In [None]:
# Destination bucket and the local directory that holds the saved model.
bucket_name = 'mlops-test-abdullah'
model_path = 'tinybert-disaster-tweet'

def upload_folder(Bucket, dir_path, s3_prefix, client=None):
    """Upload every file found under ``dir_path`` to an S3 bucket.

    The object key of each file is ``s3_prefix`` joined with the file's path
    relative to ``dir_path``, always using ``/`` as the separator. This keeps
    the sub-directory layout in the bucket, so files with the same name in
    different folders do not overwrite each other, and keys never contain
    backslashes on Windows.

    Args:
        Bucket: Name of the destination bucket.
        dir_path: Local directory that is walked recursively.
        s3_prefix: Key prefix placed in front of every uploaded object.
        client: Object exposing ``upload_file(filename, bucket, key)``.
            When omitted, the module-level ``s3`` client is used.

    Returns:
        None.
    """
    # Imported locally so the function does not rely on another cell having
    # imported these modules (this section runs on a fresh kernel).
    import os
    import posixpath

    if client is None:
        client = s3

    for root, _dirs, files in os.walk(dir_path):
        for file in files:
            filepath = os.path.join(root, file)
            # Path relative to the uploaded folder, split into components so
            # it can be re-joined with forward slashes for the object key.
            relative_parts = os.path.relpath(filepath, dir_path).split(os.sep)
            s3_key = posixpath.join(s3_prefix, *relative_parts)

            client.upload_file(filepath, Bucket, s3_key)

            
# Upload the saved model directory under the ml-models/ prefix of the bucket.
upload_folder(
    Bucket=bucket_name,
    dir_path=model_path,
    s3_prefix="ml-models/tinybert-disaster-tweet/",
)