In [1]:
from google.colab import files

uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [2]:
!kaggle datasets download -d vstepanenko/disaster-tweets

Dataset URL: https://www.kaggle.com/datasets/vstepanenko/disaster-tweets
License(s): CC0-1.0
Downloading disaster-tweets.zip to /content
  0% 0.00/656k [00:00<?, ?B/s]
100% 656k/656k [00:00<00:00, 31.5MB/s]


In [3]:
from zipfile import ZipFile
file_name = "/content/disaster-tweets.zip" # Changed the file name to the correct one.
with ZipFile(file_name,'r') as zip:
  zip.extractall()
  print('Done')

Done


In [3]:
!pip install pyarrow==17.0.0
!pip install transformers datasets




In [4]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, AdamW, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset, DatasetDict
from transformers import get_scheduler


In [5]:
# Load the dataset
data = pd.read_csv('/content/tweets.csv')

# Display the first few rows of the dataset
print("Original Data:")
print(data.head())

# Keep only the 'text' and 'target' columns
data = data[['text', 'target']]

# Display the first few rows to confirm the changes
print("Data after removing unnecessary columns:")
print(data.head())


Original Data:
   id keyword        location  \
0   0  ablaze             NaN   
1   1  ablaze             NaN   
2   2  ablaze   New York City   
3   3  ablaze  Morgantown, WV   
4   4  ablaze             NaN   

                                                text  target  
0  Communal violence in Bhainsa, Telangana. "Ston...       1  
1  Telangana: Section 144 has been imposed in Bha...       1  
2  Arsonist sets cars ablaze at dealership https:...       1  
3  Arsonist sets cars ablaze at dealership https:...       1  
4  "Lord Jesus, your love brings freedom and pard...       0  
Data after removing unnecessary columns:
                                                text  target
0  Communal violence in Bhainsa, Telangana. "Ston...       1
1  Telangana: Section 144 has been imposed in Bha...       1
2  Arsonist sets cars ablaze at dealership https:...       1
3  Arsonist sets cars ablaze at dealership https:...       1
4  "Lord Jesus, your love brings freedom and pard...       0


In [6]:
# Separate features and labels
X = data['text'].tolist()  # Convert to list
y = data['target'].tolist()  # Convert to list

# Print the lengths of features and labels
print(f"Number of samples: {len(X)}")
print(f"Number of labels: {len(y)}")


Number of samples: 11370
Number of labels: 11370


In [7]:

# Create a Hugging Face Dataset
dataset = Dataset.from_dict({'text': X, 'label': y})

In [8]:
# Split the dataset
split_dataset = dataset.train_test_split(test_size=0.2)


In [9]:
# Prepare datasets for Hugging Face
dataset_dict = DatasetDict({
    'train': split_dataset['train'],
    'test': split_dataset['test']
})


In [10]:
# Check the length of each split in the dataset
train_length = len(dataset_dict['train'])
test_length = len(dataset_dict['test'])

print(f"Training dataset length: {train_length}")
print(f"Test dataset length: {test_length}")


Training dataset length: 9096
Test dataset length: 2274


In [11]:
# Load the BERT tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize function for text
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding='max_length', max_length=128)

# Apply tokenization to the dataset
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Data collator to handle padding dynamically
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the BERT model for sequence classification
num_labels = len(set(y))  # Number of unique labels
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments
    train_dataset=tokenized_datasets["train"],         # training dataset
    eval_dataset=tokenized_datasets["test"],            # evaluation dataset
    data_collator=data_collator,        # data collator to handle dynamic padding
    tokenizer=tokenizer,                # tokenizer used for preprocessing
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/9096 [00:00<?, ? examples/s]

Map:   0%|          | 0/2274 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Train the model
trainer.train()

Step,Training Loss
10,0.6484
20,0.6098
30,0.5836
40,0.5698
50,0.531
60,0.4437
70,0.481
80,0.4534
90,0.5683
100,0.4442


TrainOutput(global_step=3411, training_loss=0.23508834635128428, metrics={'train_runtime': 772.4416, 'train_samples_per_second': 35.327, 'train_steps_per_second': 4.416, 'total_flos': 1794943619665920.0, 'train_loss': 0.23508834635128428, 'epoch': 3.0})

In [13]:
# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

Evaluation Results: {'eval_loss': 0.4227490723133087, 'eval_runtime': 15.1952, 'eval_samples_per_second': 149.653, 'eval_steps_per_second': 18.756, 'epoch': 3.0}


In [14]:
# Predict on the test dataset
predictions = trainer.predict(tokenized_datasets["test"])
preds = predictions.predictions.argmax(axis=-1)

In [15]:
# Print confusion matrix
from sklearn.metrics import confusion_matrix
# Extract labels for the test set
y_test = dataset_dict["test"]['label']
cm = confusion_matrix(y_test, preds)
print(f"Confusion Matrix:\n{cm}")


Confusion Matrix:
[[1714  127]
 [  90  343]]


In [17]:
import numpy as np
# Predict on the test dataset
predictions = trainer.predict(tokenized_datasets["test"])
preds = predictions.predictions.argmax(axis=-1)

# Extract true labels
y_test = np.array(dataset_dict["test"]['label'])

# Calculate accuracy
accuracy = accuracy_score(y_test, preds)
print(f"Test Accuracy: {accuracy:.4f}")



Test Accuracy: 0.9046


In [18]:
# Save the model
model.save_pretrained('senti_model')
tokenizer.save_pretrained('senti_model_tokenization')


('senti_model_tokenization/tokenizer_config.json',
 'senti_model_tokenization/special_tokens_map.json',
 'senti_model_tokenization/vocab.txt',
 'senti_model_tokenization/added_tokens.json',
 'senti_model_tokenization/tokenizer.json')

In [20]:
import shutil

# Zip the model directory
shutil.make_archive('senti_model', 'zip', 'senti_model')
shutil.make_archive('senti_model_tokenization', 'zip', 'senti_model_tokenization')



'/content/senti_model_tokenization.zip'

In [21]:
from google.colab import files

# Download the zipped model file
files.download('senti_model.zip')

# Download the zipped tokenizer file
files.download('senti_model_tokenization.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>