<a href="https://colab.research.google.com/github/Aiolos19/Colab-ML/blob/main/Natural_Language_Processing_with_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'nlp-disaster:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4995674%2F8397204%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240515%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240515T035957Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D52791f952749700d572d515055f8dd270497a67b767d3732f2b027bdebf86459259de7d3a64ee774d52895beda62d68dc89072638eab4fe33d14db653212a58e967ae0d86db37f20bc575c930ca2a0e55982b189a058718b5f0be160d253054d1db8f72f62432792d79b31c2382d6d245f2d2592b92841c6124c0b33b1f6306f565702a2e04ca4857f691c6de80eff4a3ef048773f21fc8bf69b6c6186a9f4c6d27314788dd2d5f4e7e29b3e670f3d677c73e7984a002ac27d712197ece7ebbd6b00e4c599c09c1046ee27ea36b058704378ced46a2f85f1256b99a7ff4bf9fa690275f99f67fbf661518940b0d792202f30a50d38aa5ea5cc9f3ad54aa35abf'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading nlp-disaster, 607343 bytes compressed
Downloaded and uncompressed: nlp-disaster
Data source import complete.


In [2]:
import tensorflow as tf

tf.keras.backend.clear_session()

In [3]:
import zipfile
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
train = pd.read_csv('/kaggle/input/nlp-disaster/train.csv')
test = pd.read_csv('/kaggle/input/nlp-disaster/test.csv')

* id - a unique identifier for each tweet
* text - the text of the tweet
* location - the location the tweet was sent from (may be blank)
* keyword - a particular keyword from the tweet (may be blank)
* target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)

In [4]:
train.sample(5)

Unnamed: 0,id,keyword,location,text,target
4230,6007,hazardous,United States,MEG issues Hazardous Weather Outlook (HWO) ht...,1
7081,10142,upheaval,maryland,A Look at State Actions a Year after Ferguson...,1
4438,6315,hostage,"ÌÏT: 40.562796,-75.488849",Murfreesboro peeps- I'm hearing Walmart on S R...,1
3346,4790,evacuated,Manchester,Trafford Centre film fans angry after Odeon ci...,1
334,481,armageddon,"Castaic, CA",Armageddon averted by El Patron\n#UltimaLucha,0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
test.sample(5)

Unnamed: 0,id,keyword,location,text
1946,6563,injury,Las Vegas,New post: Cowboys believe Lance Dunbar&amp;#03...
135,428,apocalypse,,Also my other nephew is proof that fat babies ...
113,376,annihilation,,@jackienatalydlt I do.... I only get the iced ...
1950,6581,injury,,Ben Heenan carted off the field at @Colts trai...
1362,4484,electrocuted,mullingar ireland,@hairdryer180 you'll get electrocuted


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [8]:
train[train["target"] == 0]["text"].values[2]

'Summer is lovely'

In [9]:
train[train["target"] == 1]["text"].values[2]

"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"

In [10]:
# Preprocessing
def preprocess_text(text):
# bisa tambahkan teks processing data
    return text

train['text'] = train['text'].apply(preprocess_text)
test['text'] = test['text'].apply(preprocess_text)

In [11]:
# Split data set
X_train, X_val, y_train, y_val = train_test_split(
    train['text'], train['target'], test_size=0.2, random_state=42)

In [12]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
# Tokenizing the datasets
def tokenize_data(text_list):
    return tokenizer(
        text_list.tolist(), padding=True, truncation=True, return_tensors="pt")

train_encodings = tokenize_data(X_train)
val_encodings = tokenize_data(X_val)
test_encodings = tokenize_data(test['text'])

In [14]:
# Convert to torch tensors
train_labels = torch.tensor(y_train.values)
val_labels = torch.tensor(y_val.values)

In [15]:
# Create PyTorch dataset
class DisasterDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = DisasterDataset(train_encodings, train_labels)
val_dataset = DisasterDataset(val_encodings, val_labels)
test_dataset = DisasterDataset(test_encodings)

In [16]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
import transformers
import accelerate
print(transformers.__version__)
print(accelerate.__version__)

4.40.2
0.30.1


In [18]:
# Set up Trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [19]:
# Train the model
trainer.train()

# Evaluate the model
val_predictions = trainer.predict(val_dataset)
val_preds = np.argmax(val_predictions.predictions, axis=-1)
print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print(classification_report(y_val, val_preds))


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Step,Training Loss,Validation Loss
10,0.7233,0.704169
20,0.6893,0.698383
30,0.696,0.689965
40,0.6864,0.6757
50,0.6748,0.665119
60,0.6489,0.650916
70,0.6387,0.630499
80,0.5973,0.605467
90,0.5891,0.562897
100,0.6564,0.534468


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Validation Accuracy: 0.8214051214707814
              precision    recall  f1-score   support

           0       0.85      0.84      0.84       874
           1       0.79      0.79      0.79       649

    accuracy                           0.82      1523
   macro avg       0.82      0.82      0.82      1523
weighted avg       0.82      0.82      0.82      1523



In [20]:
# Predict on test data
test_predictions = trainer.predict(test_dataset)
test_preds = np.argmax(test_predictions.predictions, axis=-1)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [21]:
# Create submission file
submission = pd.DataFrame({'id': test['id'], 'target': test_preds})
submission.to_csv('NLP Disaster.csv', index=False)