<a href="https://www.kaggle.com/code/esraanafaa/disaster-tweet-prediction-with-transfomers?scriptVersionId=127862449" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
import transformers
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.callbacks import EarlyStopping, Callback
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

  


In [3]:
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [4]:
train_df.shape

(7613, 5)

In [5]:
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [6]:
X_train = train_df.drop(columns=["keyword", "location", "target"])
y_train = train_df["target"]

In [7]:
X_train.shape

(7613, 2)

In [8]:
X_train, X_validation, y_train, y_validation = train_test_split(X_train,y_train, test_size=0.2, random_state =30)

In [9]:
model_name="distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_validation, y_validation], axis=1)

In [11]:
train_data.head()

Unnamed: 0,id,text,target
434,630,Alleged East Bay serial arsonist arrested #San...,1
4696,6676,So cool @GarbanzoBean23 in the news! Cutest IN...,0
612,883,#anthrax #bioterrorism CDC To Carry Out Extens...,1
4705,6689,Listen to Landslide by Oh Wonder #SoundCloud h...,0
4830,6877,This Attempted Mass Murder brought to You by t...,0


In [12]:
train_dataset = Dataset.from_pandas(train_data)
val_dataset=Dataset.from_pandas(val_data)
train_dataset = train_dataset.remove_columns(['__index_level_0__'])
val_dataset = val_dataset.remove_columns(['__index_level_0__'])

In [13]:
train_dataset

Dataset({
    features: ['id', 'text', 'target'],
    num_rows: 6090
})

In [14]:
dataset_dict = DatasetDict({"train": train_dataset, "val":val_dataset})

In [15]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'target'],
        num_rows: 6090
    })
    val: Dataset({
        features: ['id', 'text', 'target'],
        num_rows: 1523
    })
})

In [16]:
dataset_dict.set_format("pandas")

In [17]:
def tokenize_batch(batch):
    encodings = tokenizer(
    batch["text"].tolist(),
    padding=True,
    return_tensors="tf",
    )
    batch_dict = {key: value.tolist() for key, value in batch.items() if key in ['id','target']}
    encodings_dict = {key:value.numpy().tolist() for key, value in encodings.items()}
    print(batch_dict)
    print("***********************************")
    print(encodings_dict)
    return {**batch_dict, **encodings_dict}

In [18]:
data_encoded = dataset_dict.map(tokenize_batch, batched=True, batch_size =None)

  0%|          | 0/1 [00:00<?, ?ba/s]

{'id': [630, 6676, 883, 6689, 6877, 7247, 10113, 5699, 2566, 8374, 7135, 9841, 5467, 9960, 2975, 8131, 7226, 9906, 3850, 2175, 8620, 5870, 6043, 438, 4655, 3873, 5625, 2071, 5711, 550, 7988, 4798, 8638, 1411, 8176, 5335, 1780, 5544, 9998, 5375, 4864, 9292, 2159, 5695, 9007, 1237, 2157, 5929, 9249, 503, 6465, 5567, 6004, 7548, 9414, 4914, 1218, 7703, 10120, 3180, 6105, 10352, 5545, 1259, 8570, 9898, 285, 10329, 1242, 9553, 7457, 2399, 10318, 9983, 868, 8973, 74, 7288, 5502, 2896, 3446, 10647, 1422, 4735, 982, 6398, 2661, 3006, 7959, 1720, 2724, 2171, 8393, 2885, 4533, 4542, 4604, 3572, 8175, 6560, 795, 3530, 4499, 6477, 4915, 2832, 7410, 1307, 8444, 8221, 7542, 6945, 4274, 6454, 6340, 5622, 2414, 7978, 280, 6056, 4210, 8721, 856, 8742, 1532, 9044, 52, 4620, 9453, 8105, 1570, 7965, 4779, 4470, 267, 9667, 8936, 8635, 1803, 6474, 8906, 10049, 4688, 940, 1356, 5384, 8392, 600, 9766, 3144, 1485, 1969, 9087, 8480, 8464, 3259, 3743, 10101, 1618, 9022, 1029, 8071, 5626, 3610, 8610, 2142, 8510, 

  0%|          | 0/1 [00:00<?, ?ba/s]

{'id': [4318, 10306, 8900, 3289, 1716, 3430, 3124, 6130, 1002, 3756, 6963, 4398, 8747, 3661, 2609, 656, 3566, 8345, 4012, 2916, 3471, 7603, 4819, 3362, 6197, 959, 6015, 9636, 9293, 8727, 4788, 2711, 2629, 3947, 3337, 4320, 8005, 7475, 3637, 5248, 7277, 10677, 1491, 3519, 6572, 5484, 4534, 2091, 145, 4412, 2891, 1932, 1252, 477, 7408, 1666, 6695, 4266, 589, 1473, 490, 9702, 7491, 7228, 7560, 2839, 2128, 10744, 2367, 5171, 10832, 3857, 10492, 4671, 3848, 8791, 4755, 4778, 2022, 2024, 7360, 3091, 3417, 8683, 1230, 9704, 8235, 7571, 8205, 9901, 9812, 10372, 3936, 5119, 7120, 1168, 6022, 1071, 10197, 4602, 4926, 4401, 5944, 7654, 1480, 8883, 4219, 5949, 5989, 8401, 1369, 7356, 4553, 10239, 9374, 3157, 4440, 6303, 8526, 3123, 7590, 9912, 6407, 9972, 5754, 9861, 5196, 791, 2870, 10195, 102, 2590, 451, 4094, 5392, 6395, 10017, 8500, 6128, 4869, 10795, 6317, 2255, 4675, 6687, 468, 2457, 4975, 1841, 6699, 10513, 8547, 6773, 7106, 6950, 10283, 14, 3462, 6234, 5675, 3052, 8767, 4430, 6613, 1406, 7

In [19]:
data_encoded.set_format("tf")

In [20]:
class BertPretrainedForTweetClassification(tf.keras.Model):
    def __init__(self, model_name: str , num_classes: int):
        super().__init__()
        self.transformer = TFAutoModel.from_pretrained(model_name)
        self.dense = tf.keras.layers.Dense(512, activation="relu")
        self.dense2 = tf.keras.layers.Dense(64, activation="relu")
        self.dropout = tf.keras.layers.Dropout(0.4)
        self.output_layer = tf.keras.layers.Dense(num_classes, activation="sigmoid")
    
    def call(self,x):
        outputs = self.transformer(x)
        cls_last_hidden = outputs.last_hidden_state[:,0,:]
        x = self.dropout(cls_last_hidden)
        x = self.dense(x)
        x = self.dense2(x)
        x = self.output_layer(x)
        
        return x

In [21]:
def convert_to_tf_dataset(dataset):
    input_ids = dataset["input_ids"]
    attention_mask = dataset["attention_mask"]
    target = dataset["target"]
    return {"input_ids":input_ids, "attention_mask":attention_mask}, target

In [22]:
train_data_inputs = tf.data.Dataset.from_tensor_slices(convert_to_tf_dataset(data_encoded["train"]))
val_data_inputs = tf.data.Dataset.from_tensor_slices(convert_to_tf_dataset(data_encoded["val"]))

In [23]:
batch_size =32
buffer_size=500
train_data_inputs = train_data_inputs.shuffle(buffer_size).batch(batch_size)
val_data_inputs = val_data_inputs.batch(batch_size)

In [24]:
model = BertPretrainedForTweetClassification(model_name, 1)

Downloading tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [25]:
early_stopping = EarlyStopping(
  monitor="val_loss",
  patience=2,
  verbose=1,
  restore_best_weights=True
)

class LearningRatePrinter(Callback):
    def on_epoch_begin(self, epoch, logs=None):
        lr = self.model.optimizer.learning_rate.numpy()
        print(f"Learning rate for epoch {epoch + 1}: {lr}")
        
lr_printer = LearningRatePrinter()

In [26]:
lr_schedule = ExponentialDecay(1e-5, len(train_dataset), 1e-4)

In [27]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

In [28]:
history = model.fit(train_data_inputs,
          epochs=10,
          validation_data=val_data_inputs,
          callbacks=[early_stopping, lr_printer])

Learning rate for epoch 1: 9.999999747378752e-06
Epoch 1/10
Learning rate for epoch 2: 7.502486369048711e-06
Epoch 2/10
Learning rate for epoch 3: 5.6202247833425645e-06
Epoch 3/10
Learning rate for epoch 4: 4.210194219922414e-06
Epoch 4/10
Epoch 4: early stopping


In [29]:
test_data = test_df[["id","text"]]
encoded_texts = tokenizer(
list(test_data["text"].values),
padding=True,
return_tensors="tf",
)

In [30]:
inputs = (
encoded_texts["input_ids"],
encoded_texts["attention_mask"],
)

In [31]:
pred = model.predict(inputs)



In [32]:
test_data["target"] = np.round(pred).astype(int)
test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,id,text,target
0,0,Just happened a terrible car crash,1
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
3,9,Apocalypse lighting. #Spokane #wildfires,1
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1


In [33]:
final_output = test_data[["id","target"]]
final_output.to_csv("submission.csv", index= False)