In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
import transformers
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.callbacks import EarlyStopping, Callback
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

  


In [3]:
# Read the competition's train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

In [4]:
# Display the (rows, columns) shape of the training frame.
train_df.shape

(7613, 5)

In [5]:
# Count how many rows carry each value of the "target" label.
train_df["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [6]:
# Features: every column except keyword, location and the label itself.
X_train = train_df.drop(["keyword", "location", "target"], axis="columns")
# Labels: the "target" column.
y_train = train_df.loc[:, "target"]

In [7]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
# The split is taken directly from train_df rather than from X_train/y_train,
# because this cell rebinds those two names: splitting from them would shrink
# the data again every time the cell is re-run.
X_train, X_validation, y_train, y_validation = train_test_split(
    train_df.drop(columns=["keyword", "location", "target"]),
    train_df["target"],
    test_size=0.2,
    random_state=30,
)

In [8]:
# Checkpoint name; reused below when the model is instantiated.
model_name = "distilbert-base-uncased"

# Tokenizer matching that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
# Put each split's labels back next to its features (rows are matched on the index).
train_data = X_train.join(y_train)
val_data = X_validation.join(y_validation)

In [10]:
# Convert each split to a datasets.Dataset and drop the '__index_level_0__'
# column (presumably the pandas index carried over by from_pandas — confirm).
train_dataset = Dataset.from_pandas(train_data).remove_columns(['__index_level_0__'])
val_dataset = Dataset.from_pandas(val_data).remove_columns(['__index_level_0__'])

In [11]:
# Group both splits under the keys "train" and "val".
dataset_dict = DatasetDict(train=train_dataset, val=val_dataset)

In [12]:
# Display the splits with their features and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'target'],
        num_rows: 6090
    })
    val: Dataset({
        features: ['id', 'text', 'target'],
        num_rows: 1523
    })
})

In [13]:
# Switch the output format in place to pandas; tokenize_batch relies on it
# (it calls .tolist() on the "text" column and iterates batch.items()).
dataset_dict.set_format(type="pandas")

In [14]:
def tokenize_batch(batch):
    """Tokenize the "text" column of a batch and keep the id/target columns.

    Args:
        batch: Column-name -> column mapping that supports ``batch["text"].tolist()``
            and ``batch.items()`` (e.g. a pandas DataFrame).

    Returns:
        dict: The ``id`` and ``target`` columns (when present) as Python lists,
        merged with every field returned by the module-level ``tokenizer``.
        Sequences are padded to the longest one in the batch.
    """
    # Ask the tokenizer for plain Python lists instead of building TF tensors
    # only to convert them straight back with .numpy().tolist().
    # truncation=True guards against a text longer than the model's maximum
    # input length; shorter texts are unaffected.
    encodings = tokenizer(
        batch["text"].tolist(),
        padding=True,
        truncation=True,
    )
    passthrough = {
        key: value.tolist()
        for key, value in batch.items()
        if key in ("id", "target")
    }
    encoded = {key: value for key, value in encodings.items()}
    return {**passthrough, **encoded}

In [15]:
# Tokenize both splits. With batched=True and batch_size=None each split is
# passed to tokenize_batch in one call (the progress output shows one batch per split).
data_encoded = dataset_dict.map(
    tokenize_batch,
    batched=True,
    batch_size=None,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
# Switch the encoded splits' output format in place to "tf".
data_encoded.set_format(type="tf")

In [17]:
class BertPretrainedForTweetClassification(tf.keras.Model):
    """Pretrained transformer encoder followed by a small dense classification head.

    The encoder's last hidden state at sequence position 0 is passed through
    dropout, two ReLU dense layers (512 and 32 units) and a sigmoid output layer.

    Args:
        model_name: Checkpoint name handed to ``TFAutoModel.from_pretrained``.
        num_classes: Number of units in the sigmoid output layer.
    """

    def __init__(self, model_name: str, num_classes: int):
        super().__init__()
        self.transformer = TFAutoModel.from_pretrained(model_name)
        self.dense = tf.keras.layers.Dense(512, activation="relu")
        self.dense2 = tf.keras.layers.Dense(32, activation="relu")
        self.dropout = tf.keras.layers.Dropout(0.4)
        self.output_layer = tf.keras.layers.Dense(num_classes, activation="sigmoid")

    def call(self, x, training=None):
        """Run the encoder and the classification head.

        Args:
            x: Model inputs, forwarded unchanged to the transformer.
            training: Keras training flag. It is forwarded explicitly to the
                encoder and to dropout so their train/inference behaviour does
                not depend on implicit propagation. ``None`` lets Keras decide.

        Returns:
            Sigmoid activations from the output layer, one row per input example.
        """
        outputs = self.transformer(x, training=training)
        # Keep only the hidden state at sequence position 0 (the variable name
        # suggests this is the [CLS] token).
        cls_last_hidden = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_last_hidden, training=training)
        x = self.dense(x)
        x = self.dense2(x)
        x = self.output_layer(x)

        return x

In [18]:
def convert_to_tf_dataset(dataset):
    """Split a column-indexable dataset into a ``(features, labels)`` pair.

    Args:
        dataset: Object indexable by the column names ``"input_ids"``,
            ``"attention_mask"`` and ``"target"``.

    Returns:
        tuple: A dict holding the ``input_ids`` and ``attention_mask`` columns,
        and the ``target`` column.
    """
    features = {
        column: dataset[column]
        for column in ("input_ids", "attention_mask")
    }
    return features, dataset["target"]

In [19]:
# Build one tf.data.Dataset of (features, label) elements per encoded split.
train_data_inputs, val_data_inputs = (
    tf.data.Dataset.from_tensor_slices(convert_to_tf_dataset(data_encoded[split_name]))
    for split_name in ("train", "val")
)

In [20]:
# Mini-batch size and shuffle-buffer size for the input pipelines.
batch_size, buffer_size = 32, 500

# Only the training stream is shuffled; validation is just batched.
# NOTE: both names are rebound from themselves, so re-running this cell would
# batch the already-batched datasets a second time.
train_data_inputs = train_data_inputs.shuffle(buffer_size)
train_data_inputs = train_data_inputs.batch(batch_size)
val_data_inputs = val_data_inputs.batch(batch_size)

In [21]:
# One sigmoid output unit: binary classification of the "target" label.
model = BertPretrainedForTweetClassification(model_name=model_name, num_classes=1)

Downloading tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_transform', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [22]:
# Stop when val_loss has not improved for 2 epochs and restore the best weights.
early_stopping_options = dict(
    monitor="val_loss",
    patience=2,
    verbose=1,
    restore_best_weights=True,
)
early_stopping = EarlyStopping(**early_stopping_options)

class LearningRatePrinter(Callback):
    """Keras callback that prints the optimizer's learning rate when each epoch starts."""

    def on_epoch_begin(self, epoch, logs=None):
        """Print the 1-based epoch number together with the current learning rate."""
        optimizer = self.model.optimizer
        lr = optimizer.learning_rate.numpy()
        print(f"Learning rate for epoch {epoch + 1}: {lr}")


lr_printer = LearningRatePrinter()

In [23]:
# Exponentially decaying learning rate, starting at 1e-5.
# NOTE(review): decay_steps is the number of training rows, not the number of
# optimizer steps (batches) per epoch — confirm this is the intended horizon.
lr_schedule = ExponentialDecay(
    initial_learning_rate=1e-5,
    decay_steps=len(train_dataset),
    decay_rate=1e-4,
)

In [24]:
# Adam driven by the decaying schedule; binary cross-entropy on the sigmoid output.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
bce_loss = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=adam_optimizer, loss=bce_loss, metrics=["accuracy"])

In [25]:
# Train for at most 10 epochs; early stopping may end the run sooner.
training_callbacks = [early_stopping, lr_printer]
history = model.fit(
    train_data_inputs,
    validation_data=val_data_inputs,
    epochs=10,
    callbacks=training_callbacks,
)

Learning rate for epoch 1: 9.999999747378752e-06
Epoch 1/10
Learning rate for epoch 2: 7.502486369048711e-06
Epoch 2/10
Learning rate for epoch 3: 5.6202247833425645e-06
Epoch 3/10
Learning rate for epoch 4: 4.210194219922414e-06
Epoch 4/10
Epoch 4: early stopping


In [26]:
# Take an explicit copy: a column is added to test_data later, and doing that
# on a slice of test_df raises pandas' SettingWithCopyWarning.
test_data = test_df[["id", "text"]].copy()

# Tokenize the test texts, padded to the longest one. truncation=True guards
# against a text longer than the model's maximum input length.
encoded_texts = tokenizer(
    test_data["text"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="tf",
)

In [27]:
# Feed the model the same keyed structure it was trained on (a dict with
# "input_ids" and "attention_mask") instead of a positional tuple, so inference
# does not depend on the encoder's positional-argument order.
inputs = {
    "input_ids": encoded_texts["input_ids"],
    "attention_mask": encoded_texts["attention_mask"],
}

In [28]:
# Run the trained model on the encoded test inputs; pred holds its sigmoid outputs.
pred = model.predict(inputs)



In [29]:
# Round the sigmoid outputs to 0/1 and flatten the (n, 1) prediction array to
# 1-D before attaching it. assign() returns a new frame instead of writing into
# test_data in place, so no SettingWithCopyWarning is raised and re-running the
# cell simply overwrites the column.
test_data = test_data.assign(target=np.round(pred).astype(int).ravel())
test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,id,text,target
0,0,Just happened a terrible car crash,1
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
3,9,Apocalypse lighting. #Spokane #wildfires,1
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1


In [30]:
# Keep only the id and predicted target columns and write them without the index.
submission_columns = ["id", "target"]
final_output = test_data[submission_columns]
final_output.to_csv("submission.csv", index=False)