In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
# Load train.csv from the nlp-getting-started input directory into a DataFrame.
train_data = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Count the rows whose 'text' value repeats that of an earlier row.
train_data['text'].duplicated().sum()

In [None]:
# drop_duplicates returns a new frame, so the result has to be assigned back;
# otherwise train_data keeps every duplicate. Only the first row of each
# repeated 'text' is kept, and the index is reset so it stays contiguous.
train_data = train_data.drop_duplicates('text', keep='first').reset_index(drop=True)
train_data.shape

In [None]:
# Fixed sequence length in tokens: the tokenizer calls pad/truncate to this
# length and the Keras input layers are declared with this shape.
MAX_LEN = 30

In [None]:
from transformers import BertTokenizer
from transformers import AutoTokenizer
import tensorflow as tf

In [None]:

# Name of the pretrained checkpoint; `task` selects the variant.
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Tokenizer loaded for that same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
def preprocess(text):
    """Normalise a tweet before tokenisation.

    The text is split on whitespace and each token is handled as follows:
      * tokens that start with "@" and are longer than one character
        (mentions) are dropped; a lone "@" is kept,
      * tokens that start with "http" are dropped,
      * every "#" character is removed from the token,
      * the token is lower-cased.

    Args:
        text: Raw tweet text.

    Returns:
        The surviving tokens joined by single spaces, with no leading,
        trailing or repeated whitespace.
    """
    kept = []
    # split() without an argument splits on any run of whitespace, so a
    # mention or URL that follows a newline or tab is recognised as well.
    for token in text.split():
        if token.startswith('@') and len(token) > 1:
            continue
        if token.startswith('http'):
            continue
        token = token.replace("#", "").lower()
        # Skip tokens that became empty (e.g. a bare "#") so that the join
        # below never produces consecutive spaces.
        if token:
            kept.append(token)
    return " ".join(kept)

In [None]:
# Run preprocess over every training tweet, overwrite the 'text' column with
# the cleaned strings, then preview the result.
train_data['text'] = train_data['text'].map(preprocess)
train_data.head()

In [None]:
# Tokenise the training tweets into TensorFlow tensors of length MAX_LEN.
train_encoded_inputs = tokenizer(
    train_data['text'].tolist(),
    max_length=MAX_LEN,
    padding='max_length',          # pad shorter texts up to max_length
    truncation=True,               # cut longer texts down to max_length
    add_special_tokens=True,
    return_token_type_ids=False,
    return_tensors='tf',
)


In [None]:
# Display the encoded training inputs.
train_encoded_inputs


In [None]:
# Pair the encoded inputs with the 'target' labels, one example per element.
# The tokenizer output is converted to a plain dict first, the same way the
# test inputs are handled, so tf.data receives a standard mapping of tensors
# rather than the tokenizer's own container type.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encoded_inputs), train_data['target'].values)
)

In [None]:
def map_roberta(inputs, labels):
    """Reduce a feature mapping to the two entries fed to the model.

    Args:
        inputs: Mapping that contains at least the keys 'input_ids' and
            'attention_mask'.
        labels: Labels for the same example(s); passed through untouched.

    Returns:
        A ``(features, labels)`` tuple, where ``features`` is a new dict
        holding only the 'input_ids' and 'attention_mask' entries.
    """
    wanted_keys = ('input_ids', 'attention_mask')
    features = {key: inputs[key] for key in wanted_keys}
    return features, labels

In [None]:
# Apply map_roberta to every (inputs, labels) element of the dataset.
train_dataset = train_dataset.map(map_roberta)

In [None]:
# Shuffle once, in a fixed order, then group into batches of 64.
# reshuffle_each_iteration must be False: this dataset is later divided with
# take()/skip(), and the default behaviour (a fresh shuffle on every pass)
# would move examples between the training and validation portions from one
# epoch to the next, leaking validation data into training. The seed makes the
# order reproducible across runs.
dataset = train_dataset.shuffle(
    100000, seed=42, reshuffle_each_iteration=False
).batch(64)

In [None]:
# Length of the batched dataset, i.e. the number of batches.
datalen = len(dataset)

In [None]:
# Fraction of the batches that go to training; the rest are held out.
SPLIT = 0.9
n_train_batches = round(datalen * SPLIT)

# NOTE(review): take()/skip() give a stable, disjoint split only if `dataset`
# yields its batches in the same order on every iteration - confirm that the
# upstream shuffle does not reshuffle per pass.
train_ds = dataset.take(n_train_batches)
val_ds = dataset.skip(n_train_batches)

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
# Load the pretrained sequence-classification model for the checkpoint MODEL.
roberta_model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)

# Save a local copy under a directory named after the checkpoint. The tokenizer
# is saved next to the model: once this directory exists, from_pretrained(MODEL)
# resolves to it, and a directory that holds only the model files would make
# the tokenizer load fail when the notebook is run again from the top.
roberta_model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

In [None]:
# Two integer inputs of length MAX_LEN, named after the keys of the feature
# dict produced by map_roberta.
input_ids = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=np.int32, name='input_ids')
mask = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=np.int32, name='attention_mask')

# Only the first element of the pretrained model's output is used
# (presumably the classification logits - TODO confirm).
x = roberta_model(input_ids, attention_mask=mask)[0]

x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Flatten()(x)

# Dense head: four ReLU layers of decreasing width.
for units in (256, 128, 64, 32):
    x = tf.keras.layers.Dense(units, activation='relu')(x)

# output layer
y = tf.keras.layers.Dense(1, activation='sigmoid')(x)

# create the model
final_model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

In [None]:
# Print the layer-by-layer summary of the assembled model.
final_model.summary()

In [None]:
# Freeze the pretrained model so that only the dense head on top is trained.
# It is addressed through its own variable instead of a position in
# final_model.layers: a positional index would silently freeze a different
# layer if the architecture above were ever rearranged.
roberta_model.trainable = False
final_model.summary()

In [None]:
# Training set-up for a single sigmoid output: Adam optimizer, binary
# cross-entropy loss, binary accuracy reported under the name 'accuracy'.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)
loss = tf.keras.losses.BinaryCrossentropy()
metric = tf.keras.metrics.BinaryAccuracy('accuracy')

final_model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[metric],
)

In [None]:
# Both callbacks monitor the validation loss and ignore changes below 0.001.
# NOTE(review): with epochs=3, an EarlyStopping patience of 5 can never
# trigger - confirm whether more epochs or a smaller patience was intended.
callbacks = [
    # Multiply the learning rate by 0.1 after 2 epochs without improvement.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.1, patience=2, min_delta=0.001
    ),
    # Stop training after 5 epochs without improvement.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=5, min_delta=0.001
    ),
]

history = final_model.fit(
    train_ds,
    epochs=3,
    validation_data=val_ds,
    callbacks=callbacks,
)

In [None]:
# Load test.csv from the nlp-getting-started input directory into a DataFrame.
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
# Clean the test tweets with the same preprocess function used for training.
test_df['text'] = test_df['text'].map(preprocess)

In [None]:
# Tokenise the test tweets with the same settings as the training tweets.
test_encoded_inputs = tokenizer(
    test_df['text'].tolist(),
    max_length=MAX_LEN,
    padding='max_length',          # pad shorter texts up to max_length
    truncation=True,               # cut longer texts down to max_length
    add_special_tokens=True,
    return_token_type_ids=False,
    return_tensors='tf',
)

In [None]:
# Build a label-free dataset from the encoded test inputs, converted to a
# plain dict first; each element is one example's feature dict.
test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_encoded_inputs))

In [None]:
# Batch the test set in its original row order. It must NOT be shuffled: the
# predictions are matched to test_df['id'] purely by position when the
# submission is built, so any reordering assigns labels to the wrong ids.
test_ds = test_dataset.batch(64)
test_pred = final_model.predict(test_ds)

In [None]:
# Round each predicted probability to the nearest whole number and collapse
# the result to one dimension.
rounded_pred = np.round(test_pred)
test_target = rounded_pred.flatten()

In [None]:
# Display the rounded predictions.
test_target

In [None]:
# Build the submission file. The label column is called 'target', matching the
# label column read from train.csv ('Survived' belongs to a different
# competition). Labels are written as integers rather than 0.0 / 1.0.
# NOTE(review): confirm the expected header against sample_submission.csv.
output = pd.DataFrame({'id': test_df['id'], 'target': test_target.astype(int)})
output.to_csv('submission.csv', index=False)