 #### <p style="background-color: #EDE7F6;color:#6600ff;display: inline-block;padding:.6rem;border-radius:.5rem">Import libraries</p>

In [1]:
import os

# Must run before the first Keras / KerasNLP import in the next cell: the
# backend is chosen from this environment variable.
os.environ["KERAS_BACKEND"] = "tensorflow"

In [2]:
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

 #### <p style="background-color: #EDE7F6;color:#6600ff;display: inline-block;padding:.6rem;border-radius:.5rem">Settings</p>

In [None]:
# Let DataFrame displays show up to 100 rows/columns and up to 1000 characters
# per cell, so long tweet texts are less likely to be truncated.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 1000

# Silence the two UserWarnings whose text starts with these messages
# (`message` is matched as a regular expression against the warning text).
warnings.filterwarnings("ignore", message="`Model.state_updates`", category=UserWarning)
warnings.filterwarnings("ignore", message="`layer.updates`", category=UserWarning)

# Seed the Python, NumPy and backend random generators so that weight
# initialisation, dropout and batch shuffling are as repeatable as the
# backend allows when the notebook is re-run from a fresh kernel.
SEED = 42
keras.utils.set_random_seed(SEED)

 #### <p style="background-color: #EDE7F6;color:#6600ff;display: inline-block;padding:.6rem;border-radius:.5rem">Import data files</p>

In [3]:
# Load the labelled training tweets and the unlabelled test tweets from the
# working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# memory_usage() is summed over the index and all columns, then converted
# from bytes to megabytes.
bytes_per_mb = 1024**2
train_mem_mb = df_train.memory_usage().sum() / bytes_per_mb
test_mem_mb = df_test.memory_usage().sum() / bytes_per_mb

print('Training Set Shape = {}'.format(df_train.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(train_mem_mb))
print('Test Set Shape = {}'.format(df_test.shape))
print('Test Set Memory Usage = {:.2f} MB'.format(test_mem_mb))

Training Set Shape = (7613, 5)
Training Set Memory Usage = 0.29 MB
Test Set Shape = (3263, 4)
Test Set Memory Usage = 0.10 MB


### <p style="background-color: #EDE7F6; color: #6600ff;margin:0; display:inline-block;padding:.6rem;border-radius:.25rem;">I. DATA PREPARATION</p>

#### <p style="background-color: #EDE7F6; color: #6600ff;margin:0; display:inline-block;padding:.6rem;border-radius:.25rem;">A. EDA (Exploratory Data Analysis)</p>

In [None]:
# Preview the first five training rows.
df_train.head(5)

In [4]:
# Character count of every tweet. The vectorised `.str.len()` accessor replaces
# the per-row Python lambda and yields NaN (instead of raising) for a missing
# text value.
df_train["length"] = df_train["text"].str.len()
df_test["length"] = df_test["text"].str.len()

print("Train Length Stat")
print(df_train["length"].describe())
print()

print("Test Length Stat")
print(df_test["length"].describe())

Train Length Stat
count    7613.000000
mean      101.037436
std        33.781325
min         7.000000
25%        78.000000
50%       107.000000
75%       133.000000
max       157.000000
Name: length, dtype: float64

Test Length Stat
count    3263.000000
mean      102.108183
std        33.972158
min         5.000000
25%        78.000000
50%       109.000000
75%       134.000000
max       151.000000
Name: length, dtype: float64


In [5]:
# Number of labelled training examples.
len(df_train)

7613

## <p style="background-color: #EDE7F6; color: #6600ff;margin:0; display:inline-block;padding:.6rem;border-radius:.25rem;">MODEL ENGINEERING</p>

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 32
NUM_TRAINING_EXAMPLES = df_train.shape[0]
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2
# Truncate the (float) number of training rows to an int *before* the floor
# division so the result is an integer step count; floor-dividing the float
# product directly yields a float such as 190.0.
STEPS_PER_EPOCH = int(NUM_TRAINING_EXAMPLES * TRAIN_SPLIT) // BATCH_SIZE

EPOCHS = 2
# `tf.data.AUTOTUNE` is the stable name of the deprecated
# `tf.data.experimental.AUTOTUNE` alias.
AUTO = tf.data.AUTOTUNE

In [None]:
from sklearn.model_selection import train_test_split

# Unlabelled tweets to predict on for the submission.
X_test = df_test["text"]

# Features are the raw tweet texts; the label is the `target` column.
X, y = df_train["text"], df_train["target"]

# Hold out VAL_SPLIT of the labelled rows for validation; the fixed
# random_state keeps the partition identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_SPLIT,
    random_state=42,
)

In [None]:
import os
import sys

# Name of the pretrained DistilBERT preset; the same preset is used for the
# preprocessor and for the classifier below.
preset = "distil_bert_base_en_uncased"

# Preprocessor configured with a shorter sequence length (160); the longest
# tweet in the length statistics above is 157 characters.
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    sequence_length=160,
    name="preprocessor_4_tweets",
)

# Pretrained classifier with a two-class output, wired to the preprocessor
# above so it can be fed raw strings.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    preprocessor=preprocessor,
    num_classes=2,
)
classifier.summary()

In [None]:
from keras.optimizers import Adam

# Compile: Adam with a learning rate of 1e-5; the loss is told that the model
# outputs logits, and the targets are integer class ids (sparse labels).
classifier.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

# Fit on the training split and report metrics on the held-out validation
# split after every epoch; `history` keeps the object returned by fit().
history = classifier.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

In [None]:
# Loss and accuracy of the fitted classifier on the validation split.
classifier.evaluate(x=X_val, y=y_val)

In [None]:
from sklearn.metrics import f1_score


def _predict_labels(model, texts):
    """Return the predicted class index for each entry of ``texts``.

    The arg-max is taken over axis 1 of ``model.predict(texts)``, i.e. over
    the per-class scores of each input.
    """
    return np.argmax(model.predict(texts), axis=1)


# Ground-truth labels and predicted labels for the training split.
train_true = y_train.copy()
train_pred = _predict_labels(classifier, X_train)

# Same for the validation split.
val_true = y_val.copy()
val_pred = _predict_labels(classifier, X_val)

train_f1_score = f1_score(train_true, train_pred)
val_f1_score = f1_score(val_true, val_pred)

print("F1-score metric on train data == {}".format(train_f1_score))
print("F1-score metric on validation dataset == {}".format(val_f1_score))

In [None]:
# The template lives in the Kaggle input directory when run on Kaggle. When
# that directory is absent, fall back to the working directory, which is where
# train.csv and test.csv are read from earlier in the notebook.
KAGGLE_INPUT_DIR = "/kaggle/input/nlp-getting-started"
submission_dir = KAGGLE_INPUT_DIR if os.path.isdir(KAGGLE_INPUT_DIR) else "."

sample_submission = pd.read_csv(os.path.join(submission_dir, "sample_submission.csv"))
sample_submission.head()

In [None]:
# Per-class scores for every test tweet; the arg-max over axis 1 is the
# predicted class written into the submission's `target` column.
test_scores = classifier.predict(X_test)
sample_submission["target"] = np.argmax(test_scores, axis=1)
sample_submission.head()

In [None]:
# Horizontal bar chart of how many test tweets were assigned to each class.
predicted_class_counts = sample_submission["target"].value_counts()
predicted_class_counts.plot.barh()
plt.show()

In [None]:
# Write the predictions to disk. This only saves the file; uploading it to the
# competition is a separate step, so the message says "saved", not "submitted".
SUBMISSION_PATH = "submission.csv"
sample_submission.to_csv(SUBMISSION_PATH, index=False)
print("Submission file saved to {}".format(SUBMISSION_PATH))

# Report the size of the written file itself rather than the in-memory
# footprint of the DataFrame.
print("Submission size: {:.2f} KB".format(os.path.getsize(SUBMISSION_PATH) / 1024))