# In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


import os
# Walk '/kaggle/input' recursively and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



#########################################################################################

import random
import tensorflow as tf
import numpy as np
import os
from transformers import set_seed



# Seed every random number generator used by this script with one shared value
# so that repeated runs are as reproducible as possible.
SEED = 42

random.seed(SEED)         # Python's built-in `random` module
np.random.seed(SEED)      # NumPy's global RNG (one call is sufficient)
tf.random.set_seed(SEED)  # TensorFlow's global seed
set_seed(SEED)            # seeding helper imported from `transformers`

# Environment flag requesting deterministic TensorFlow op implementations.
# NOTE(review): confirm the installed TensorFlow version still honours this
# variable when it is set after `import tensorflow`.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
#########################################################################################

import pandas as pd

# Read train.csv and test.csv (in that order) into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

# Preview the first few rows of the training data.
train_preview = train_data.head()
print(train_preview)

#########################################################################################
import re # Regular Expression

# Patterns are compiled once at import time and applied in this order.
_URL_RE = re.compile(r'http\S+')        # URLs
_MENTION_RE = re.compile(r'@\w+')       # @mentions
_DIGITS_RE = re.compile(r'\d+')         # runs of digits
_PUNCT_RE = re.compile(r'[^\w\s#]')     # punctuation, keeping '#' for hashtags


def clean_text(text):
    """Return a normalized, lowercased copy of *text*.

    URLs, @mentions, digits and punctuation (except ``#``) are removed, in
    that order, and the result is lowercased. Whitespace is left untouched,
    so removed tokens may leave consecutive spaces behind.

    Args:
        text: The raw string to clean. Any non-string value (for example
            ``None`` or the float NaN pandas uses for a missing cell) is
            treated as missing.

    Returns:
        The cleaned string, or ``''`` when *text* is not a string.
    """
    # A missing cell would otherwise make the regex calls raise TypeError.
    if not isinstance(text, str):
        return ''

    for pattern in (_URL_RE, _MENTION_RE, _DIGITS_RE, _PUNCT_RE):
        text = pattern.sub('', text)
    return text.lower()

# Add a 'clean_text' column to both DataFrames by running clean_text over
# every entry of their 'text' column (training data first, then test data).
for frame in (train_data, test_data):
    frame['clean_text'] = frame['text'].apply(clean_text)

# Show the raw and cleaned text side by side for the first few training rows.
comparison_columns = ['text', 'clean_text']
print(train_data[comparison_columns].head())


#########################################################################################
from transformers import BertTokenizer

# Tokenizer loaded from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_texts(texts, max_length=64):
    """Tokenize a collection of strings with the module-level ``tokenizer``.

    Args:
        texts: The strings to encode. Either an object exposing ``tolist()``
            (such as a pandas Series) or any other iterable of strings.
        max_length: Upper bound passed to the tokenizer as ``max_length``;
            with ``truncation=True`` longer inputs are cut to this length.
            Defaults to 64.

    Returns:
        The tokenizer's output, requested as TensorFlow tensors
        (``return_tensors='tf'``) and padded with ``padding=True``.
    """
    # Accept plain lists/tuples as well as array-like objects with tolist().
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    return tokenizer(
        text_list,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors='tf',
    )

# Encode the cleaned text of both DataFrames (training first, then test).
train_encodings, test_encodings = map(
    tokenize_texts,
    (train_data['clean_text'], test_data['clean_text']),
)


#########################################################################################

# import matplotlib.pyplot as plt

# # Tokenize the clean text without padding to get the length of each tweet
# train_data['token_length'] = train_data['clean_text'].apply(lambda x: len(tokenizer.encode(x, add_special_tokens=True)))
# test_data['token_length'] = test_data['clean_text'].apply(lambda x: len(tokenizer.encode(x, add_special_tokens=True)))

# # Plot the distribution of token lengths
# plt.hist(train_data['token_length'], bins=50, alpha=0.7, label='Train')
# plt.hist(test_data['token_length'], bins=50, alpha=0.7, label='Test')
# plt.axvline(x=128, color='r', linestyle='--', label='MAX_LEN = 128')
# plt.xlabel('Token Length')
# plt.ylabel('Frequency')
# plt.legend()
# plt.show()

# # Display some statistics
# print("Train token length statistics:")
# print(train_data['token_length'].describe())

# print("\nTest token length statistics:")
# print(test_data['token_length'].describe())


#########################################################################################
import tensorflow as tf

# Labels as a tensor, in the same row order as train_data / train_encodings.
train_labels = tf.convert_to_tensor(train_data['target'].values)

# Permute the rows once, with a fixed seed, BEFORE splitting. Taking the first
# 20% of rows in file order only gives a representative validation set if the
# CSV happens to be randomly ordered; a fixed permutation removes that
# assumption while keeping the split reproducible and the two parts disjoint.
num_examples = len(train_data)
shuffled_rows = np.random.default_rng(42).permutation(num_examples)

train_dataset = tf.data.Dataset.from_tensor_slices((
    {
        name: tf.gather(tensor, shuffled_rows)
        for name, tensor in dict(train_encodings).items()
    },
    tf.gather(train_labels, shuffled_rows),
))

# Create a validation split: the first 20% of the permuted rows.
val_size = int(0.2 * num_examples)
val_dataset = train_dataset.take(val_size)
train_dataset = train_dataset.skip(val_size)

# Batch and shuffle the datasets
batch_size = 32

# Only the training set is shuffled between epochs; with fewer than 10000
# training examples the buffer covers the whole set.
train_dataset = train_dataset.shuffle(10000).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)


#########################################################################################

from transformers import TFBertForSequenceClassification, BertConfig

# Build, compile and train the classifier inside a MirroredStrategy scope.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Model configuration and creation: 'bert-base-uncased' with two labels.
    config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    model = TFBertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        config=config,
    )

    # Model compilation. The loss is configured with from_logits=True, i.e.
    # it is given unnormalized scores rather than probabilities.
    compile_options = dict(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-8),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )
    model.compile(**compile_options)

    # Model training: three epochs, evaluated on val_dataset after each one.
    history = model.fit(train_dataset, validation_data=val_dataset, epochs=3)


#########################################################################################
# Batch the tokenized test inputs (features only; no labels are paired in).
# Reuses `batch_size` from the training pipeline instead of repeating the
# literal 32, so the two cannot drift apart.
test_dataset = tf.data.Dataset.from_tensor_slices(
    dict(test_encodings)
).batch(batch_size)

# Take the `.logits` of the prediction output and pick, for every row, the
# index of the largest score as the predicted label.
predictions = model.predict(test_dataset).logits
predicted_labels = tf.argmax(predictions, axis=1).numpy()

#########################################################################################

# Build the submission table: one row per test id with its predicted label,
# then write it to CSV without the index column.
submission = pd.DataFrame({'id': test_data['id']})
submission['target'] = predicted_labels
submission.to_csv('submission_2_kaggle.csv', index=False)

#########################################################################################
