In [None]:
%pip install keras


import random
import tensorflow as tf
import numpy as np
import os
from transformers import set_seed


# Seed every random number generator used in this notebook with one shared
# value so that repeated runs start from the same RNG state.
SEED = 42

random.seed(SEED)         # Python's built-in RNG
np.random.seed(SEED)      # NumPy's global RNG
tf.random.set_seed(SEED)  # TensorFlow's global seed
set_seed(SEED)            # seeding helper imported from transformers

# Request deterministic TensorFlow op implementations.
# NOTE(review): this is set after `import tensorflow` has already run; confirm
# that TensorFlow still honours the variable when set at this point.
os.environ['TF_DETERMINISTIC_OPS'] = '1'



import pandas as pd
import keras

# Read the train and test CSV files into DataFrames (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)



###########################################


import re # Regular Expression

def clean_text(text):
    """Return a normalized, lowercase copy of *text*.

    The cleaning steps run in this order:

    1. drop URLs (anything starting with ``http`` up to the next whitespace,
       matched case-insensitively),
    2. drop ``@mentions``,
    3. drop digit runs,
    4. drop every character that is not a word character, whitespace or ``#``
       (so hashtags keep their marker),
    5. lowercase the result.

    Whitespace is not collapsed, so removed tokens may leave consecutive
    spaces behind.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Lowercasing happens last, so the URL pattern must ignore case itself;
    # otherwise "HTTP://..." links would survive as stripped-down junk.
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)  # Remove URLs
    text = re.sub(r'@\w+', '', text)      # Remove mentions
    text = re.sub(r'\d+', '', text)       # Remove numbers
    text = re.sub(r'[^\w\s#]', '', text)  # Remove punctuation except hashtags
    return text.lower()                   # Convert to lowercase

# Add a 'clean_text' column (the cleaned version of 'text') to the training
# frame first and then to the testing frame.
for frame in (train_data, test_data):
    frame['clean_text'] = frame['text'].apply(clean_text)



###########################################



from transformers import BertTokenizer

# Tokenizer loaded from the 'bert-base-uncased' pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_texts(texts):
    """Run the module-level ``tokenizer`` over a collection of strings.

    Args:
        texts: An object exposing ``.tolist()``; in this file it is always a
            DataFrame column of cleaned text.

    Returns:
        Whatever ``tokenizer`` returns when called with truncation enabled,
        a maximum length of 64, padding turned on and TensorFlow tensors
        requested.
    """
    sentences = texts.tolist()
    encode_options = {
        'max_length': 64,
        'padding': True,
        'truncation': True,
        'return_tensors': 'tf',
    }
    return tokenizer(sentences, **encode_options)

# Tokenize the cleaned text of each split (train first, then test)
train_encodings, test_encodings = (
    tokenize_texts(split['clean_text']) for split in (train_data, test_data)
)



###########################################

import tensorflow as tf

# Pair the tokenizer outputs with the 'target' column as (features, label) examples
train_labels = tf.convert_to_tensor(train_data['target'].values)
train_features = dict(train_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

# Hold out the first 20% of the examples for validation; the rest is used for training
val_size = int(0.2 * len(train_data))
val_dataset = train_dataset.take(val_size)
train_dataset = train_dataset.skip(val_size)

# Batch both splits; only the training split is shuffled
batch_size = 128
autotune = tf.data.experimental.AUTOTUNE

train_dataset = (
    train_dataset
    .shuffle(10000)
    .batch(batch_size)
    .prefetch(autotune)
)
val_dataset = (
    val_dataset
    .batch(batch_size)
    .prefetch(autotune)
)



###########################################

from transformers import TFBertForSequenceClassification, BertConfig

# Load the 'bert-base-uncased' checkpoint configured with two output labels
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

# Training setup: Adam optimizer, sparse categorical cross-entropy computed
# on logits, and accuracy reported under the name 'accuracy'.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-8)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

# Train for a single epoch, evaluating on the validation split
history = model.fit(train_dataset, epochs=1, validation_data=val_dataset)



###########################################


# Feed the test encodings to the model in batches of 32
test_features = dict(test_encodings)
test_dataset = tf.data.Dataset.from_tensor_slices(test_features).batch(32)

# The predicted label is the index of the largest logit in each row
model_output = model.predict(test_dataset)
predictions = model_output.logits
predicted_labels = tf.argmax(predictions, axis=1).numpy()

# Write the ids and predicted targets to the submission CSV
submission = pd.DataFrame({'id': test_data['id'], 'target': predicted_labels})
submission.to_csv('submission.csv', index=False)


# predictions = model.predict(test_dataset).logits
# predictions = tf.nn.softmax(predictions, axis=1)
# predicted_labels = tf.argmax(predictions, axis=1).numpy()


# ###########################################


# submission = pd.DataFrame({'id': test_data['id'], 'target': predicted_labels})
# submission.to_csv('submission.csv', index=False)