In [None]:
pip install transformers pandas numpy scikit-learn


In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources used below. Recent NLTK releases load the
# 'punkt_tab' tables for word_tokenize while older ones use 'punkt', so both
# are requested.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the dataset. The files are read without a header row, so columns are
# addressed by position: column 0 is used as the label, column 1 as the text.
train_df = pd.read_csv('ag_news_train.csv', header=None)
test_df = pd.read_csv('ag_news_test.csv', header=None)

stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Lower-case ``text`` and drop English stop words and punctuation tokens.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens, lower-cased and joined by single spaces.
    """
    kept = []
    for word in word_tokenize(text):
        lowered = word.lower()
        # Same filter as before: stop words are matched case-insensitively,
        # punctuation is matched against the token as tokenized.
        if lowered not in stop_words and word not in string.punctuation:
            kept.append(lowered)
    return ' '.join(kept)


# Preprocess the text with one shared helper so the train and test splits
# cannot drift apart.
train_df[1] = train_df[1].apply(preprocess_text)
test_df[1] = test_df[1].apply(preprocess_text)

# Shift the labels down by one (assumes the file's labels start at 1, giving
# 0-based class indices -- TODO confirm against the CSV).
train_df[0] = train_df[0] - 1
test_df[0] = test_df[0] - 1

# Convert the data to numpy arrays
train_data = train_df[1].values
train_labels = train_df[0].values
test_data = test_df[1].values
test_labels = test_df[0].values


In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Training settings, named once instead of being repeated as magic numbers.
NUM_LABELS = int(train_labels.max()) + 1  # labels were shifted to start at 0
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 5e-5
SHUFFLE_BUFFER = 1000

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data (truncate over-long inputs, pad each split to a common length)
train_encodings = tokenizer(train_data.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_data.tolist(), truncation=True, padding=True)

# Convert the data to tensorflow datasets of (features, label) pairs
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
))

# Load the pre-trained BERT model. The classification head must have one
# output per class; without num_labels the library builds a 2-class head,
# which does not cover every label present in the data.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=NUM_LABELS,
)

# Fine-tune the model. The model emits raw logits and the labels are integer
# class indices, hence sparse categorical cross-entropy with from_logits=True.
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss)

# Batching is done on the dataset itself; Keras rejects a separate batch_size
# argument when the input is a tf.data.Dataset.
model.fit(train_dataset.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE), epochs=EPOCHS)


In [None]:
# Evaluate the model
# The metric helpers are not imported by the cells above, so bring them into
# scope here; without this the cell fails with a NameError.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Predicted class = index of the largest logit for each test example.
y_pred = np.argmax(model.predict(test_dataset.batch(16)).logits, axis=1)
accuracy = accuracy_score(test_labels, y_pred)
# 'weighted' averages the per-class scores weighted by class support.
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, y_pred, average='weighted')

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)


In [None]:
# Make predictions
test_text = ["Scientists have discovered a new planet that could support life.", 
             "The stock market is booming as companies report record profits.", 
             "The World Cup soccer tournament will be held in Qatar next year."]

# Apply the same cleaning that the training text received (lower-casing, stop
# word and punctuation removal) so the model sees inputs in the form it was
# fine-tuned on.
sample_clean = [
    ' '.join(
        word.lower()
        for word in word_tokenize(text)
        if word.lower() not in stop_words and word not in string.punctuation
    )
    for text in test_text
]

# Use names of their own for the sample tensors and predictions: reusing
# test_encodings / test_dataset / y_pred would overwrite the real test split
# and break a re-run of the evaluation cell.
sample_encodings = tokenizer(sample_clean, truncation=True, padding=True)
sample_dataset = tf.data.Dataset.from_tensor_slices(dict(sample_encodings))
sample_pred = np.argmax(model.predict(sample_dataset.batch(1)).logits, axis=1)

# Print the predictions (label is the 0-based class index)
for text, label in zip(test_text, sample_pred):
    print(f'{text} => {label}')
