In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Walk the 'input' directory tree and print the path of every file found,
# so the available dataset files are visible before loading them.
for dirname, _, filenames in os.walk('input'):
    for filename in filenames:
        # Join the containing directory with the file name to get a usable path.
        print(os.path.join(dirname, filename))


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the train and test splits of the AG News dataset into DataFrames.
# Paths are relative to the notebook's working directory.
train = pd.read_csv('input/ag-news-classification-dataset/train.csv')
test = pd.read_csv('input/ag-news-classification-dataset/test.csv')

In [3]:
# Preview the first rows of the training frame to check the column layout.
train.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [4]:
# Count rows per class label to check how balanced the training set is.
train['Class Index'].value_counts()

Class Index
3    30000
4    30000
2    30000
1    30000
Name: count, dtype: int64

In [5]:
# (rows, columns) of the training frame.
train.shape

(120000, 3)

In [6]:
# Preview the first rows of the test frame; it should mirror the training layout.
test.head()

Unnamed: 0,Class Index,Title,Description
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


In [7]:
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Args:
        text: A string; it is split on runs of whitespace.

    Returns:
        The token count as an int (0 for an empty or all-whitespace string).
    """
    words = text.split()
    return len(words)

# Add a 'word_count' column holding the number of words in each description.
# Note: this adds the column to `train` in place.
train['word_count'] = train['Description'].apply(count_words)

In [8]:
# Average number of whitespace-separated words per description.
train['word_count'].mean()

31.060508333333335

In [9]:
def get_data(df):
    """Split a dataset frame into model inputs and labels.

    Args:
        df: DataFrame with 'Description' and 'Class Index' columns.

    Returns:
        A ``(x, y)`` tuple: the 'Description' column and the 'Class Index'
        column of ``df``, in that order.
    """
    return df['Description'], df['Class Index']

In [10]:
# Split each frame into its description text (X) and its class labels (y).
X_train,y_train = get_data(train)
X_test,y_test = get_data(test)

In [11]:
# Adjust labels from 1-4 to 0-3 so they can be used as zero-based class indices.
# NOTE: this cell is not idempotent — each run subtracts 1 again, so re-run the
# get_data cell first if this cell has to be executed a second time.
y_train = y_train - 1
y_test = y_test - 1

In [12]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the zero-based labels.
# The width is pinned to 4 (labels were shifted to 0-3 and the classifier ends
# in Dense(4)); without num_classes, to_categorical infers the width from the
# largest label present, so a split missing the highest class would silently
# produce a narrower matrix.
y_train = to_categorical(y_train, num_classes=4)
y_test = to_categorical(y_test, num_classes=4)

In [13]:
# Inspect the one-hot encoded test labels.
y_test

array([[0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])

In [14]:
from transformers import AutoTokenizer,TFBertModel

In [15]:
# Load the pretrained 'bert-base-uncased' tokenizer used to encode text in this notebook.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): this `model` is never called before the name is rebound to the
# Keras classifier in the model-building cell (BertLayer loads its own
# TFBertModel), so this load looks redundant — confirm and consider removing it.
model = TFBertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [16]:
# Fixed token length of every encoded sample; the Keras Input layers of the
# classifier are declared with this same width.
max_len = 32


def _encode_texts(texts):
    """Tokenize a list of strings into fixed-width BERT inputs.

    Args:
        texts: list of raw text strings.

    Returns:
        The tokenizer output holding 'input_ids' and 'attention_mask'
        TensorFlow tensors, each padded/truncated to ``max_len`` tokens.
    """
    return tokenizer(
        text=texts,
        max_length=max_len,
        return_tensors='tf',
        add_special_tokens=True,
        truncation=True,
        # Pad to exactly max_len rather than to the longest sample in the call:
        # the model input shape is (max_len,), and predict() pads the same way,
        # so every split must come out with an identical width.
        padding='max_length',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True,
    )


# NOTE: X_train / X_test are rebound here from raw text to tokenizer output;
# re-run the get_data cell before executing this cell a second time.
X_train = _encode_texts(X_train.tolist())
X_test = _encode_texts(X_test.tolist())

In [17]:
# Pull the encoded tensors out of the tokenizer output for the training split.
# NOTE(review): `input_ids` is rebound to a Keras Input in the model-building
# cell and model.fit reads X_train[...] directly, so these two names appear to
# be unused — confirm and consider removing this cell.
input_ids = X_train['input_ids']
attention_mask = X_train['attention_mask']

## Model Building

In [18]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense     

In [19]:
import tensorflow as tf
from tensorflow.keras import Model, Input, Layer
from transformers import TFBertModel
from tensorflow.keras.layers import Dense, Dropout, GlobalMaxPooling1D

# Custom Keras layer that wraps the pretrained BERT encoder
class BertLayer(Layer):
    """Keras layer that runs inputs through a pretrained 'bert-base-uncased' model.

    The wrapped ``TFBertModel`` is loaded once when the layer is constructed.
    """

    def __init__(self, **kwargs):
        # Forward any standard layer keyword arguments to the base class.
        super().__init__(**kwargs)
        self.bert_model = TFBertModel.from_pretrained("bert-base-uncased")

    def call(self, inputs):
        """Run BERT on ``inputs`` and return the first element of its output.

        Args:
            inputs: A two-element sequence ``(input_ids, attention_mask)``.

        Returns:
            Element 0 of the BERT model output — presumably the per-token
            hidden states, since it is max-pooled over one axis downstream;
            confirm against the transformers documentation.
        """
        token_ids, mask = inputs
        bert_outputs = self.bert_model(token_ids, attention_mask=mask)
        return bert_outputs[0]


# Input layers: one fixed-width integer tensor each for token ids and the
# attention mask. The names match the dict keys passed to fit()/predict().
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

# BERT layer. Keep a handle on the instance so it can be configured directly
# below instead of being looked up by its position in model.layers.
bert_layer = BertLayer()
embeddings = bert_layer([input_ids, input_mask])

# Classification head: max-pool the BERT output, then two dense layers with a
# little dropout, ending in a 4-way softmax (one unit per news category).
out = GlobalMaxPooling1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)
y = Dense(4, activation='softmax')(out)

# Create the model
model = Model(inputs=[input_ids, input_mask], outputs=y)

# Optionally set the BERT layer to be trainable or not.
# Referencing the layer object avoids the brittle `model.layers[2]` index,
# which would silently point at a different layer if the graph changed.
bert_layer.trainable = True  # Set to False if you want to freeze BERT

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [20]:
from keras.optimizers import Adam
from keras.optimizers.schedules import ExponentialDecay
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy

# Learning-rate schedule: start at 5e-5 and multiply by `decay_rate` once every
# `decay_steps` optimizer steps (staircase=True makes the decay stepwise).
# NOTE(review): the training log shows 3750 steps per epoch for 2 epochs
# (7500 steps), which is below decay_steps=10000, so the decay never triggers
# in this run — confirm that is intended.
initial_learning_rate = 5e-05
lr_schedule = ExponentialDecay(
    initial_learning_rate,
    decay_steps=10000,  # Adjust this value according to your needs
    decay_rate=0.01,    # Adjust this value according to your needs
    staircase=True)

# Adam with gradient-norm clipping.
optimizer = Adam(
    learning_rate=lr_schedule,
    epsilon=1e-08,
    clipnorm=1.0
)

# The model's final Dense layer already applies softmax, so its outputs are
# probabilities, not logits. from_logits must therefore be False; passing True
# tells the loss to apply softmax a second time.
loss = CategoricalCrossentropy(from_logits=False)

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy']
)

In [21]:
# Inspect the one-hot encoded training labels.
y_train

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]])

In [22]:
# Train the classifier. Inputs are passed as dicts keyed by the model's input
# layer names; the test split doubles as the validation set for per-epoch
# monitoring. The returned history object is kept in `r`.
r = model.fit(
    x={key: X_train[key] for key in ('input_ids', 'attention_mask')},
    y=y_train,
    validation_data=(
        {key: X_test[key] for key in ('input_ids', 'attention_mask')},
        y_test,
    ),
    batch_size=32,
    epochs=2,
)

Epoch 1/2


  output, from_logits = _get_logits(
I0000 00:00:1728142852.667221      67 service.cc:145] XLA service 0x7f7e88002290 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1728142852.667268      67 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
W0000 00:00:1728142853.057945      67 assert_op.cc:38] Ignoring Assert operator functional_1_1/bert_layer_1/tf_bert_model_1/bert/embeddings/assert_less/Assert/Assert


[1m   5/3750[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:01[0m 32ms/step - accuracy: 0.2963 - loss: 1.4622

I0000 00:00:1728142857.520787      67 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m3749/3750[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 31ms/step - accuracy: 0.7058 - loss: 0.7853

W0000 00:00:1728142976.227342      67 assert_op.cc:38] Ignoring Assert operator functional_1_1/bert_layer_1/tf_bert_model_1/bert/embeddings/assert_less/Assert/Assert
W0000 00:00:1728142985.184934      68 assert_op.cc:38] Ignoring Assert operator functional_1_1/bert_layer_1/tf_bert_model_1/bert/embeddings/assert_less/Assert/Assert


[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 34ms/step - accuracy: 0.7058 - loss: 0.7852 - val_accuracy: 0.8654 - val_loss: 0.3915
Epoch 2/2
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 33ms/step - accuracy: 0.8635 - loss: 0.3896 - val_accuracy: 0.8767 - val_loss: 0.3599


In [23]:
# Map each zero-based class index to its human-readable category name.
encoded_dict = dict(enumerate(('World', 'Sports', 'Business', 'Sci/Tech')))

In [24]:
def predict(text):
    """Classify a piece of news text with the trained model.

    Args:
        text: The news text to classify.

    Returns:
        A ``(predicted_label, predicted_score)`` tuple: the category name
        looked up in ``encoded_dict`` and the model's output for that class
        multiplied by 100.
    """
    # Encode with the same fixed width the model was built and trained with
    # (max_len) instead of repeating the literal, so the two cannot drift apart.
    encoded = tokenizer(
        text=text,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True
    )

    # Model output scaled by 100 so it reads as a percentage.
    percentages = model.predict({
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask']
    }) * 100

    # Only the first (and, for a single text, only) row is used.
    scores = percentages[0]

    # encoded_dict is keyed by the zero-based class index, so the argmax is
    # used as-is with no offset.
    predicted_index = int(scores.argmax())
    predicted_label = encoded_dict[predicted_index]
    predicted_score = scores[predicted_index]

    # Return only the predicted label and its score
    return predicted_label, predicted_score


In [25]:
# Sample headline-style sentence to classify (economy/business themed).
news_line = "The global economy is recovering as countries start easing restrictions, boosting business confidence and encouraging travel."

# Run the classifier on the sample text.
predicted_label, predicted_score = predict(news_line)

# Show the predicted category and its score as a percentage.
print(f"Predicted Label: {predicted_label} with score: {predicted_score:.2f}%")

W0000 00:00:1728143114.320419      66 assert_op.cc:38] Ignoring Assert operator functional_1_1/bert_layer_1/tf_bert_model_1/bert/embeddings/assert_less/Assert/Assert


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
Predicted Label: Business with score: 87.03%


In [26]:
# Sample sentence to classify (science themed).
news_line = "Scientists have discovered a new method for converting carbon dioxide into useful fuels, which could significantly reduce greenhouse gas emissions and combat climate change."

# Run the classifier on the sample text.
predicted_label, predicted_score = predict(news_line)

# Show the predicted category and its score as a percentage.
print(f"Predicted Label: {predicted_label} with score: {predicted_score:.2f}%")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Predicted Label: Sci/Tech with score: 93.22%


In [None]:
# Save the trained model to 'ag_news_model.keras'; this is the file the
# reload cells further down read back.
# NOTE(review): the model contains the custom BertLayer — confirm it
# round-trips through save/load in the installed Keras version.
model.save('ag_news_model.keras')

In [None]:
# Save a second copy of the model as 'ag_news_model.h5'.
# NOTE(review): nothing in this notebook reads the .h5 file back — confirm it
# is still needed alongside the .keras copy.
model.save('ag_news_model.h5')

In [None]:
# Load the saved model. BertLayer is a custom layer defined in this notebook,
# so it is passed via custom_objects for Keras to deserialize it.
loaded_model = tf.keras.models.load_model(
    'ag_news_model.keras',
    custom_objects={'BertLayer': BertLayer},
)

# Example function to clean new text data
def clean_new_text(text):
    """Normalize whitespace in ``text`` and return the cleaned string.

    The training descriptions were passed to the BERT tokenizer as raw text,
    so no stop-word removal or stemming is applied here; doing so would feed
    the model inputs unlike anything it was trained on.
    """
    return ' '.join(text.split())

# Function to prepare new text data for prediction
def prepare_new_text(text):
    """Encode ``text`` into the inputs the trained model expects.

    Returns:
        A dict with 'input_ids' and 'attention_mask' tensors, padded or
        truncated to ``max_len`` tokens, keyed by the model's input names.
    """
    encoded = tokenizer(
        text=text,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
    )
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
    }

# Example usage: Predicting a new text
new_text = "This is a sample news article about technology."
cleaned_text = clean_new_text(new_text)
prepared_text = prepare_new_text(cleaned_text)

# Make prediction
prediction = loaded_model.predict(prepared_text)

# Convert prediction to class labels (list position == zero-based class index)
class_labels = ['World', 'Sports', 'Business', 'Sci/Tech']
predicted_class_index = np.argmax(prediction[0])
predicted_class = class_labels[predicted_class_index]

print(f"Predicted Class: {predicted_class}")

In [None]:
# Load the saved model. BertLayer is a custom layer defined in this notebook,
# so it is passed via custom_objects for Keras to deserialize it.
loaded_model = tf.keras.models.load_model(
    'ag_news_model.keras',
    custom_objects={'BertLayer': BertLayer},
)

# Example function to clean new text data
def clean_new_text(text):
    """Normalize whitespace in ``text`` and return the cleaned string.

    The training descriptions were passed to the BERT tokenizer as raw text,
    so no stop-word removal or stemming is applied here; doing so would feed
    the model inputs unlike anything it was trained on.
    """
    return ' '.join(text.split())

# Function to prepare new text data for prediction
def prepare_new_text(text):
    """Encode ``text`` into the inputs the trained model expects.

    Returns:
        A dict with 'input_ids' and 'attention_mask' tensors, padded or
        truncated to ``max_len`` tokens, keyed by the model's input names.
    """
    encoded = tokenizer(
        text=text,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
    )
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
    }

# Example usage: Predicting a short, non-news text
new_text = "Explain me about it"
cleaned_text = clean_new_text(new_text)
prepared_text = prepare_new_text(cleaned_text)

# Make prediction
prediction = loaded_model.predict(prepared_text)

# Convert prediction to class labels (list position == zero-based class index)
class_labels = ['World', 'Sports', 'Business', 'Sci/Tech']
predicted_class_index = np.argmax(prediction[0])
predicted_class = class_labels[predicted_class_index]

print(f"Predicted Class: {predicted_class}")