In [6]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

# Optional (disabled): turn on memory growth for the first GPU, if one is present.
# physical_devices = tf.config.list_physical_devices('GPU')
# if len(physical_devices) > 0:
#     tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Load the AG News subset; the returned object is indexed by split name ('train' / 'test') below.
dataset = tfds.load('ag_news_subset')

Downloading and preparing dataset 11.24 MiB (download: 11.24 MiB, generated: 35.79 MiB, total: 47.03 MiB) to /root/tensorflow_datasets/ag_news_subset/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/120000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/ag_news_subset/1.0.0.incompleteI6VJXU/ag_news_subset-train.tfrecord*...:  …

Generating test examples...:   0%|          | 0/7600 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/ag_news_subset/1.0.0.incompleteI6VJXU/ag_news_subset-test.tfrecord*...:   …

Dataset ag_news_subset downloaded and prepared to /root/tensorflow_datasets/ag_news_subset/1.0.0. Subsequent calls will reuse this data.


In [7]:
# Pull the two splits out of the loaded dataset.
ds_train = dataset['train']
ds_test = dataset['test']

# Report the size of each split; the second line describes the *test* split
# (it was previously mislabelled as "train").
print(f"Length of train dataset = {len(ds_train)}")
print(f"Length of test dataset = {len(ds_test)}")

Length of train dataset = 120000
Length of train dataset = 7600


In [8]:
# Human-readable class names, indexed by the integer label stored in each record.
classes = ['World', 'Sports', 'Business', 'Sci/Tech']

# Preview the first five training records as "<label> (<class name>) -> <title> <description>".
for record in ds_train.take(5):
    label = record['label']
    print(f"{label} ({classes[label]}) -> {record['title']} {record['description']}")

3 (Sci/Tech) -> b'AMD Debuts Dual-Core Opteron Processor' b'AMD #39;s new dual-core Opteron chip is designed mainly for corporate computing applications, including databases, Web services, and financial transactions.'
1 (Sports) -> b"Wood's Suspension Upheld (Reuters)" b'Reuters - Major League Baseball\\Monday announced a decision on the appeal filed by Chicago Cubs\\pitcher Kerry Wood regarding a suspension stemming from an\\incident earlier this season.'
2 (Business) -> b'Bush reform may have blue states seeing red' b'President Bush #39;s  quot;revenue-neutral quot; tax reform needs losers to balance its winners, and people claiming the federal deduction for state and local taxes may be in administration planners #39; sights, news reports say.'
3 (Sci/Tech) -> b"'Halt science decline in schools'" b'Britain will run out of leading scientists unless science education is improved, says Professor Colin Pillinger.'
1 (Sports) -> b'Gerrard leaves practice' b'London, England (Sports Network

## Limiting vocabulary size

In [11]:
# Upper bound on the number of distinct tokens the vectorizer may keep.
vocab_size = 50000
# Use the stable `keras.layers.TextVectorization` path rather than the deprecated
# `keras.layers.experimental.preprocessing` alias.
vectorizer = keras.layers.TextVectorization(max_tokens=vocab_size)
# Build the vocabulary from the first 500 training records only; each record's
# title and description are joined with a space before tokenization.
vectorizer.adapt(ds_train.take(500).map(lambda x: x['title'] + ' ' + x['description']))

In [12]:
# Read back the vocabulary learned by the vectorizer and rebind `vocab_size`
# to the number of tokens it actually contains.
vocab = vectorizer.get_vocabulary()
vocab_size = len(vocab)

first_tokens = vocab[:10]
print(first_tokens)
print(f"Length of vocabulary: {vocab_size}")

['', '[UNK]', 'the', 'to', 'a', 'in', 'of', 'and', 'on', 'for']
Length of vocabulary: 5335


In [13]:
# Encode a sample sentence as a sequence of integer token ids.
sample_sentence = 'I love to play with my words'
vectorizer(sample_sentence)

<tf.Tensor: shape=(7,), dtype=int64, numpy=array([ 112, 3695,    3,  304,   11, 1041,    1])>

# Bag-of-Words text representation

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus from which the vocabulary is learned.
corpus = [
    'I Like hot dogs.',
    'The dog ran fast.',
    'Its hot outside.',
]

# Fit the vocabulary on the corpus (`fit` returns the vectorizer itself), then
# count occurrences of those vocabulary words in a sentence that was not in the corpus.
sc_vectorizer = CountVectorizer().fit(corpus)
sc_vectorizer.transform(['My dog likes hot dogs on a hot day.']).toarray()

array([[1, 1, 0, 2, 0, 0, 0, 0, 0]])

In [16]:
def to_bow(text):
  """Turn `text` into a bag-of-words count vector of length `vocab_size`.

  The text is tokenized with the module-level `vectorizer`, each token id is
  one-hot encoded, and the one-hot rows are summed along axis 0.
  """
  token_ids = vectorizer(text)
  one_hot_tokens = tf.one_hot(token_ids, vocab_size)
  return tf.reduce_sum(one_hot_tokens, axis=0)

to_bow('My dog likes hot dogs on a hot day.').numpy()

array([0., 5., 0., ..., 0., 0., 0.], dtype=float32)

## Training the BoW classifier

In [17]:
batch_size = 128

def _bow_example(x):
  """Map a dataset record to a `(bag-of-words vector, label)` pair."""
  # Join title and description with a space (as the vectorizer's adapt step does)
  # so the last title word and the first description word stay separate tokens.
  return to_bow(x['title'] + ' ' + x['description']), x['label']

ds_train_bow = ds_train.map(_bow_example).batch(batch_size)
# The validation set must come from the held-out test split, not the training split.
ds_test_bow = ds_test.map(_bow_example).batch(batch_size)


In [18]:
# Linear classifier: a single softmax layer over the bag-of-words vector,
# with one output unit per entry in `classes`.
bow_output_layer = keras.layers.Dense(4, activation='softmax', input_shape=(vocab_size,))
model = keras.models.Sequential([bow_output_layer])
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)
model.fit(ds_train_bow, validation_data=ds_test_bow)



<keras.src.callbacks.History at 0x7cdf09d03400>

# Training a classifier as one network

In [20]:
def extract_text(x):
  """Return the record's title and description joined by a single space."""
  title, description = x['title'], x['description']
  return title + ' ' + description

def tupelize1(x):
    """Return a `(text, label)` pair for record `x`, with the text built by `extract_text`."""
    text = extract_text(x)
    label = x['label']
    return text, label

# Functional model that accepts raw strings and vectorizes them inside the network.
inp = keras.Input(shape=(1,), dtype=tf.string)
token_ids = vectorizer(inp)
# One-hot encode every token id, then sum over axis 1 to get one count vector per example.
bow = tf.reduce_sum(tf.one_hot(token_ids, vocab_size), axis=1)
out = keras.layers.Dense(4, activation='softmax')(bow)
model = keras.models.Model(inputs=inp, outputs=out)
model.summary()

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)
train_batches = ds_train.map(tupelize1).batch(batch_size)
test_batches = ds_test.map(tupelize1).batch(batch_size)
model.fit(train_batches, validation_data=test_batches)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_2 (Text  (None, None)              0         
 Vectorization)                                                  
                                                                 
 tf.one_hot_1 (TFOpLambda)   (None, None, 5335)        0         
                                                                 
 tf.math.reduce_sum_1 (TFOp  (None, 5335)              0         
 Lambda)                                                         
                                                                 
 dense_2 (Dense)             (None, 4)                 21344     
                                                                 
Total params: 21344 (83.38 KB)
Trainable params: 21344 (83.

<keras.src.callbacks.History at 0x7cdf08fd6950>

## Bigrams, trigrams and n-grams

In [21]:
# Toy corpus for the n-gram demo (note: this rebinds `corpus`).
corpus = [
    'I like hot dog.',
    'The dog ran fast.',
    'Its hot outside.',
]

# Count both single words and word pairs (ngram_range=(1, 2)); the token pattern
# also keeps one-character words such as "i".
bigram_vectorizer = CountVectorizer(
    ngram_range=(1,2),
    token_pattern=r'\b\w+\b',
    min_df=1,
).fit(corpus)
print("Vocabulary:\n", bigram_vectorizer.vocabulary_)
bigram_vectorizer.transform(['My dog likes hot dogs on a hot day']).toarray()

Vocabulary:
 {'i': 6, 'like': 10, 'hot': 3, 'dog': 0, 'i like': 7, 'like hot': 11, 'hot dog': 4, 'the': 15, 'ran': 13, 'fast': 2, 'the dog': 16, 'dog ran': 1, 'ran fast': 14, 'its': 8, 'outside': 12, 'its hot': 9, 'hot outside': 5}


array([[1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

## Automatically calculating BoW vectors

In [24]:
# Model whose first layer turns raw text directly into per-token counts.
# Uses the stable `keras.layers.TextVectorization` path rather than the deprecated
# `keras.layers.experimental.preprocessing` alias.
model = keras.models.Sequential([
    keras.layers.TextVectorization(max_tokens=vocab_size, output_mode='count'),
    keras.layers.Dense(4, input_shape=(vocab_size,), activation='softmax'),
])
print("Training vectorizer")
# Build the layer's vocabulary from the first 500 training records only.
model.layers[0].adapt(ds_train.take(500).map(extract_text))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(
    ds_train.map(tupelize1).batch(batch_size),
    validation_data=ds_test.map(tupelize1).batch(batch_size),
)

Training vectorizer


<keras.src.callbacks.History at 0x7cdf08cc64d0>

## Term frequency - inverse document frequency (TF-IDF)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Use a dedicated name: rebinding `vectorizer` here would replace the Keras
# TextVectorization layer that `to_bow` looks up at call time.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Learn vocabulary and IDF weights from the toy corpus, then weight an unseen sentence.
tfidf_vectorizer.fit(corpus)
tfidf_vectorizer.transform(['My dog likes hot dogs on a hot day.']).toarray()

array([[0.4472136 , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

In [26]:
# Model whose first layer turns raw text directly into TF-IDF weighted vectors.
# Uses the stable `keras.layers.TextVectorization` path and the documented
# 'tf_idf' mode name instead of the legacy 'tf-idf' spelling.
model = keras.models.Sequential([
    keras.layers.TextVectorization(max_tokens=vocab_size, output_mode='tf_idf'),
    keras.layers.Dense(4, input_shape=(vocab_size,), activation='softmax'),
])
print("Training vectorizer")
# Build the layer's vocabulary (and its IDF weights) from the first 500 training records only.
model.layers[0].adapt(ds_train.take(500).map(extract_text))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(
    ds_train.map(tupelize1).batch(batch_size),
    validation_data=ds_test.map(tupelize1).batch(batch_size),
)

Training vectorizer


<keras.src.callbacks.History at 0x7cdf088ab1c0>