# Binary Classification with Tensorflow Subclass Model

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, GRU, Dense
from sklearn.metrics import f1_score
import regex as re

In [3]:
np.random.seed(99)

In [4]:
sentences = [
    "The Battle of Waterloo was fought in 1815.",
    "Isaac Newton formulated the laws of motion.",
    "The Renaissance period marked a significant cultural revival.",
    "Einstein's theory of relativity revolutionized modern physics.",
    "The French Revolution began in 1789.",
    "Photosynthesis is the process by which plants convert light energy into chemical energy.",
    "The Industrial Revolution transformed society with technological advancements.",
    "Plate tectonics explain the movement of Earth's lithosphere.",
    "Ancient Egyptians built the pyramids as tombs for their pharaohs.",
    "DNA is the genetic material that carries hereditary information in living organisms.",
    "World War II ended in 1945 with the defeat of Nazi Germany.",
    "Charles Darwin proposed the theory of evolution by natural selection.",
    "The invention of the printing press by Gutenberg revolutionized communication.",
    "The Big Bang theory describes the origin of the universe.",
    "The Great Wall of China was built over several centuries.",
    "Atoms are the basic building blocks of matter.",
    "The Black Death pandemic swept through Europe in the 14th century.",
    "Albert Einstein's famous equation is E=mc^2.",
    "The Declaration of Independence was signed in 1776.",
    "The theory of continental drift was proposed by Alfred Wegener.",
    "The ancient Greeks made significant contributions to mathematics and philosophy.",
    "Newton's law of universal gravitation explains the attraction between objects.",
    "The Roman Empire fell in 476 AD.",
    "Chemical reactions involve the breaking and forming of chemical bonds.",
    "The assassination of Archduke Franz Ferdinand triggered World War I.",
    "Cells are the basic structural and functional units of all living organisms.",
    "The Silk Road facilitated trade between Europe and Asia.",
    "The discovery of penicillin by Alexander Fleming revolutionized medicine.",
    "The American Civil War took place from 1861 to 1865.",
    "The theory of special relativity explains the relationship between space and time.",
    "The Magna Carta established the principle of limited government.",
    "Gravity is the force that attracts objects toward each other.",
    "The Maya civilization flourished in Mesoamerica.",
    "The periodic table organizes elements based on their atomic number and properties.",
    "The Cold War was a period of geopolitical tension between the United States and the Soviet Union.",
    "Photosynthesis is vital for the production of oxygen on Earth.",
    "The invention of the telescope revolutionized astronomy.",
    "The Spanish Inquisition was established in the late 15th century.",
    "Genetics is the study of heredity and variation in living organisms.",
    "The Mongol Empire was the largest contiguous land empire in history.",
    "Newton's laws of motion describe the behavior of objects in motion.",
    "The Great Depression was a severe worldwide economic downturn in the 1930s.",
    "The human brain is composed of billions of neurons.",
    "The Trojan War is described in Homer's epic poem, the Iliad.",
    "The discovery of fire revolutionized early human societies.",
    "The theory of natural selection is a key mechanism of evolution.",
    "The Treaty of Versailles ended World War I.",
    "The theory of electromagnetism describes the relationship between electricity and magnetism.",
    "The Mayflower Compact established self-government in Plymouth Colony.",
    "The discovery of DNA's structure by Watson and Crick laid the foundation for molecular biology."
]

labels = ["history", "science", "history", "science", "history",
          "science", "history", "science", "history", "science",
          "history", "science", "history", "science", "history",
          "science", "history", "science", "history", "science",
          "history", "science", "history", "science", "history",
          "science", "history", "science", "history", "science",
          "history", "science", "history", "science", "history",
          "science", "history", "science", "history", "science",
          "history", "science", "history", "science", "history",
          "science", "history", "science", "history", "science"]


In [5]:
data_dict = {'Sentences': sentences, 'Subject': labels}

In [6]:
data = pd.DataFrame(data_dict)
data.head(), data.shape, data.dtypes

(                                           Sentences  Subject
 0         The Battle of Waterloo was fought in 1815.  history
 1        Isaac Newton formulated the laws of motion.  science
 2  The Renaissance period marked a significant cu...  history
 3  Einstein's theory of relativity revolutionized...  science
 4               The French Revolution began in 1789.  history,
 (50, 2),
 Sentences    object
 Subject      object
 dtype: object)

In [7]:
# Randomize Data

df = data.sample(frac=1, ignore_index=True)
df.head()

Unnamed: 0,Sentences,Subject
0,The Black Death pandemic swept through Europe ...,history
1,The Maya civilization flourished in Mesoamerica.,history
2,The Big Bang theory describes the origin of th...,science
3,Charles Darwin proposed the theory of evolutio...,science
4,The American Civil War took place from 1861 to...,history


In [8]:
# Split Data into training and testing - 80, 20,

# train_X = df.iloc[:40,0]
# train_y = df.iloc[:40,1]
# test_X = df.iloc[40:,0]
# test_y = df.iloc[40:,1]
# train_X.shape, train_y.shape, test_X.shape, test_y.shape

In [10]:
# Preprocess data - tokenizing

sentence_tokens = df['Sentences'].apply(lambda t: re.findall(r"[\w']+", t.lower()))
sentence_tokens.head()

0    [the, black, death, pandemic, swept, through, ...
1    [the, maya, civilization, flourished, in, meso...
2    [the, big, bang, theory, describes, the, origi...
3    [charles, darwin, proposed, the, theory, of, e...
4    [the, american, civil, war, took, place, from,...
Name: Sentences, dtype: object

In [11]:
# Determine the longest token in the sequence series
max_token_length = max(sentence_tokens.apply(lambda x: len(x)))
max_token_length

17

In [12]:
# Determine how many unique values there are in Categories column (we already know it is two, but this is to show just in case the count of unique values are unknown).

unique_categories = len(df['Subject'].unique())
unique_categories

2

In [13]:
# To create the vocab first extract all the unique words into a single list
unique_tokens = set()

for sentence in sentence_tokens:
    for word in sentence:

        unique_tokens.add(word)

In [14]:
# Length of vocab       
vocab = len(unique_tokens)
vocab

279

In [15]:
df['Subject'].unique()[1]

'science'

In [16]:
# Next, create the ids to tokens and tokens to ids (known as the vocab)

# Also, be sure to increment the id by n amount to account for special tokens.
# In this case, the only additional token is 0 which will indicate padding (<PAD>)

id_to_token = {k:v.lower() for k,v in enumerate(unique_tokens)}
# id_to_token[vocab+unique_categories] = "<PAD>"
# id_to_token[unique_categories-2] = df['Subject'].unique()[0]
# id_to_token[unique_categories-1] = df['Subject'].unique()[1]


token_to_id = {v.lower():k for k,v in enumerate(unique_tokens)}
# token_to_id["<PAD>"] = vocab+unique_categories
# token_to_id[df['Subject'].unique()[0]] = unique_categories-2
# token_to_id[df['Subject'].unique()[1]] = unique_categories-1


token_to_id

{'poem': 0,
 'ii': 1,
 'involve': 2,
 'behavior': 3,
 'organisms': 4,
 'electricity': 5,
 'evolution': 6,
 'universal': 7,
 'earth': 8,
 'inquisition': 9,
 'forming': 10,
 'self': 11,
 'equation': 12,
 'crick': 13,
 'information': 14,
 'mechanism': 15,
 'bonds': 16,
 'ended': 17,
 'flourished': 18,
 'dna': 19,
 'land': 20,
 'that': 21,
 'contiguous': 22,
 'plants': 23,
 'production': 24,
 'between': 25,
 'built': 26,
 'building': 27,
 'neurons': 28,
 'penicillin': 29,
 'special': 30,
 'cold': 31,
 'plymouth': 32,
 'trade': 33,
 'cells': 34,
 'relationship': 35,
 'industrial': 36,
 'structure': 37,
 '1930s': 38,
 'is': 39,
 'world': 40,
 'are': 41,
 'alfred': 42,
 'telescope': 43,
 'study': 44,
 'transformed': 45,
 'mathematics': 46,
 'gravitation': 47,
 'maya': 48,
 'e': 49,
 'ancient': 50,
 'principle': 51,
 'elements': 52,
 'based': 53,
 'tension': 54,
 'triggered': 55,
 'light': 56,
 'atoms': 57,
 'fire': 58,
 'colony': 59,
 '476': 60,
 'revival': 61,
 'energy': 62,
 'revolution': 6

In [17]:
# Convert each token in every sequence to token ids

all_sentence_ids = []

for sentence in sentence_tokens:
    per_sentence_ids = []
    
    for word in sentence:
        per_sentence_ids.append(token_to_id[word])
        
    if len(per_sentence_ids) < max_token_length:
        
        # Preprocess data - padding and/or truncation
        per_sentence_ids.extend(np.zeros(max_token_length - len(sentence), dtype=float))

    all_sentence_ids.append(per_sentence_ids)


In [18]:
# For the features create the input ids (where each index represents a token based on the vocab) into a tensor with the padding included

input_ids = np.array(all_sentence_ids, dtype=int)
input_ids

array([[267, 116, 214, 105, 206, 103, 200, 246, 267,  77,  95,   0,   0,
          0,   0,   0,   0],
       [267,  48, 248,  18, 246, 114,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [267,  97, 260, 222, 244, 267, 127, 119, 267,  83,   0,   0,   0,
          0,   0,   0,   0],
       [180, 221, 240, 267, 222, 119,   6,  68, 185, 234,   0,   0,   0,
          0,   0,   0,   0],
       [267, 166, 204, 274, 179, 264, 153, 228, 195, 207,   0,   0,   0,
          0,   0,   0,   0],
       [267, 235,  93,  71, 246,  60,  74,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [208,  39, 267,  44, 119,  90, 278, 161, 246, 197,   4,   0,   0,
          0,   0,   0,   0],
       [267, 254, 119, 267,  43, 124, 171,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [245, 273, 119,   7,  47, 201, 267, 223,  25, 165,   0,   0,   0,
          0,   0,   0,   0],
       [267, 187, 119,  58, 124, 139, 198, 175,   0,   0,   0,   0,   0,
         

In [19]:
# Determine the unique classes
unique_subject_names = df['Subject'].unique().tolist()
unique_subject_names

['history', 'science']

In [20]:
# Convert tokens to ids for each label
# label_ids = df['Subject'].apply(lambda e: 0 if e == 'history' else 1 )
# label_ids

all_label_ids = []
for subject in df['Subject']:
    all_label_ids.append(unique_subject_names.index(subject))

# label_ids = torch.Tensor(all_label_ids)   
label_ids = np.array(all_label_ids, dtype=int)   

In [21]:
label_ids

array([0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1])

In [22]:
# Split data into training and testing

train_X = input_ids[:40,:]
train_y = label_ids[:40]
test_X = input_ids[40:,:]
test_y = label_ids[40:]
train_X.shape, train_y.shape, test_X.shape, test_y.shape

((40, 17), (40,), (10, 17), (10,))

In [23]:
# Create model

class SubjectClassifier(tf.keras.Model):
    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int):
        super().__init__(self)

        self.embedding_layer = Embedding(vocab_size, embedding_dim)
        self.lstm_layer = LSTM(hidden_dim)
        self.dense_layer = Dense(1, activation="sigmoid")


    def __call__(self, inputs, training=None):
        layer_1 = self.embedding_layer(inputs)
        layer_2 = self.lstm_layer(layer_1)
        layer_3 = self.dense_layer(layer_2)
        return layer_3
    

model = SubjectClassifier(vocab, 100, 256)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [24]:
# Train model

model.fit(train_X, train_y, epochs=5, batch_size=4, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x299d7b070>

In [25]:
test_features = df.iloc[40:,0]
test_labels = df.iloc[40:,1]

test_features, test_labels


(40    Chemical reactions involve the breaking and fo...
 41    The discovery of DNA's structure by Watson and...
 42    Photosynthesis is the process by which plants ...
 43                 The French Revolution began in 1789.
 44    The Declaration of Independence was signed in ...
 45    The theory of electromagnetism describes the r...
 46    DNA is the genetic material that carries hered...
 47    Newton's laws of motion describe the behavior ...
 48    Photosynthesis is vital for the production of ...
 49          Isaac Newton formulated the laws of motion.
 Name: Sentences, dtype: object,
 40    science
 41    science
 42    science
 43    history
 44    history
 45    science
 46    science
 47    history
 48    science
 49    science
 Name: Subject, dtype: object)

In [26]:
# Make predictions

predictions = model.predict(test_X)
predictions



array([[9.73724544e-01],
       [6.08532727e-01],
       [9.63613451e-01],
       [1.11129855e-04],
       [6.50132424e-04],
       [9.84253705e-01],
       [1.70536339e-04],
       [9.83704388e-01],
       [1.40580887e-04],
       [9.82097447e-01]], dtype=float32)

In [27]:
# Compute the discrete label based on the logits
predictions_round = (predictions > .5).astype(int)
predictions_round

array([[1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1]])

In [28]:
for label, feature, pred in zip(test_labels, test_features, predictions_round):
    print('Actual Label: ', label, '  Prediction: ', unique_subject_names[pred[0]], '  Test Sequence: ', feature)
    

Actual Label:  science   Prediction:  science   Test Sequence:  Chemical reactions involve the breaking and forming of chemical bonds.
Actual Label:  science   Prediction:  science   Test Sequence:  The discovery of DNA's structure by Watson and Crick laid the foundation for molecular biology.
Actual Label:  science   Prediction:  science   Test Sequence:  Photosynthesis is the process by which plants convert light energy into chemical energy.
Actual Label:  history   Prediction:  history   Test Sequence:  The French Revolution began in 1789.
Actual Label:  history   Prediction:  history   Test Sequence:  The Declaration of Independence was signed in 1776.
Actual Label:  science   Prediction:  science   Test Sequence:  The theory of electromagnetism describes the relationship between electricity and magnetism.
Actual Label:  science   Prediction:  history   Test Sequence:  DNA is the genetic material that carries hereditary information in living organisms.
Actual Label:  history   Pred

In [30]:
# Compute F1 Score
print('F1 Score: ', f1_score(predictions_round, test_y))

F1 Score:  0.7692307692307692
