# Classification Preprocessing


In [None]:
# Data: (snippet, subject) pairs for the topic classifier.
# Each subject string is spelled once here and reused in every row below.
_MATH, _SCIENCE, _HISTORY = "Math", "Science", "History"

classification_data = [
    ("The integral of x squared is x cubed over three plus a constant.", _MATH),
    ("Photosynthesis is the process by which green plants use sunlight to synthesize foods.", _SCIENCE),
    ("The Magna Carta was signed in 1215.", _HISTORY),
    ("E=mc^2 describes the relationship between energy and mass.", _SCIENCE),
    ("Pythagoras' theorem states a^2 + b^2 = c^2.", _MATH),
    ("The French Revolution began in 1789.", _HISTORY),
    ("Gravity is the force that attracts a body toward the center of the earth.", _SCIENCE),
    ("Algebra involves solving for unknown variables.", _MATH),
    ("World War II ended in 1945.", _HISTORY),
    ("Calculus deals with rates of change and accumulation.", _MATH),
    ("DNA carries the genetic instructions for all living organisms.", _SCIENCE),
    ("The Roman Empire fell in 476 AD.", _HISTORY),
    ("Trigonometry studies the relationships between angles and sides of triangles.", _MATH),
    ("The theory of evolution was proposed by Charles Darwin.", _SCIENCE),
    ("The Renaissance was a period of great cultural and artistic change.", _HISTORY),
]

In [None]:
# Physics paragraph used as the training corpus for next-word generation.
# The sentences are joined with single spaces and wrapped in one leading and
# one trailing newline.
_generation_sentences = (
    "The study of physics is fundamental to understanding the natural world.",
    "It encompasses a vast range of phenomena, from the smallest subatomic particles to the largest structures in the universe.",
    "Classical mechanics, developed by Isaac Newton, provides a framework for describing the motion of macroscopic objects under the influence of forces.",
    "Thermodynamics deals with heat and its relation to other forms of energy and work.",
    "Electromagnetism, unified by James Clerk Maxwell, explains the interaction of electric and magnetic fields and their connection to light.",
    "Quantum mechanics, a revolutionary theory, describes the behavior of matter and energy at the atomic and subatomic levels.",
    "Relativity, proposed by Albert Einstein, revolutionized our understanding of space, time, gravity, and the universe as a whole.",
    "These fundamental theories of physics have not only deepened our intellectual curiosity but have also led to countless technological advancements that have shaped modern society.",
    "Continued research in physics continues to push the boundaries of our knowledge and promises even more exciting discoveries in the future.",
)
generation_text = "\n" + " ".join(_generation_sentences) + "\n"

In [None]:
#Importing Libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Importing Data: unzip the (text, label) pairs into two parallel lists.
classification_texts = []
classification_labels = []
for snippet_text, snippet_label in classification_data:
    classification_texts.append(snippet_text)
    classification_labels.append(snippet_label)

In [None]:
# Tokenization
# Keras only emits word indices strictly below `num_words`, so a cap of 100
# keeps indices 1..99 (index 0 is never assigned to a word). An explicit OOV
# token makes rare or unseen words map to a dedicated id instead of being
# silently dropped from the sequence, which matters when classifying new text.
max_vocab_classification = 100
tokenizer_classification = Tokenizer(
    num_words=max_vocab_classification,
    oov_token="<OOV>",
)
tokenizer_classification.fit_on_texts(classification_texts)
word_index_classification = tokenizer_classification.word_index
# Size of the id space the Embedding layer must cover: the ids that can really
# be produced (bounded by the cap), plus one for id 0.
vocab_size_classification = min(
    max_vocab_classification,
    len(word_index_classification) + 1,
)

In [None]:
# Sequencing: convert every snippet into its list of integer word ids.
sequences_classification = tokenizer_classification.texts_to_sequences(
    classification_texts
)

In [None]:
# Padding: bring every sequence to the length of the longest one.
longest_sequence_classification = max(sequences_classification, key=len)
max_length_classification = len(longest_sequence_classification)
padded_sequences_classification = pad_sequences(
    sequences_classification,
    maxlen=max_length_classification,
)


In [None]:
# Prepare labels for training (one-hot encode)
# Subject name -> integer class id, numbered in the order listed here.
label_to_index = {
    subject: class_id
    for class_id, subject in enumerate(("Math", "Science", "History"))
}
numerical_labels_classification = np.array(
    list(map(label_to_index.__getitem__, classification_labels))
)
one_hot_labels_classification = to_categorical(
    numerical_labels_classification,
    num_classes=len(label_to_index),
)

In [None]:
# Split data into training and testing sets.
# Stratify on the integer class ids: with so few rows an unstratified shuffle
# can leave a subject out of the test fold (or over-represent one), which makes
# the reported test accuracy meaningless.
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    padded_sequences_classification,
    one_hot_labels_classification,
    test_size=0.2,
    random_state=42,
    stratify=numerical_labels_classification,
)

In [None]:
# Summarise what the classification preprocessing produced, one row per line.
classification_report_rows = (
    ("Classification Data Preprocessing Done.",),
    ("Vocabulary Size (Classification):", vocab_size_classification),
    ("Shape of Training Data (Classification):", X_train_class.shape, y_train_class.shape),
    ("Shape of Testing Data (Classification):", X_test_class.shape, y_test_class.shape),
)
for report_row in classification_report_rows:
    print(*report_row)

Classification Data Preprocessing Done.
Vocabulary Size (Classification): 101
Shape of Training Data (Classification): (12, 14) (12, 3)
Shape of Testing Data (Classification): (3, 14) (3, 3)


# Generation Preprocessing

In [None]:
# Tokenization Generation: fit a tokenizer on the corpus as a single document.
generation_corpus = [generation_text]
tokenizer_generation = Tokenizer()
tokenizer_generation.fit_on_texts(generation_corpus)
word_index_generation = tokenizer_generation.word_index
# One more than the number of known words (presumably reserving id 0 for padding).
vocab_size_generation = 1 + len(word_index_generation)

In [None]:
# Create sequences of words and the next word as the target:
# for every '.'-delimited chunk of the corpus, keep each prefix of its token
# ids that holds at least two tokens (context + the word that follows it).
sequences_generation = [
    sentence_token_ids[:prefix_length]
    for sentence in generation_text.split('.')
    for sentence_token_ids in tokenizer_generation.texts_to_sequences([sentence])
    for prefix_length in range(2, len(sentence_token_ids) + 1)
]

In [None]:
# Pad sequences to the length of the longest n-gram.
max_length_generation = len(max(sequences_generation, key=len))
padded_sequences_generation = pad_sequences(
    sequences_generation,
    maxlen=max_length_generation,
)

In [None]:
# One-hot encode the target.
# The target of each n-gram is its final token id, i.e. the "next word".
next_word_ids_generation = [n_gram[-1] for n_gram in sequences_generation]
y_generation = to_categorical(
    next_word_ids_generation,
    num_classes=vocab_size_generation,
)

In [None]:
# Generation inputs: every padded n-gram WITHOUT its final token.
# `y_generation` is taken from the last token of each sequence, so that token
# must not also be part of the model input; if it is, the network simply learns
# to echo its last input word. Dropping the last column also yields the
# `max_length_generation - 1` width that the generation model and the
# inference-time padding in `generate_next_words` are configured for.
X_generation = padded_sequences_generation[:, :-1]

# Inputs and targets must stay aligned row-for-row.
assert X_generation.shape[0] == y_generation.shape[0]

print("\nGeneration Data Preprocessing Done.")
print("Vocabulary Size (Generation):", vocab_size_generation)
print("Shape of Input Sequences (Generation):", X_generation.shape)
print("Shape of Target (Generation):", y_generation.shape)


Generation Data Preprocessing Done.
Vocabulary Size (Generation): 115
Shape of Input Sequences (Generation): (157, 25)
Shape of Target (Generation): (157, 115)


# Build Classification Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# --- Classification Model ---
# Embedding -> SimpleRNN -> softmax over the subject labels.
embedding_dim_classification = 100
rnn_units_classification = 32
num_classes_classification = len(label_to_index)

# NOTE(review): newer Keras releases deprecate the `input_length` argument;
# confirm against the installed version before relying on it.
model_classification = Sequential()
model_classification.add(
    Embedding(
        vocab_size_classification,
        embedding_dim_classification,
        input_length=max_length_classification,
    )
)
model_classification.add(SimpleRNN(rnn_units_classification))
model_classification.add(Dense(num_classes_classification, activation='softmax'))

model_classification.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_classification.summary()



# Build Generation Model

In [None]:
# --- Generation Model ---
# Embedding -> SimpleRNN -> softmax over the whole generation vocabulary.
embedding_dim_generation = 100
rnn_units_generation = 32

# The model is declared for contexts of `max_length_generation - 1` tokens
# (one n-gram minus the word being predicted).
# NOTE(review): newer Keras releases deprecate the `input_length` argument;
# confirm against the installed version before relying on it.
model_generation = Sequential()
model_generation.add(
    Embedding(
        vocab_size_generation,
        embedding_dim_generation,
        input_length=max_length_generation - 1,
    )
)
model_generation.add(SimpleRNN(rnn_units_generation))
model_generation.add(Dense(vocab_size_generation, activation='softmax'))

model_generation.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_generation.summary()

# Train Classification Model

In [None]:
# Fit the classifier, holding out a validation_split fraction of the training
# rows for per-epoch validation metrics.
epochs_classification = 20
batch_size_classification = 8

history_classification = model_classification.fit(
    X_train_class,
    y_train_class,
    batch_size=batch_size_classification,
    epochs=epochs_classification,
    validation_split=0.1,
    verbose=1,
)

Epoch 1/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 415ms/step - accuracy: 0.2833 - loss: 1.1036 - val_accuracy: 0.5000 - val_loss: 1.1733
Epoch 2/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 0.6750 - loss: 0.9645 - val_accuracy: 0.0000e+00 - val_loss: 1.2163
Epoch 3/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step - accuracy: 0.8917 - loss: 0.8537 - val_accuracy: 0.0000e+00 - val_loss: 1.2525
Epoch 4/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.8917 - loss: 0.7661 - val_accuracy: 0.0000e+00 - val_loss: 1.2464
Epoch 5/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 1.0000 - loss: 0.6787 - val_accuracy: 0.0000e+00 - val_loss: 1.2443
Epoch 6/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - accuracy: 1.0000 - loss: 0.5969 - val_accuracy: 0.0000e+00 - val_loss: 1.2470
Epoch 7/20
[1m2/2[0m 

# Evaluate Classification Model

In [None]:
# Score the held-out split; the result unpacks into loss and accuracy.
test_metrics_classification = model_classification.evaluate(
    X_test_class, y_test_class, verbose=0
)
loss_classification, accuracy_classification = test_metrics_classification
print(f"\nClassification Test Accuracy: {accuracy_classification:.4f}")

# Demonstrate classification on a few unseen snippets
unseen_snippets = [
    "The derivative of sin(x) is cos(x).",
    "The sun is a star at the center of our solar system.",
    "Julius Caesar was assassinated in 44 BC.",
]

# Same tokenizer and padding width as the training data.
unseen_sequences = tokenizer_classification.texts_to_sequences(unseen_snippets)
padded_unseen_sequences = pad_sequences(
    unseen_sequences,
    maxlen=max_length_classification,
)
predictions = model_classification.predict(padded_unseen_sequences)
predicted_labels = predictions.argmax(axis=1)
# Invert the label mapping: class id -> subject name.
index_to_label = dict(zip(label_to_index.values(), label_to_index.keys()))

print("\nClassification on Unseen Snippets:")
for snippet, predicted_class_id in zip(unseen_snippets, predicted_labels):
    print(f"Snippet: {snippet} - Predicted: {index_to_label[predicted_class_id]}")


Classification Test Accuracy: 0.3333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 271ms/step

Classification on Unseen Snippets:
Snippet: The derivative of sin(x) is cos(x). - Predicted: History
Snippet: The sun is a star at the center of our solar system. - Predicted: History
Snippet: Julius Caesar was assassinated in 44 BC. - Predicted: History


# Train Generation Model

In [None]:
# Fit the next-word model on the full set of n-gram examples.
epochs_generation = 100  # Train for more epochs for generation
batch_size_generation = 32

history_generation = model_generation.fit(
    X_generation,
    y_generation,
    batch_size=batch_size_generation,
    epochs=epochs_generation,
    verbose=1,
)

Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.0345 - loss: 4.7433
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0725 - loss: 4.6454
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.1220 - loss: 4.5767
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0998 - loss: 4.5577
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.1043 - loss: 4.4755
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.1237 - loss: 4.4166
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.1407 - loss: 4.3467
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.2092 - loss: 4.2914
Epoch 9/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

# Demonstrate Next Word Generation

In [None]:
def generate_next_words(model, tokenizer, seed_text, num_words, max_sequence_len=None):
    """Greedily extend ``seed_text`` with up to ``num_words`` predicted words.

    Args:
        model: Trained model whose ``predict`` returns, for a batch of padded
            token-id sequences, one score per vocabulary index.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``index_word``.
        seed_text: Text to continue.
        num_words: Maximum number of words to append.
        max_sequence_len: Width the token ids are pre-padded/truncated to
            before prediction. When omitted, falls back to the notebook-level
            ``max_length_generation - 1``.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
        Generation stops early if the predicted index has no associated word.
    """
    if max_sequence_len is None:
        max_sequence_len = max_length_generation - 1

    for _ in range(num_words):
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        model_input = pad_sequences([token_ids], maxlen=max_sequence_len, padding='pre')
        predicted_probs = model.predict(model_input, verbose=0)[0]
        # Cast to a plain int so the dictionary lookup does not depend on NumPy scalar hashing.
        predicted_index = int(np.argmax(predicted_probs))
        # Direct reverse lookup instead of scanning word_index on every step.
        output_word = tokenizer.index_word.get(predicted_index)
        if not output_word:
            # No word for this index (e.g. the padding id): appending nothing
            # would leave the input unchanged and repeat the same prediction,
            # so stop instead of accumulating trailing spaces.
            break
        seed_text += " " + output_word
    return seed_text

# Continue a seed sentence by 20 words and show the prompt next to the result.
starting_sentence = "The fundamental principles of physics help us understand"
generated_text = generate_next_words(
    model=model_generation,
    tokenizer=tokenizer_generation,
    seed_text=starting_sentence,
    num_words=20,
)
print(
    "\nNext Word Generation:",
    f"Starting sentence: {starting_sentence}",
    f"Generated text: {generated_text}",
    sep="\n",
)


Next Word Generation:
Starting sentence: The fundamental principles of physics help us understand
Generated text: The fundamental principles of physics help us understand physics physics physics physics physics physics physics physics physics physics physics physics physics physics physics physics physics physics physics physics
