In [3]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import csv

import nltk
# Uncomment this and run it once
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, GlobalAveragePooling1D, BatchNormalization
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from tensorflow.keras.utils import to_categorical

In [27]:
# Let DataFrame previews show up to 100 characters per column before truncating
pd.set_option('max_colwidth', 100)

# RNN LSTM Model For Class Prediction

In [None]:
# Initialize the WordNet lemmatizer and the set of English stop words used by dataCleaning
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

## Cleaning Data

In [9]:
# Clean raw text with regular expressions, then tokenize, drop stop words and lemmatize
def dataCleaning(text):
    """Normalize one raw text sample for the subject classifier.

    Steps, in order: lowercase; replace literal backslash-n/t/r escape
    sequences with a space; strip URLs; spell out ``&`` as ``and``; drop
    ``(ref: N)`` markers; remove every character that is not a letter, digit,
    whitespace, one of ``+ - * / = < > % ^ ( )`` or ``π``; tokenize with
    ``word_tokenize``; discard English stop words; lemmatize what is left.

    Args:
        text: Raw text. Anything that is not a ``str`` (for example a NaN from
            an empty CSV field) is treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` for non-string
        input.
    """
    # Mirror clean_text: missing values must not crash the .apply() over the column.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'\\[ntr]', ' ', text)
    # Match both http:// and https:// links (the previous pattern only caught "https...").
    text = re.sub(r'https?://\S*|www\.\S+', '', text)
    text = text.replace('&', 'and')
    text = re.sub(r'\(ref:\s*\d+\)', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s\+\-\*/=<>\%\^\(\)π]', '', text)

    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(tokens)

In [7]:
# Load the education text dataset (columns shown below: text, subject) from CSV
dataset = pd.read_csv('education_text_dataset_10000.csv')

In [8]:
# Preview the first rows of the raw dataset
dataset.head()

Unnamed: 0,text,subject
0,"Discuss how historical events influence literary themes, using war poetry as an example. ? Inclu...",English
1,The human heart has four chambers and pumps blood throughout the body. ; How do we use this in r...,Science
2,"Atoms bond to form molecules by sharing or transferring electrons. , Why is this important? (Ref...",Science
3,Explain the causes and effects of World War I. ? Why is this important? (Ref: 81),History
4,Atoms are the basic building blocks of matter. : Include key facts. (Ref: 32),Science


In [9]:
# Encode the subject names as integer class ids, then build the cleaned text column
label_enco = LabelEncoder()
encoded_subjects = label_enco.fit_transform(dataset['subject'])
dataset['label'] = encoded_subjects

# Run every raw sample through the regex / stop-word / lemmatization pipeline
dataset['clean_text'] = dataset['text'].apply(dataCleaning)

In [10]:
# Preview the first 10 rows with the new label and clean_text columns
dataset.head(10)

Unnamed: 0,text,subject,label,clean_text
0,"Discuss how historical events influence literary themes, using war poetry as an example. ? Inclu...",English,0,discus historical event influence literary theme using war poetry example include key fact
1,The human heart has four chambers and pumps blood throughout the body. ; How do we use this in r...,Science,3,human heart four chamber pump blood throughout body use real life
2,"Atoms bond to form molecules by sharing or transferring electrons. , Why is this important? (Ref...",Science,3,atom bond form molecule sharing transferring electron important
3,Explain the causes and effects of World War I. ? Why is this important? (Ref: 81),History,1,explain cause effect world war important
4,Atoms are the basic building blocks of matter. : Include key facts. (Ref: 32),Science,3,atom basic building block matter include key fact
5,Shakespeare’s history plays often reflect the political issues of his time. ; Why is this import...,English,0,shakespeare history play often reflect political issue time important
6,"The area of a circle is given by πr^2. Calculate the area when r = 5. , Explain briefly. (Ref: 56)",Maths,2,area circle given πr^2 calculate area r = 5 explain briefly
7,"Discuss how historical events influence literary themes, using war poetry as an example. : Expla...",English,0,discus historical event influence literary theme using war poetry example explain briefly
8,Solve for x: 2x + 3 = 11. . Refer to the chart above. (Ref: 68),Maths,2,solve x 2x + 3 = 11 refer chart
9,The Industrial Revolution began in the late 18th century and transformed societies. ? How do we ...,History,1,industrial revolution began late 18th century transformed society use real life


In [11]:
# Pull the cleaned text and the encoded labels out of the frame for splitting
text = dataset['clean_text'].values
labels = dataset['label'].values

## Splitting data into Train (70%), Validation (15%) and Test (15%)

In [13]:
# Two-stage stratified split: first hold out 30% of the data, then halve that hold-out
# into validation and test, giving 70% train / 15% validation / 15% test.
X_train, X_temp, y_train, y_temp = train_test_split(text, labels, test_size=0.3, stratify=labels, shuffle=True, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, shuffle=True, random_state=42)

In [14]:
# Sanity-check the sizes of the train, validation and test splits
print(len(X_train), len(X_val), len(X_test))

7000 1500 1500


## Tokenization and Padding

In [16]:
# Fit the vocabulary on the training split only; unseen words map to the <oov> token
tokenizer = Tokenizer(oov_token="<oov>", lower = True)
tokenizer.fit_on_texts(X_train)

# Every sample is padded / truncated at the end to this many token ids
max_length = 100

# Convert each split to integer sequences
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_val, X_test)
)

# Pad each split to a fixed-width matrix
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(split_seqs, maxlen=max_length, padding='post', truncating='post')
    for split_seqs in (X_train_seq, X_val_seq, X_test_seq)
)

# One more than the number of known words, used as the embedding input dimension
vocab_size = len(tokenizer.word_index) + 1

## Model Creation

In [19]:
# Hyperparameters of the subject classifier
embedding_dim = 64
# Units per LSTM direction; matches the size the layer below was already built with
lstm_units = 128
# One output unit per subject known to the fitted label encoder
num_classes = len(label_enco.classes_)

# Embedding -> bidirectional LSTM (final state only) -> dense head with dropout -> softmax
model = Sequential([
    # Sequence length follows the padding width instead of a second hard-coded 100
    Embedding(input_dim = vocab_size, output_dim=embedding_dim, input_length = max_length),
    Bidirectional(LSTM(lstm_units, return_sequences=False)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

# Labels are integer class ids, hence the sparse categorical loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [20]:
# Stop once validation loss has not improved for 3 epochs and restore the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train for at most 20 epochs, validating on the held-out validation split
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=50,
    epochs=20,
    validation_data=(X_val_pad, y_val),
    callbacks=[early_stopping],
)

Epoch 1/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 102ms/step - accuracy: 0.7331 - loss: 0.7181 - val_accuracy: 1.0000 - val_loss: 7.8609e-05
Epoch 2/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 122ms/step - accuracy: 1.0000 - loss: 4.1205e-04 - val_accuracy: 1.0000 - val_loss: 1.2513e-05
Epoch 3/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 130ms/step - accuracy: 1.0000 - loss: 1.5628e-04 - val_accuracy: 1.0000 - val_loss: 3.9534e-06
Epoch 4/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 125ms/step - accuracy: 1.0000 - loss: 6.8530e-05 - val_accuracy: 1.0000 - val_loss: 1.2274e-06
Epoch 5/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 112ms/step - accuracy: 1.0000 - loss: 4.0139e-05 - val_accuracy: 1.0000 - val_loss: 7.4792e-07
Epoch 6/20
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 114ms/step - accuracy: 1.0000 - loss: 1.7773e-05 - val_accura

In [21]:
# Print the architecture again and score the model on the held-out test split.
# NOTE(review): a perfect test score with near-zero loss is suspicious. The dataset preview
# shows the same base sentence with different suffixes (rows 0 and 7), so near-duplicates may
# be shared between the train and test splits — confirm before trusting this number.
model.summary()
loss, acc = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {acc:.10f}")

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 1.0000 - loss: 1.8735e-09
Test Accuracy: 1.0000000000


## User Input Testing

In [100]:
# Free-form sample sentence used to try out the trained subject classifier
user_input = "justify why civilisations around the world created monuments resulting in"

In [102]:
# Apply exactly the preprocessing used for training: same cleaning, same tokenizer,
# same padding width and the same end-of-sequence truncation.
user_input_clean = dataCleaning(user_input)
seq = tokenizer.texts_to_sequences([user_input_clean])
# truncating='post' matches the training data; the library default would cut the start instead
padding = pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')
print(padding.shape)

(1, 100)


In [104]:
# Predict class probabilities for the single padded input (one row per input)
pred = model.predict(padding)
predicted_class_index = int(np.argmax(pred[0]))
# Build the id -> subject mapping from the fitted encoder so it cannot drift from the labels
label_map = dict(enumerate(label_enco.classes_))
predicted_label = label_map[predicted_class_index]
print(f"Predicted Subject: {predicted_label}")
# Probability the model assigned to the winning class
confidence = float(np.max(pred[0]))
print(f"Confidence: {confidence:.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Predicted Subject: Science
Confidence: 0.99


# Autocomplete Model

In [11]:
# Normalize text with regular expressions
def clean_text(text):
    """Lowercase ``text`` and reduce it to letters, digits, hyphens and single spaces.

    Non-string input (such as NaN from an empty CSV field) yields ``""``.
    """
    if isinstance(text, str):
        # Applied in order to the lowercased text
        substitutions = (
            (r"[\[\]\(\)]", ""),       # drop square and round brackets
            (r"[^a-z0-9\s\-]", ""),    # keep only letters, digits, whitespace and hyphens
            (r"\s+", " "),             # collapse runs of whitespace to one space
        )
        cleaned = text.lower()
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()

    return ""

In [15]:
def combine_cleaned_text(row):
    """Join a row's cleaned heading, subheading and sentence into one string.

    Args:
        row: Mapping-like object (for example a DataFrame row) with the keys
            ``heading_clean``, ``subheading_clean`` and ``sentence_clean``.

    Returns:
        The three values separated by single spaces. Empty values are skipped
        so a missing heading or subheading does not leave doubled spaces.
    """
    parts = (
        str(row[column])
        for column in ('heading_clean', 'subheading_clean', 'sentence_clean')
    )
    return " ".join(part for part in parts if part).strip()

In [13]:
# Greedy next-word generation from a user-supplied seed
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the model's most likely next word.

    Each step converts the current text to token ids with
    ``tokenizer.texts_to_sequences``, left-pads them to ``max_sequence_len - 1``
    ids, asks ``model`` for next-token probabilities and appends the word of the
    highest-probability index.

    Args:
        seed_text: Text to continue.
        next_words: Maximum number of words to append.
        model: Object whose ``predict`` returns one row of probabilities per input.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and ``index_word``.
        max_sequence_len: Length of the padded n-gram sequences used in training
            (inputs are one token shorter).

    Returns:
        ``seed_text`` followed by the generated words. Generation stops early if
        the predicted index has no word (for example the padding index 0),
        because the input would not change and every later step would repeat it.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_index = int(np.argmax(predicted_probs, axis=-1)[0])

        # Direct id -> word lookup instead of scanning the whole vocabulary each step
        output_word = tokenizer.index_word.get(predicted_index, '')
        if not output_word:
            break

        seed_text += " " + output_word
    return seed_text

## Creating the CSV file

In [12]:
# Read the source text file into a list of lines (line endings are kept)
with open("word_prediction.txt", 'r', encoding='utf-8') as file:
    lines = file.readlines()

In [8]:
# Parser state for the loop below: collected rows plus the heading / subheading currently in effect
data = []
current_heading = ""
current_subheading = ""
# NOTE(review): `paragraph` is not used anywhere in the visible notebook — confirm it can be dropped
paragraph = ""

In [18]:
# Reset the parser state here so that re-running this cell rebuilds the rows
# from scratch instead of appending duplicates to a list left over in the kernel.
data = []
current_heading = ""
current_subheading = ""

HEADING_PREFIX = "## "
SUBHEADING_PREFIX = "### "

for line in lines:
    line = line.strip()
    if not line:
        continue  # Skip empty lines
    if line.startswith(HEADING_PREFIX):
        # Strip only the leading marker; a later "## " inside the title is kept
        current_heading = line[len(HEADING_PREFIX):].strip()
        # A new heading starts a new section, so the previous subheading no longer applies
        current_subheading = ""
    elif line.startswith(SUBHEADING_PREFIX):
        current_subheading = line[len(SUBHEADING_PREFIX):].strip()
    else:
        # Treat as paragraph text: one row per sentence, tagged with the current section
        data.extend(
            [current_heading, current_subheading, sentence.strip()]
            for sentence in sent_tokenize(line)
        )

# Write to CSV
with open("history.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["heading", "subheading", "sentence"])
    writer.writerows(data)

## Cleaning the CSV file

In [19]:
# Load the heading / subheading / sentence table written in the previous step
sentenceComplete = pd.read_csv("history.csv")

In [21]:
# Add a "<column>_clean" companion for each text column by running it through clean_text
for source_column in ('sentence', 'heading', 'subheading'):
    cleaned_column = sentenceComplete[source_column].apply(clean_text)
    sentenceComplete[f'{source_column}_clean'] = cleaned_column

In [23]:
# Preview the raw and cleaned columns side by side
sentenceComplete.head(20)

Unnamed: 0,heading,subheading,sentence,sentence_clean,heading_clean,subheading_clean
0,Early Civilizations and the Indus Valley (3300...,Introduction,The Indus Valley Civilization is considered on...,the indus valley civilization is considered on...,early civilizations and the indus valley 3300 ...,introduction
1,Early Civilizations and the Indus Valley (3300...,Introduction,"Flourishing between 3300 BCE and 1300 BCE, it ...",flourishing between 3300 bce and 1300 bce it c...,early civilizations and the indus valley 3300 ...,introduction
2,Early Civilizations and the Indus Valley (3300...,Introduction,Often referred to as the Harappan Civilization...,often referred to as the harappan civilization...,early civilizations and the indus valley 3300 ...,introduction
3,Early Civilizations and the Indus Valley (3300...,Introduction,Spanning an area of approximately 1.25 million...,spanning an area of approximately 125 million ...,early civilizations and the indus valley 3300 ...,introduction
4,Early Civilizations and the Indus Valley (3300...,Geographic Spread and Major Cities,The civilization spread across the vast plains...,the civilization spread across the vast plains...,early civilizations and the indus valley 3300 ...,geographic spread and major cities
5,Early Civilizations and the Indus Valley (3300...,Geographic Spread and Major Cities,"Major urban centers such as Harappa, Mohenjo-D...",major urban centers such as harappa mohenjo-da...,early civilizations and the indus valley 3300 ...,geographic spread and major cities
6,Early Civilizations and the Indus Valley (3300...,Geographic Spread and Major Cities,These cities were strategically situated near ...,these cities were strategically situated near ...,early civilizations and the indus valley 3300 ...,geographic spread and major cities
7,Early Civilizations and the Indus Valley (3300...,Geographic Spread and Major Cities,The Indus and its tributaries created an ideal...,the indus and its tributaries created an ideal...,early civilizations and the indus valley 3300 ...,geographic spread and major cities
8,Early Civilizations and the Indus Valley (3300...,Geographic Spread and Major Cities,Harappa and Mohenjo-Daro are the two best-know...,harappa and mohenjo-daro are the two best-know...,early civilizations and the indus valley 3300 ...,geographic spread and major cities
9,Early Civilizations and the Indus Valley (3300...,Geographic Spread and Major Cities,"Both cities were meticulously planned, featuri...",both cities were meticulously planned featurin...,early civilizations and the indus valley 3300 ...,geographic spread and major cities


In [25]:
# Build one training string per row: cleaned heading + subheading + sentence
sentenceComplete['full_text'] = sentenceComplete.apply(combine_cleaned_text, axis=1)

## Tokenizer and Padding

In [28]:
# Fit a fresh word-level tokenizer on the combined text.
# NOTE(review): this rebinds `tokenizer`, which the subject classifier cells above also use;
# re-running those cells after this one would use the wrong vocabulary.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentenceComplete['full_text'])

In [30]:
# Vocabulary size for the embedding / output layer: number of known words plus one
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

2427


In [32]:
# Convert every combined text row into a list of integer token ids
text_sequence = tokenizer.texts_to_sequences(sentenceComplete['full_text'])

In [34]:
# Expand each token sequence into all of its prefixes of length >= 2, so every
# prefix supplies (context tokens..., next token) as one training example.
input_sequence = [
    token_ids[:prefix_len]
    for token_ids in text_sequence
    for prefix_len in range(2, len(token_ids) + 1)
]

In [36]:
# Length of the longest n-gram prefix; every sequence is left-padded to this width
max_seq_len = max(len(x) for x in input_sequence)

# Note the naming: `input_sequence` is the ragged list, `input_sequences` the padded result
input_sequences = pad_sequences(input_sequence, maxlen=max_seq_len, padding='pre')

In [38]:
# Split each padded row into model input (all but the last token) and target (the last token)
input_sequences = np.array(input_sequences)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [40]:
# Next-word model: embedding -> two stacked bidirectional LSTMs with dropout -> softmax over
# the vocabulary. The first LSTM returns the full sequence so the second one can consume it.
# NOTE(review): this rebinds `model`, replacing the subject classifier defined earlier.
model = Sequential([
    Embedding(input_dim = vocab_size, output_dim=100, input_length = X.shape[1]),
    Bidirectional(LSTM(150, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(100, return_sequences=False)),
    Dropout(0.2),
    Dense(vocab_size, activation='softmax')
])
# Targets are integer token ids, hence the sparse categorical loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [None]:
# Train on every n-gram example for 30 epochs.
# No validation data is passed, so the accuracy logged below is training accuracy only.
history = model.fit(X, y, epochs=30, verbose=1)

Epoch 1/30
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 229ms/step - accuracy: 0.1285 - loss: 5.6653
Epoch 2/30
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m291s[0m 233ms/step - accuracy: 0.3262 - loss: 4.1539
Epoch 3/30
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 221ms/step - accuracy: 0.3853 - loss: 3.6654
Epoch 4/30
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 223ms/step - accuracy: 0.4169 - loss: 3.3329
Epoch 5/30
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 219ms/step - accuracy: 0.4492 - loss: 3.0555
Epoch 6/30
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 220ms/step - accuracy: 0.4653 - loss: 2.8487
Epoch 7/30
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 243ms/step - accuracy: 0.4899 - loss: 2.6118
Epoch 8/30
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m574s[0m 460ms/step - accuracy: 0.5042 - loss:

In [132]:
# Continue a seed sentence with 60 greedily predicted words and show the result
seed = "Prime Minister of independent India, Jawaharlal Nehru, played a pivotal role during a period which was the most"
generated = generate_text(seed, next_words=60, model=model, tokenizer=tokenizer, max_sequence_len=max_seq_len)
print(generated)

Prime Minister of independent India, Jawaharlal Nehru, played a pivotal role during a period which was the most minister of islam in india during the first battle of plassey in 1757 ce where the british under robert clive defeated siraj ud daula the nawab of bengal and artistic achievement and would characterize the administration of delhi literature in the early 13th century in modern day islamic across and intellectual activity in india setting the stage for the establishment
