## Children's Book Age Group Prediction  
* The dataset used is [Highly Rated Children Books And Stories](https://www.kaggle.com/datasets/thomaskonstantin/highly-rated-children-books-and-stories), uploaded by Thomas Konstantin on Kaggle.
* Given *the names and descriptions of highly-rated children's books*, we predict the **age group** of each book.  
  


In [2]:
# IMPORTING LIBRARIES
import numpy as np
import pandas as pd
import plotly.express as px

import re
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [5]:
# Load the raw dataset. The file is decoded as Latin-1
# (presumably because it is not valid UTF-8 -- verify against the source file).
csv_path = '../input/highly-rated-children-books-and-stories/children_stories.Csv'
df = pd.read_csv(csv_path, encoding='latin-1')

In [6]:
# Display the loaded DataFrame for a first look at its rows and columns.
df

In [9]:
# CREATING LABELS
# Tally the books per raw age category and show the counts ordered by label.
category_counts = df['cats'].value_counts()
category_counts.sort_index()

In [10]:
# List every distinct raw age label in sorted order; sorted() accepts the
# array returned by unique() directly and always returns a list.
sorted(df['cats'].unique())

In [11]:
# Drop the rows whose category is the bare label 'Age ' (no age range given),
# then renumber the index so it is contiguous again.
# The query must run on `df`: no variable named `data` exists in this notebook.
df = df.drop(df.query("cats == 'Age '").index, axis=0).reset_index(drop=True)

In [12]:
# Raw `cats` labels that are treated as the "young" group; the encoding step
# below maps these to class 0 and every other label to class 1.
# NOTE(review): 'Age  0-3' (two spaces) is listed separately from 'Age 0-3';
# presumably it matches a spelling variant in the raw data -- confirm.
young_ages = [
    'Age 6months+',
    'Age  0-3',
    'Age 0+',
    'Age 0-2',
    'Age 0-3',
    'Age 0-4',
    'Age 0-5',
    'Age 0-6',
    'Age 1+',
    'Age 1-2',
    'Age 1-3',
    'Age 1-4',
    'Age 1-5',
    'Age 1-6',
    'Age 2+',
    'Age 2-4',
    'Age 2-5',
    'Age 2-6',
    'Age 2-7',
    'Age 2-9',
    'Age 3+',
    'Age 3-4',
    'Age 3-5',
    'Age 3-6',
    'Age 3-7',
    'Age 4+',
    'Age 4-11',
    'Age 4-5',
    'Age 4-6',
    'Age 4-7',
    'Age 4-8'
]

In [13]:
def _encode_age_group(age):
    """Return 0 for a label listed in ``young_ages`` and 1 for anything else."""
    if age in young_ages:
        return 0
    return 1


# Replace the raw age labels with the binary target (0 = young, 1 = other).
df['cats'] = df['cats'].apply(_encode_age_group)

In [14]:
# Class balance: share of rows that fall in each of the two label classes.
class_counts = df['cats'].value_counts()
class_counts / len(df['cats'])

In [15]:
# Display the DataFrame again, now with `cats` holding the binary labels.
df

In [16]:
# DATA PROCESSING
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def process_text(text, stop_words=None):
    """Strip digits and stop words from ``text``.

    Args:
        text: The raw string to clean.
        stop_words: Optional collection of lowercase words to remove.
            Defaults to NLTK's English stop-word list.

    Returns:
        The remaining words joined by single spaces, in their original
        order and casing.
    """
    if stop_words is None:
        # Load the list once instead of re-reading the corpus for every word.
        stop_words = _english_stop_words()

    # Replace each run of digits with a space so neighbouring words stay apart.
    text = re.sub(r'\d+', ' ', text)

    # Compare in lowercase: the stop-word list is lowercase, so a capitalised
    # "The" would otherwise survive the filter.
    kept_words = [word for word in text.split() if word.lower() not in stop_words]

    return ' '.join(kept_words)

In [17]:
# Clean both text columns with the same routine. apply() returns a new Series,
# so the DataFrame's own columns are left untouched.
names = df['names'].apply(process_text)
descriptions = df['desc'].apply(process_text)

# Independent copy of the binary targets.
labels = df['cats'].copy()

In [18]:
# Inspect the cleaned titles.
names

In [19]:
# Inspect the cleaned descriptions.
descriptions

In [20]:
# Inspect the binary target labels.
labels

In [21]:
# Build one vocabulary shared by titles and descriptions, so a given word maps
# to the same integer index in both inputs.
tokenizer = Tokenizer()
corpus = pd.concat([names, descriptions])
tokenizer.fit_on_texts(corpus)

# Replace each text with its list of integer word indices.
names = tokenizer.texts_to_sequences(names)
descriptions = tokenizer.texts_to_sequences(descriptions)

In [22]:
# Peek at the first five titles as integer token sequences.
names[0:5]

In [23]:
# One slot per known word plus one extra, because the tokenizer's word indices
# start at 1 and index 0 is left for padding.
known_words = len(tokenizer.word_index)
vocab_length = known_words + 1

print("Vocabulary length:", vocab_length)

In [24]:
# Longest tokenised title and description; these become the padded lengths.
max_name_length = np.max([len(name) for name in names])
max_desc_length = np.max([len(desc) for desc in descriptions])

print("Max name length:", max_name_length)
print("Max description length:", max_desc_length)

In [25]:
# Pad every sequence at the end ('post') up to the longest one, so each input
# becomes a rectangular integer array.
names = pad_sequences(
    names,
    maxlen=max_name_length,
    padding='post',
)
descriptions = pad_sequences(
    descriptions,
    maxlen=max_desc_length,
    padding='post',
)

In [26]:
# Report the padded title array's shape, then display the array itself.
print("Shape:", names.shape)
names

In [27]:
# Report the padded description array's shape, then display the array itself.
print("Shape:", descriptions.shape)
descriptions

In [28]:
# TRAIN TEST SPLIT
# 70% of the rows go to training. The three arrays are split in one call so
# titles, descriptions and labels stay aligned row for row.
(
    names_train, names_test,
    descriptions_train, descriptions_test,
    labels_train, labels_test,
) = train_test_split(
    names, descriptions, labels,
    train_size=0.7,
    random_state=100,
)

In [29]:
# MODELLING
# Title branch: embed every token of the padded title, then flatten the
# embeddings into a single feature vector.
name_dim = 64

name_input = tf.keras.Input(shape=(max_name_length,), name="name_input")

name_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=name_dim,
    input_length=max_name_length,
    name="name_embedding",
)
name_embedding = name_embedding_layer(name_input)

name_flatten_layer = tf.keras.layers.Flatten(name="name_flatten")
name_flatten = name_flatten_layer(name_embedding)

In [30]:
# Description branch: embed the tokens, run them through a GRU that emits an
# output at every time step, then flatten those outputs into one vector.
desc_dim = 64

desc_input = tf.keras.Input(shape=(max_desc_length,), name="desc_input")

desc_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=desc_dim,
    input_length=max_desc_length,
    name="desc_embedding",
)
desc_embedding = desc_embedding_layer(desc_input)

desc_gru = tf.keras.layers.GRU(
    units=256,
    return_sequences=True,
    name="gru_layer",
)
gru_layer = desc_gru(desc_embedding)

desc_flatten_layer = tf.keras.layers.Flatten(name="desc_flatten")
desc_flatten = desc_flatten_layer(gru_layer)

In [31]:
# Join the flattened title and description features into one vector.
branch_features = [name_flatten, desc_flatten]
concat = tf.keras.layers.concatenate(branch_features, name="concatenate")

# A single sigmoid unit gives the probability of class 1.
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
output = output_layer(concat)

In [32]:
# Assemble the two-input model from the title and description branches.
model = tf.keras.Model(inputs=[name_input, desc_input], outputs=output)

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
model.summary()
tf.keras.utils.plot_model(model)

In [34]:
# TRAINING
batch_size = 32
epochs = 14

# Track plain accuracy and ROC AUC alongside the binary cross-entropy loss.
tracked_metrics = ['accuracy', tf.keras.metrics.AUC(name='auc')]
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=tracked_metrics,
)

# Learning-rate reduction on plateau, with the callback's default settings.
lr_callback = tf.keras.callbacks.ReduceLROnPlateau()

# 20% of the training data is held out for per-epoch validation.
history = model.fit(
    [names_train, descriptions_train],
    labels_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[lr_callback],
)

In [35]:
# Plot training vs. validation loss per epoch.
# With several y columns Plotly Express runs in wide-form mode, where the axes
# are named 'index' and 'value'; the labels must use those keys to take effect
# ('x'/'y' match no column and are ignored).
fig = px.line(
    history.history,
    y=['loss', 'val_loss'],
    labels={'index': "epoch", 'value': "loss"},
    title="Loss Over Time"
)

fig.show()

In [36]:
# Plot training vs. validation accuracy per epoch.
# With several y columns Plotly Express runs in wide-form mode, where the axes
# are named 'index' and 'value'; the labels must use those keys to take effect
# ('x'/'y' match no column and are ignored).
fig = px.line(
    history.history,
    y=['accuracy', 'val_accuracy'],
    labels={'index': "epoch", 'value': "accuracy"},
    title="Accuracy Over Time"
)

fig.show()

In [37]:
# Score the held-out test split. evaluate() returns the loss followed by the
# compiled metrics in order: [loss, accuracy, auc].
results = model.evaluate([names_test, descriptions_test], labels_test)
test_accuracy, test_auc = results[1], results[2]

print("Accuracy:", test_accuracy)
print(" ROC AUC:", test_auc)