In [1]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.layers import Dense
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [2]:
df = pd.read_csv('Annotations.csv')

df.columns = df.columns.str.strip()

print(df.head())

texts = df['Section Content'].tolist()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

vocab_size = len(tokenizer.word_index) + 1
max_seq_len = max(len(text.split()) for text in texts)

df = df.dropna(subset=['IMRAD Section'])
df = df.drop(columns=['Title'])

df = pd.get_dummies(df, columns=['IMRAD Section', 'Subsection', 'Sub subsection'], prefix=['IMRAD', 'Subsection', 'Subsubsection'])

   ID    Title            IMRAD Section  IMRAD Section ID  \
0   1  Seekers             INTRODUCTION                 1   
1   1  Seekers      RESEARCH OBJECTIVES                 2   
2   1  Seekers  LITERATURE OF THE STUDY                 3   
3   1  Seekers              METHODOLOGY                 4   
4   1  Seekers              METHODOLOGY                 4   

                Subsection  Subsection ID Sub subsection Sub subsection ID  \
0                      NaN              0            NaN                 0   
1                      NaN              0            NaN                 0   
2                      NaN              0            NaN                 0   
3          Research Design             17            NaN                 0   
4  Data Collection Methods              4     Local Data                 9   

                                     Section Content  
0  As people move into the new century, the issue...  
1  This study designed, developed, tested, and ev...  

In [3]:
print(df.columns)

y_IMRAD = df.filter(regex='^IMRAD_(?!Section ID)')
y_Subsection = df.filter(regex='^Subsection_(?!Subsection ID)')
y_Subsubsection = df.filter(regex='^Subsubsection_(?!Subsubsection ID)')

Index(['ID', 'IMRAD Section ID', 'Subsection ID', 'Sub subsection ID',
       'Section Content', 'IMRAD_CONCLUSION', 'IMRAD_CONCLUSION ',
       'IMRAD_INTRODUCTION', 'IMRAD_LITERATURE OF THE STUDY',
       'IMRAD_METHODOLOGY',
       ...
       'Subsubsection_Testing', 'Subsubsection_Text Classification:',
       'Subsubsection_Text Recognition', 'Subsubsection_The Design',
       'Subsubsection_The Design ', 'Subsubsection_The Development ',
       'Subsubsection_The Development of the Web and Mobile Application ',
       'Subsubsection_To Do List', 'Subsubsection_Train Recognizer Model',
       'Subsubsection_User Interface '],
      dtype='object', length=130)


In [4]:
def hierarchical_attention_model(max_seq_len, vocab_size, embedding_dim, task):
   input_layer = keras.layers.Input(shape=(max_seq_len,))
   embeddings = keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)

   lstm_1 = keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True))(embeddings)
   attention_1 = keras.layers.Attention()([lstm_1, lstm_1])
   pooled_1 = keras.layers.GlobalAveragePooling1D()(attention_1)
   
   lstm_2 = keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True))(attention_1)
   attention_2 = keras.layers.Attention()([lstm_2, lstm_2])
   pooled_2 = keras.layers.GlobalAveragePooling1D()(attention_2)

   lstm_3 = keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True))(attention_2)
   attention_3 = keras.layers.Attention()([lstm_3, lstm_3])
   pooled_3 = keras.layers.GlobalAveragePooling1D()(attention_3)

   if task == 'IMRAD':
       output = Dense(len(df.filter(like='IMRAD_').columns), activation='softmax')(pooled_1)
       model = keras.models.Model(inputs=input_layer, outputs=output)
       
   elif task == 'Subsection':
       output_Subsection = Dense(len(df.filter(like='Subsection_').columns), activation='softmax')(pooled_2)
       model = keras.models.Model(inputs=input_layer, outputs=output_Subsection)
       
   elif task == 'Subsubsection':
       output_Subsubsection = Dense(len(df.filter(like='Subsubsection').columns), activation='softmax')(pooled_3)
       model = keras.models.Model(inputs=input_layer, outputs=output_Subsubsection)
       
   elif task == 'all':
       output_IMRAD = Dense(len(df.filter(like='IMRAD').columns), activation='softmax')(pooled_1)
       output_Subsection = Dense(len(df.filter(like='Subsection').columns), activation='softmax')(pooled_2)
       output_Subsubsection = Dense(len(df.filter(like='Subsubsection').columns), activation='softmax')(pooled_3)

       model = keras.models.Model(inputs=input_layer, outputs=[output_IMRAD, output_Subsection, output_Subsubsection])

   return model


In [9]:
vocab_size = 8971
max_seq_len = 2073

embedding_dim = min(50, (vocab_size // 2))

model = hierarchical_attention_model(max_seq_len, vocab_size, embedding_dim, 'all')

X = df['Section Content']
y_IMRAD = df.filter(regex='^IMRAD_(?!Section ID)', axis=1)
y_Subsection = df.filter(regex='^Subsection_(?!Subsection ID)', axis=1)
y_Subsubsection = df.filter(regex='^Subsubsection_(?!Subsubsection ID)', axis=1)

X = tokenizer.texts_to_sequences(X)

X = pad_sequences(X, maxlen=max_seq_len)

In [10]:
df.columns = df.columns.str.strip()
y_IMRAD = y_IMRAD.astype(int)
y_Subsection = y_Subsection.astype(int)
y_Subsubsection = y_Subsubsection.astype(int)


In [11]:
model_IMRAD = hierarchical_attention_model(max_seq_len, vocab_size, embedding_dim, 'IMRAD')
model_Subsection = hierarchical_attention_model(max_seq_len, vocab_size, embedding_dim, 'Subsection')
model_Subsubsection = hierarchical_attention_model(max_seq_len, vocab_size, embedding_dim, 'Subsubsection')

model_IMRAD.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_Subsection.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_Subsubsection.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 1
batch_size = 32

model_IMRAD.fit(X, y_IMRAD, epochs=epochs, batch_size=batch_size)

model_Subsection.fit(X, y_Subsection, epochs=epochs, batch_size=batch_size)

model_Subsubsection.fit(X, y_Subsubsection, epochs=epochs, batch_size=batch_size)



<keras.src.callbacks.History at 0x19f29377b90>

In [14]:
# Save the entire model
model.save('new model.h5')



  saving_api.save_model(


In [13]:
print(y_Subsection.nunique())
print(df.filter(like='Subsection_').columns)


Subsection_Algorithm Analysis                       2
Subsection_Applied Concepts and Techniques          2
Subsection_Conceptualization                        2
Subsection_Data Collection Methods                  2
Subsection_Data Model Generation                    2
Subsection_Delete Stop Words                        2
Subsection_Distribution of Questionnaire            2
Subsection_Evaluation Phase                         2
Subsection_Evaluation Tool                          2
Subsection_Feature Extraction                       2
Subsection_Lemmatization                            2
Subsection_Locale of the Study                      2
Subsection_Locale of the Study                      2
Subsection_Lowercasing                              2
Subsection_NONE                                     2
Subsection_Numeric and Special Character Removal    2
Subsection_Overall Record of Actual Testing         2
Subsection_Population of the Study                  2
Subsection_Population of the