In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, hamming_loss, coverage_error, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from gensim.models import KeyedVectors
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import MultiLabelBinarizer

In [12]:
# Load the dataset
data = pd.read_csv('https://raw.githubusercontent.com/DeepikaPanneer/Multi-Label-Predictions-on-Academic-Articles/main/data/train.csv')
data['TEXT'] = data['TITLE'] + ' ' + data['ABSTRACT']

X = data['TEXT']
y = data[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

maxlen = 200
X_train_padded = pad_sequences(X_train_seq, maxlen=maxlen)
X_val_padded = pad_sequences(X_val_seq, maxlen=maxlen)

# Compute class weights for multi-label data
def compute_multi_label_class_weights(y):
    n_samples = len(y)
    n_classes = y.shape[1]

    class_counts = np.sum(y, axis=0)
    class_weights = n_samples / (n_classes * class_counts)

    return class_weights

# Compute class weights
class_weights = compute_multi_label_class_weights(y_train)


In [13]:
# Load pre-trained Word2Vec embeddings
word2vec_path = 'https://drive.google.com/file/d/1CBfeuD2OeFynUwLnIS1ft7ZiqLC48B5K/view?usp=drive_link'




In [14]:
!pip install gensim
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
%cd /content/drive/My Drive/Ranjitha/
word2vec_model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)


/content/drive/My Drive/Ranjitha


In [15]:
from tensorflow.keras.initializers import Constant
# Prepare embedding matrix
embedding_dim = 300
num_words = min(max_words, len(tokenizer.word_index)) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i > max_words:
        continue
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]



# Define the neural network model
model = Sequential()
model.add(Embedding(num_words, embedding_dim, embeddings_initializer=Constant(embedding_matrix),
                    trainable=False))  # Freeze the embedding layer
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(6, activation='sigmoid'))  # Sigmoid activation for multi-label classification

model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Binary crossentropy for multi-label classification
              metrics=['accuracy'])
model.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 300)         3000300   
                                                                 
 lstm_1 (LSTM)               (None, 128)               219648    
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 6)                 390       
                                                                 
Total params: 3228594 (12.32 MB)
Trainable params: 228294 (891.77 KB)
Non-trainable params: 3000300 (11.45 MB)
_________________________________________________________________


In [16]:
# Create a dictionary of class weights
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Train the model
model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_data=(X_val_padded, y_val),class_weight=class_weight_dict)




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7dad38294a00>

In [17]:
# Evaluate the model
y_pred_val = model.predict(X_val_padded)
y_pred_val_binary = np.where(y_pred_val >= 0.5, 1, 0)




In [18]:
accuracy = accuracy_score(y_val, y_pred_val_binary)
f1 = f1_score(y_val, y_pred_val_binary, average='weighted')
precision = precision_score(y_val, y_pred_val_binary, average='weighted')
recall = recall_score(y_val, y_pred_val_binary, average='weighted')
hamming = hamming_loss(y_val, y_pred_val_binary)
coverage = coverage_error(y_val, y_pred_val_binary)
tn, fp, fn, tp = confusion_matrix(y_val.ravel(), y_pred_val_binary.ravel()).ravel()
g_mean = np.sqrt((tp / (tp + fn)) * (tn / (tn + fp)))

print("Accuracy: {:.4f}".format(accuracy))
print("F1 Score: {:.4f}".format(f1))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("Hamming Loss: {:.4f}".format(hamming))
print("Coverage: {:.4f}".format(coverage))
print("G-Mean: {:.4f}".format(g_mean))

Accuracy: 0.5802
F1 Score: 0.7185
Precision: 0.7927
Recall: 0.6655
Hamming Loss: 0.1066
Coverage: 2.9385
G-Mean: 0.7969
