In [9]:
!pip install scipy
!pip install tensorflow



In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, hamming_loss, coverage_error, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import MultiLabelBinarizer

In [11]:
# Load the dataset
train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
train_data['TEXT'] = train_data['TITLE'] + ' ' + train_data['ABSTRACT']

X = train_data['TEXT']
y = train_data[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

X_train_dense = X_train_tfidf.toarray()
X_val_dense = X_val_tfidf.toarray()

In [13]:
# Compute class weights for multi-label data
def compute_multi_label_class_weights(y):
    n_samples = len(y)
    n_classes = y.shape[1]

    class_counts = np.sum(y, axis=0)
    class_weights = n_samples / (n_classes * class_counts)

    return class_weights

# Compute class weights
class_weights = compute_multi_label_class_weights(y_train)
# Create a dictionary of class weights
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

In [14]:
# Define the neural network model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_tfidf.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(6, activation='sigmoid'))  # Sigmoid activation for multi-label classification

model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Binary crossentropy for multi-label classification
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 128)               1280128   
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_5 (Dense)             (None, 6)                 390       
                                                                 
Total params: 1288774 (4.92 MB)
Trainable params: 1288774 (4.92 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
# Train the model with class weights
model.fit(X_train_dense, y_train, epochs=10, batch_size=32, validation_data=(X_val_dense, y_val), class_weight=class_weight_dict)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d6c32f8c9a0>

In [16]:
# Evaluate the model
y_pred_val = model.predict(X_val_dense)
y_pred_val_binary = np.where(y_pred_val >= 0.5, 1, 0)

accuracy = accuracy_score(y_val, y_pred_val_binary)
f1 = f1_score(y_val, y_pred_val_binary, average='weighted')
precision = precision_score(y_val, y_pred_val_binary, average='weighted')
recall = recall_score(y_val, y_pred_val_binary, average='weighted')
hamming = hamming_loss(y_val, y_pred_val_binary)
coverage = coverage_error(y_val, y_pred_val_binary)
tn, fp, fn, tp = confusion_matrix(y_val.ravel(), y_pred_val_binary.ravel()).ravel()
g_mean = np.sqrt((tp / (tp + fn)) * (tn / (tn + fp)))

print("Accuracy: {:.4f}".format(accuracy))
print("F1 Score: {:.4f}".format(f1))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("Hamming Loss: {:.4f}".format(hamming))
print("Coverage: {:.4f}".format(coverage))
print("G-Mean: {:.4f}".format(g_mean))


Accuracy: 0.6572
F1 Score: 0.8010
Precision: 0.8308
Recall: 0.7736
Hamming Loss: 0.0808
Coverage: 2.4684
G-Mean: 0.8609
