In [1]:


from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
! pip install kaggle



In [3]:
! mkdir ~/.kaggle

In [4]:
! cp kaggle.json ~/.kaggle/

In [6]:
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle datasets download -d shivanandmn/multilabel-classification-dataset

Downloading multilabel-classification-dataset.zip to /content
  0% 0.00/11.4M [00:00<?, ?B/s]
100% 11.4M/11.4M [00:00<00:00, 224MB/s]


In [7]:
!unzip multilabel-classification-dataset.zip

Archive:  multilabel-classification-dataset.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [8]:
import pandas as pd
import tensorflow as tf

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [9]:
train_df

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
20967,20968,Contemporary machine learning: a guide for pra...,Machine learning is finding increasingly bro...,1,1,0,0,0,0
20968,20969,Uniform diamond coatings on WC-Co hard alloy c...,Polycrystalline diamond coatings have been g...,0,1,0,0,0,0
20969,20970,Analysing Soccer Games with Clustering and Con...,We present a new approach for identifying si...,1,0,0,0,0,0
20970,20971,On the Efficient Simulation of the Left-Tail o...,The sum of Log-normal variates is encountere...,0,0,1,1,0,0


In [10]:
test_df

Unnamed: 0,ID,TITLE,ABSTRACT
0,20973,Closed-form Marginal Likelihood in Gamma-Poiss...,We present novel understandings of the Gamma...
1,20974,Laboratory mid-IR spectra of equilibrated and ...,Meteorites contain minerals from Solar Syste...
2,20975,Case For Static AMSDU Aggregation in WLANs,Frame aggregation is a mechanism by which mu...
3,20976,The $Gaia$-ESO Survey: the inner disk intermed...,Milky Way open clusters are very diverse in ...
4,20977,Witness-Functions versus Interpretation-Functi...,Proving that a cryptographic protocol is cor...
...,...,...,...
8984,29957,Supporting mixed-datatype matrix multiplicatio...,We approach the problem of implementing mixe...
8985,29958,An axiomatic basis for Blackwell optimality,In the theory of Markov decision processes (...
8986,29959,GeneVis - An interactive visualization tool fo...,GeneVis is a web-based tool to visualize com...
8987,29960,Quantifying the causal effect of speed cameras...,This paper quantifies the effect of speed ca...


In [24]:
df_columns = train_df.columns
print(df_columns)
X_train = train_df[df_columns[1:3]]
y_train = train_df[df_columns[3:]]
X_test = test_df[df_columns[1:3]]
#y_test = test_df[df_columns[3:]]

Index(['ID', 'TITLE', 'ABSTRACT', 'Computer Science', 'Physics', 'Mathematics',
       'Statistics', 'Quantitative Biology', 'Quantitative Finance'],
      dtype='object')


In [27]:

def text_to_vectors(df, max_len=128, max_features=None, vocab_size=None):
  """
  Converts text columns in a DataFrame to vectors for neural network input.

  Args:
      df (pandas.DataFrame): DataFrame containing text columns.
      max_len (int, optional): Maximum length for padded sequences. Defaults to 100.
      max_features (int, optional): Maximum number of words to consider in the vocabulary.
                                   Overrides vocab_size if both are provided.
      vocab_size (int, optional): Size of the vocabulary to use. Defaults to None (all words).

  Returns:
      pandas.DataFrame: DataFrame with vectorized representations of text columns.
  """

  # Combine title and abstract for better representation (optional)
  text_data = df['TITLE'] + ' ' + df['ABSTRACT']

  # Tokenize text data
  tokenizer = Tokenizer(num_words=max_features if max_features else vocab_size)
  tokenizer.fit_on_texts(text_data)
  sequences = tokenizer.texts_to_sequences(text_data)

  # Pad sequences to same length
  padded_sequences = pad_sequences(sequences, maxlen=max_len)

  return padded_sequences


In [25]:
X_train = text_to_vectors(X_train)


[[  230    85    12 ... 13191     9    65]
 [    0     0     0 ...    28   364  2344]
 [    0     0     0 ...     1   990  2669]
 ...
 [    5    17  1118 ...     1  3454  2424]
 [    2   180   298 ...     6   206   847]
 [   10  8115  3446 ...     4  1498  2077]]


In [26]:
X_test = text_to_vectors(X_test)

[[    0     0     0 ...    11  3622   113]
 [  544     2   858 ...     2     1 15277]
 [11767  1484  2512 ...  2512  1093   194]
 ...
 [    0     0     0 ...     2  2531  2938]
 [    4  1533   211 ...   630  1847  3268]
 [    0     0     0 ...     1   121  3835]]


In [28]:
print(X_train.shape)
print(X_test.shape)

(20972, 128)
(8989, 128)


In [29]:
from tensorflow.keras.layers import Dense

def build_multi_label_model(input_shape, num_labels):
  """
  Builds a neural network model for multi-label classification.

  Args:
      input_shape: A tuple representing the shape of the input data.
      num_labels: The number of output labels.

  Returns:
      A compiled Keras model.
  """
  # Define the model architecture
  model = keras.Sequential([
      Dense(units=64, activation="relu", input_shape=input_shape),
      Dense(units=32, activation="relu"),
      Dense(16, activation="relu"),
      Dense(8, activation="relu"),
      # Output layer with sigmoid activation for multi-label
      Dense(units=num_labels, activation="sigmoid")
  ])

  # Compile the model with binary crossentropy loss
  model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

  return model

In [30]:
# Get the number of labels from the data
num_labels = y_train.shape[1]  # Assuming labels are one-hot encoded

# Build the model
model = build_multi_label_model(X_train.shape[1:], num_labels)

# Train the model
model.fit(X_train, y_train, epochs=12, batch_size=32)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.src.callbacks.History at 0x796e4d605b10>

In [31]:
predictions = model.predict(X_test)

