import libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

import dataset from Kaggle

In [3]:
!pip install kaggle

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d shivanandmn/multilabel-classification-dataset

from zipfile import ZipFile
dataset='/content/multilabel-classification-dataset.zip'

with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print('data extracted')

Downloading multilabel-classification-dataset.zip to /content
  0% 0.00/11.4M [00:00<?, ?B/s]
100% 11.4M/11.4M [00:00<00:00, 164MB/s]
data extracted


data preparation

In [4]:
# Read the training CSV into a DataFrame.
data_train = pd.read_csv(filepath_or_buffer='/content/train.csv')


In [7]:
# One text document per row: the abstract, a single space, then the title.
features = data_train['ABSTRACT'].add(' ').add(data_train['TITLE'])


In [8]:
# Select the six label columns; each row may carry several labels at once.
targets = data_train.loc[:, [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]]
print(targets)

       Computer Science  Physics  Mathematics  Statistics  \
0                     1        0            0           0   
1                     1        0            0           0   
2                     0        0            1           0   
3                     0        0            1           0   
4                     1        0            0           1   
...                 ...      ...          ...         ...   
20967                 1        1            0           0   
20968                 0        1            0           0   
20969                 1        0            0           0   
20970                 0        0            1           1   
20971                 0        0            1           1   

       Quantitative Biology  Quantitative Finance  
0                         0                     0  
1                         0                     0  
2                         0                     0  
3                         0                     0  
4      

tokenization (TF-IDF vectorization)

In [10]:
# Build TF-IDF features, keeping a vocabulary of at most 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)
# fit_transform returns a SciPy sparse matrix. Downstream cells treat `x` as a
# regular NumPy array (dense printing, np.dot-based matrix products), so it is
# converted to a dense ndarray here.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so test documents influence the vocabulary and IDF
# weights -- consider fitting on the training portion only.
x = vectorizer.fit_transform(features).toarray()

convert targets to numpy array

In [11]:
# Copy the label table into a plain NumPy array (rows = samples, columns = labels).
targets = np.array(targets, copy=True)

train test split

In [12]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    targets,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Show the training feature matrix, prefixed with its name.
print(f'x_train {x_train}')

x_train [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
# Show the held-out feature matrix, prefixed with its name.
print(f'x_test {x_test}')

x_test [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


build our model

In [17]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    The value is computed from exp(-|x|), which never exceeds 1, so large
    negative inputs no longer overflow inside np.exp (the naive formula
    evaluates exp(+large) and emits an overflow RuntimeWarning).

    Parameters
    ----------
    x : scalar or array-like
        Pre-activation value(s).

    Returns
    -------
    NumPy scalar for scalar input, otherwise an ndarray shaped like ``x``,
    with every value in [0, 1].
    """
    x = np.asarray(x)
    z = np.exp(-np.abs(x))
    # For x >= 0 this is the textbook form; for x < 0 the algebraically equal
    # form exp(x) / (1 + exp(x)) is used, with z == exp(x).
    result = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves arrays of higher rank unchanged.
    return result[()]

def sigmoid_derivative(x):
    """Return x * (1 - x), element-wise.

    This equals the slope of the sigmoid when ``x`` is the sigmoid's *output*
    (the activated value), not its pre-activation input.
    """
    complement = 1 - x
    return x * complement

define and train our model

In [20]:
class LinearNN:
    """Single-layer network: a linear map followed by an element-wise sigmoid.

    Each output column is an independent sigmoid unit, so one sample can
    activate several outputs at once.

    Attributes
    ----------
    weights : ndarray of shape (n_inputs, n_outputs)
    bias : ndarray of shape (n_outputs,)
    """

    def __init__(self, n_inputs, n_outputs, seed=None):
        """Initialise weights and bias with uniform random values in [0, 1).

        Parameters
        ----------
        n_inputs : int
            Number of input features.
        n_outputs : int
            Number of output units.
        seed : int or None, optional
            When given, parameters are drawn from a private
            ``np.random.RandomState(seed)`` so initialisation is repeatable.
            When None (default), NumPy's global generator is used.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.weights = rng.rand(n_inputs, n_outputs)
        self.bias = rng.rand(n_outputs)

    def train(self, X, T, epochs, lr):
        """Run full-batch gradient descent, updating the parameters in place.

        Parameters
        ----------
        X : 2-D array (dense ndarray or SciPy sparse matrix)
            Input samples, one row per sample.
        T : ndarray
            Target values with the same shape as ``predict(X)``.
        epochs : int
            Number of passes over the whole of ``X``.
        lr : float
            Step size applied to the summed (not averaged) gradient.

        Notes
        -----
        The update uses ``(y_pred - T) * y_pred * (1 - y_pred)``, which is the
        gradient of a squared-error objective through the sigmoid. The value
        printed every 100 epochs is the mean binary cross-entropy, so it is a
        progress metric rather than the exact objective being minimised.
        """
        # Keeps np.log finite when a prediction saturates to exactly 0 or 1.
        eps = 1e-12

        for epoch in range(epochs):

            y_pred = self.predict(X)

            # Error term shared by the weight and bias gradients.
            delta = (y_pred - T) * sigmoid_derivative(y_pred)

            # `@` performs the matrix product for dense arrays and for SciPy
            # sparse matrices alike.
            d_weights = X.T @ delta
            d_bias = np.sum(delta, axis=0)

            self.weights -= lr * d_weights
            self.bias -= lr * d_bias

            if epoch % 100 == 0:
                # Computed from the predictions made before this epoch's update.
                clipped = np.clip(y_pred, eps, 1 - eps)
                loss = np.mean(-T * np.log(clipped) - (1 - T) * np.log(1 - clipped))
                print(f'Loss at epoch {epoch}: {loss}')

    def predict(self, X):
        """Return sigmoid(X @ weights + bias): one value in [0, 1] per sample and output."""
        return sigmoid(X @ self.weights + self.bias)


# One input per TF-IDF feature, one output unit per label column.
model = LinearNN(n_inputs=x_train.shape[1], n_outputs=y_train.shape[1])
# Fit on the training split; the loss is printed every 100 epochs.
model.train(X=x_train, T=y_train, epochs=5000, lr=0.001)

Loss at epoch 0: 3.3245466418525114
Loss at epoch 100: 0.3695312723303342
Loss at epoch 200: 0.31662374849444375
Loss at epoch 300: 0.2876938511518868
Loss at epoch 400: 0.2697731738802322
Loss at epoch 500: 0.2573970557720262
Loss at epoch 600: 0.2481846631217689
Loss at epoch 700: 0.24096034474603134
Loss at epoch 800: 0.2350766315269479
Loss at epoch 900: 0.23014577983178353
Loss at epoch 1000: 0.2259201986892351
Loss at epoch 1100: 0.2222336664525279
Loss at epoch 1200: 0.21897002943736374
Loss at epoch 1300: 0.21604542121226758
Loss at epoch 1400: 0.2133976134084213
Loss at epoch 1500: 0.2109793514613357
Loss at epoch 1600: 0.20875402727035142
Loss at epoch 1700: 0.20669277960987004
Loss at epoch 1800: 0.204772498076304
Loss at epoch 1900: 0.20297441668085017
Loss at epoch 2000: 0.20128310294639806
Loss at epoch 2100: 0.1996857190184766
Loss at epoch 2200: 0.19817147426348075
Loss at epoch 2300: 0.19673121559626683
Loss at epoch 2400: 0.19535711875733988
Loss at epoch 2500: 0.1940

make predictions to get the overall (exact-match) accuracy

In [26]:
# Round each predicted probability to a hard 0/1 label.
y_pred = np.round(model.predict(x_test))
# A sample counts as correct only when every one of its labels matches.
accuracy = np.all(y_pred == y_test, axis=1).mean()
print(f'Accuracy: {accuracy}')

Accuracy: 0.6436233611442194
