In [None]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Number of bytes requested per read while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# The URL carries signed query parameters (X-Goog-Date, X-Goog-Expires), so it
# presumably stops working after that window; re-export the notebook if it does.
DATA_SOURCE_MAPPING = 'multilabel-classification-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1123189%2F1885658%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240318%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240318T190145Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D79876489112b5722ff5effcc59fe776eac8a30835a09cf0fb38b8e23c10217da4ffb55446e081183dac20e176eec416c7ec4ca8187bd3820ee832f06bb67e75e4076759ae7144c57f051a4a413b1a110eb23d5efcfc05455f3ce5b45e1f080d9edaf9c94188089b8f9135fea0eba415d087b6b227550f0b9f83c239431558b26ccb8e2cbe7c69b930adda806da35de2ee93a446bda377163c5f164724a6185d64e796d86e74b37773916aeee290427054886de10841dca0df3497bb9064f132c18e7d13c8506b39e4871643a3a86597acfaf6493e31cd513e013aae88f713e8fe84a1147585cafde076282ab1618e9555fbf55ee88b95069e5501c04b8e4a1c8'

# Directory that downloaded datasets are extracted into (one sub-directory each).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created next to the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this file.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<encoded URL>" entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it into KAGGLE_INPUT_PATH/<directory>. A failed
# download is reported and skipped so the remaining entries are still tried.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Only the first colon separates the directory name from the URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing (or zero); in that case
            # the byte counter is still shown but the bar stays empty instead
            # of crashing on int(None) or a division by zero.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            # read() returns b'' once the response body is exhausted.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, 50 * dl // total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # tarfile.open() below reopens the file by path, so everything
            # still sitting in the write buffer must reach the disk first.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bound to a separate name so the tarfile module is not rebound.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')

# Load the train and test CSV files from the dataset's input directory.
train_df = pd.read_csv("/kaggle/input/multilabel-classification-dataset/train.csv")
test_df = pd.read_csv("/kaggle/input/multilabel-classification-dataset/test.csv")


def multi_hot(labels, num_labels):
    """Return a float vector of length ``num_labels`` with ones at ``labels``.

    ``labels`` is an iterable of integer positions; every listed position is
    set to 1 and all other positions stay 0. Repeated positions are harmless.
    """
    encoded = np.zeros(num_labels)
    # Set all requested positions in one indexed assignment.
    encoded[list(labels)] = 1
    return encoded

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    Accepts a scalar or an array-like and returns a result of the same shape.
    The computation only ever exponentiates ``-|x|``, so it does not overflow
    for inputs of large magnitude the way a direct ``np.exp(-x)`` does.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1]; both branches below are therefore finite.
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # arrays of higher rank unchanged.
    return result[()]


# Inputs: the abstract text and one column per topic label.
label_columns = [
    'Computer Science', 'Physics', 'Mathematics',
    'Statistics', 'Quantitative Biology', 'Quantitative Finance',
]
abstracts = train_df['ABSTRACT'].values
labels = train_df[label_columns].values

# TF-IDF features limited to 1000 terms, converted to a dense array.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(abstracts).toarray()

# Each row of `labels` already holds one value per label column, so it is used
# directly as the target vector. Feeding a row to multi_hot() would interpret
# its entries as positions to switch on, which for 0/1 flags can only ever set
# positions 0 and 1 regardless of which topics apply.
# NOTE(review): assumes the label columns contain 0/1 indicators - confirm
# against train.csv.
num_labels = labels.shape[1]
Y = labels.astype(float)

# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Model parameters: one weight column and one bias per label.
num_features = X.shape[1]
W = np.random.randn(num_features, num_labels)
b = np.zeros(num_labels)

# Hyperparameters
learning_rate = 0.00003
epochs = 10

# Predictions are kept this far away from exactly 0 and 1 when computing the
# loss: log(0) is -inf and 0 * -inf is nan, which would poison the epoch total.
loss_eps = 1e-12

train_losses = []
train_accuracies = []

# Training loop: per-sample gradient descent on a sigmoid output per label,
# with binary cross-entropy summed over the labels as the loss.
for epoch in range(epochs):
    train_loss = 0
    train_correct = 0
    for x, y_true in zip(X_train, Y_train):
        # Forward propagation
        z = np.dot(x, W) + b
        y_pred = sigmoid(z)

        # Loss calculation (clipped copy; the gradient uses the raw output)
        y_clipped = np.clip(y_pred, loss_eps, 1 - loss_eps)
        loss = -np.sum(y_true * np.log(y_clipped) + (1 - y_true) * np.log(1 - y_clipped))
        train_loss += loss

        # Accuracy calculation: count labels whose rounded prediction matches
        y_pred_binary = np.around(y_pred)
        train_correct += np.sum(y_pred_binary == y_true)

        # Backward propagation: for sigmoid + cross-entropy the gradient of
        # the loss with respect to z is (prediction - target).
        dz = y_pred - y_true
        dW = np.outer(x, dz)
        db = dz

        # Parameter update
        W -= learning_rate * dW
        b -= learning_rate * db

    # Calculate average loss and accuracy for training set
    train_loss /= len(X_train)
    train_accuracy = train_correct / (len(X_train) * num_labels)
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)

    # Report progress after every epoch
    print(f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

# Nothing to summarise when no epoch was run.
if train_losses:
    print(f"Final Train Loss: {train_losses[-1]:.4f}, Final Train Accuracy: {train_accuracies[-1]:.4f}")

Downloading multilabel-classification-dataset, 12001191 bytes compressed
Downloaded and uncompressed: multilabel-classification-dataset
Data source import complete.
Epoch 0, Train Loss: 4.4773, Train Accuracy: 0.5451
Epoch 1, Train Loss: 3.7528, Train Accuracy: 0.6481
Epoch 2, Train Loss: 3.1833, Train Accuracy: 0.7352
Epoch 3, Train Loss: 2.7331, Train Accuracy: 0.8032
Epoch 4, Train Loss: 2.3739, Train Accuracy: 0.8552
Epoch 5, Train Loss: 2.0842, Train Accuracy: 0.8931
Epoch 6, Train Loss: 1.8479, Train Accuracy: 0.9213
Epoch 7, Train Loss: 1.6530, Train Accuracy: 0.9414
Epoch 8, Train Loss: 1.4904, Train Accuracy: 0.9558
Epoch 9, Train Loss: 1.3534, Train Accuracy: 0.9668
Final Train Loss: 1.3534, Final Train Accuracy: 0.9668
