In [None]:
!pip install scikit-multilearn
!pip install scikit-learn-extra



In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
# The active dataset loader further down reads the relative path
# 'data/train.csv'; the mount is only needed for the commented-out
# '/content/drive/MyDrive/train.csv' alternative.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, hamming_loss,coverage_error, confusion_matrix
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import coverage_error
from sklearn.neighbors import NearestNeighbors
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import random

# Load the dataset
#train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
train_data = pd.read_csv('data/train.csv')

# Concatenate title and abstract into a single text field. Missing values are
# replaced by empty strings first: `str + NaN` yields NaN, and a NaN entry in
# TEXT would make the later `text.split()` tokenisation fail.
train_data['TEXT'] = (
    train_data['TITLE'].fillna('') + ' ' + train_data['ABSTRACT'].fillna('')
)

X = train_data['TEXT']
# One target column per subject area (multi-label target matrix).
label_columns = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]
y = train_data[label_columns].values

# Split the dataset (80% train / 20% validation, fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Alternative: train a Word2Vec model on the training text instead of using
# the pre-trained vectors.
# sentences = [text.split() for text in X_train]
# word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Load Google's pre-trained Word2Vec vectors. This returns a KeyedVectors
# object rather than a full Word2Vec model; the file name indicates
# 300-dimensional vectors.
word2vec_path = 'GoogleNews-vectors-negative300.bin'  # Provide the path to the downloaded Word2Vec binary file
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
# Vectorize your text data using Word2Vec embeddings
def average_word_vectors(words, model, num_features):
    """Return the mean embedding of the in-vocabulary tokens in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.
    model : object
        Either a full model exposing its vectors as ``model.wv`` or a
        KeyedVectors-like object that itself supports ``token in model`` and
        ``model[token]``.
    num_features : int
        Expected embedding dimensionality; also the length of the zero vector
        returned when no token is found in the vocabulary.

    Returns
    -------
    numpy.ndarray
        float64 vector of shape ``(num_features,)``.

    Raises
    ------
    ValueError
        If the embeddings do not have ``num_features`` dimensions.
    """
    # A loaded KeyedVectors object has no `.wv` attribute in recent gensim
    # versions, so fall back to the object itself when `.wv` is absent.
    keyed_vectors = getattr(model, "wv", model)

    vectors = [keyed_vectors[word] for word in words if word in keyed_vectors]
    if not vectors:
        # No known token: keep the all-zero vector.
        return np.zeros((num_features,), dtype="float64")

    # Accumulate in float64 regardless of the precision of the stored vectors.
    mean_vector = np.mean(vectors, axis=0, dtype="float64")
    if mean_vector.shape != (num_features,):
        raise ValueError(
            "num_features={} does not match embedding shape {}".format(
                num_features, mean_vector.shape
            )
        )
    return mean_vector

def get_average_vectors(data, model, num_features):
    """Embed every tokenised document in ``data`` and stack the results.

    Each element of ``data`` is passed to ``average_word_vectors`` together
    with ``model`` and ``num_features``; the per-document vectors are returned
    as a single NumPy array in the same order as ``data``.
    """
    document_vectors = []
    for tokens in data:
        document_vectors.append(average_word_vectors(tokens, model, num_features))
    return np.array(document_vectors)

# Vectorize train and validation data
# Read the embedding size from the loaded model instead of hard-coding it:
# the value 100 matched the commented-out self-trained model, not the
# pre-trained vectors that are actually loaded.
embedding_dim = word2vec_model.vector_size
X_train_word2vec = get_average_vectors([text.split() for text in X_train], word2vec_model, embedding_dim)
X_val_word2vec = get_average_vectors([text.split() for text in X_val], word2vec_model, embedding_dim)

In [None]:

# Helper function to identify minority labels
def get_tail_labels(y):
    """Return the column indices of ``y`` whose column sum is below half the row count.

    ``y`` is a 2-D NumPy label matrix (rows = samples, columns = labels). The
    result is a plain list of integer column indices in ascending order.
    """
    half_of_samples = y.shape[0] / 2
    positives_per_label = np.sum(y, axis=0)
    return np.flatnonzero(positives_per_label < half_of_samples).tolist()

# Dynamic MLSMOTE function
def dynamic_MLSMOTE(X, y, target_balance=4500, random_state=None):
    """Oversample minority labels by interpolating towards nearest neighbours.

    For every label returned by ``get_tail_labels(y)``, synthetic feature
    vectors are generated until that label's count (taken from the original
    ``y``) would reach ``target_balance``. Each synthetic sample lies on the
    segment between a randomly chosen sample carrying the label and one of its
    nearest neighbours in ``X``, and inherits the full label row of the chosen
    sample.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Label matrix with one column per label; a label is considered present
        where the entry equals 1.
    target_balance : int, default 4500
        Desired number of samples per minority label.
    random_state : int or None, default None
        ``None`` keeps the previous behaviour of drawing from the global
        ``random`` and ``numpy.random`` generators (so global seeding still
        works). Any other value seeds private generators, making the result
        repeatable independently of global state.

    Returns
    -------
    (X_balanced, y_balanced)
        The inputs with the synthetic rows appended, or ``X, y`` unchanged
        when nothing was generated.

    Notes
    -----
    Counts are not updated while generating: because a synthetic sample copies
    every label of its source row, other labels can end up above
    ``target_balance``.
    """
    n_neighbors = min(5, len(X) - 1)
    # The first neighbour column is discarded below, so at least two
    # neighbours are needed to have a candidate to interpolate towards. With
    # fewer, the original code failed (invalid n_neighbors or an empty choice).
    if n_neighbors < 2:
        return X, y

    if random_state is None:
        py_rng, np_rng = random, np.random
    else:
        py_rng = random.Random(random_state)
        np_rng = np.random.RandomState(random_state)

    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(X)
    synthetic_samples = []
    synthetic_labels = []

    for label in get_tail_labels(y):
        current_count = np.sum(y[:, label])
        # int() keeps range() working when the label matrix is float-typed.
        n_samples = max(int(target_balance - current_count), 0)
        target_indices = np.where(y[:, label] == 1)[0]

        # Skip the neighbour search entirely when nothing has to be generated.
        if n_samples == 0 or len(target_indices) < n_neighbors:
            continue

        # Neighbour indices refer to rows of the full X, not of the subset.
        nn = neigh.kneighbors(X[target_indices], return_distance=False)
        for _ in range(n_samples):
            sample_index = py_rng.choice(range(len(target_indices)))
            source_row = target_indices[sample_index]
            # Column 0 is skipped; it is presumably the query point itself,
            # since every query row is also part of the fitted data.
            chosen_nn = py_rng.choice(nn[sample_index, 1:])
            step = np_rng.rand()
            synthetic_samples.append(X[source_row] + step * (X[chosen_nn] - X[source_row]))
            synthetic_labels.append(y[source_row].copy())

    if not synthetic_samples:
        return X, y

    X_balanced = np.vstack((X, np.vstack(synthetic_samples)))
    y_balanced = np.vstack((y, np.vstack(synthetic_labels)))
    return X_balanced, y_balanced

In [None]:
# Seed the two global generators that dynamic_MLSMOTE draws from (`random`
# and `np.random`) so the oversampled training set is repeatable.
random.seed(42)
np.random.seed(42)

y_train_np = y_train
target_balance = 4500
X_balanced_word2vec, y_balanced = dynamic_MLSMOTE(X_train_word2vec, y_train_np, target_balance=target_balance)

# Fixed seed so the reported metrics can be reproduced on a re-run.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_balanced_word2vec, y_balanced)

y_pred_val = rf_classifier.predict(X_val_word2vec)

accuracy = accuracy_score(y_val, y_pred_val)
# zero_division=0 yields the same score as the default for labels without
# predicted/true samples, but without emitting an undefined-metric warning.
f1 = f1_score(y_val, y_pred_val, average='weighted', zero_division=0)
precision = precision_score(y_val, y_pred_val, average='weighted', zero_division=0)
recall = recall_score(y_val, y_pred_val, average='weighted', zero_division=0)
hamming = hamming_loss(y_val, y_pred_val)
coverage = coverage_error(y_val, y_pred_val)

# Pool every (sample, label) decision into one binary confusion matrix.
# labels=[0, 1] guarantees a 2x2 matrix even if one class never occurs, so the
# four-way unpacking cannot fail.
tn, fp, fn, tp = confusion_matrix(y_val.ravel(), y_pred_val.ravel(), labels=[0, 1]).ravel()
# Geometric mean of sensitivity and specificity.
g_mean = np.sqrt((tp / (tp + fn)) * (tn / (tn + fp)))

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("F1 Score", f1),
    ("Precision", precision),
    ("Recall", recall),
    ("Hamming Loss", hamming),
    ("Coverage", coverage),
    ("G-Mean", g_mean),
):
    print("{}: {:.4f}".format(metric_name, metric_value))

Accuracy: 0.5128
F1 Score: 0.6923
Precision: 0.8267
Recall: 0.6011
Hamming Loss: 0.1083
Coverage: 3.2405
G-Mean: 0.7633


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
# Binary Relevance: wraps the base classifier passed to it (a random forest).
# The base estimator is seeded so the reported metrics are reproducible.
classifier_br = BinaryRelevance(RandomForestClassifier(random_state=42))
classifier_br.fit(X_balanced_word2vec, y_balanced)
y_pred_val_br = classifier_br.predict(X_val_word2vec)

# The prediction supports `.toarray()`; convert it once to a dense array and
# use that for every metric.
y_pred_val_br_dense = y_pred_val_br.toarray()

accuracy_br = accuracy_score(y_val, y_pred_val_br_dense)
# zero_division=0 keeps the score of undefined labels at 0 without a warning.
f1_br = f1_score(y_val, y_pred_val_br_dense, average='weighted', zero_division=0)
precision_br = precision_score(y_val, y_pred_val_br_dense, average='weighted', zero_division=0)
recall_br = recall_score(y_val, y_pred_val_br_dense, average='weighted', zero_division=0)
hamming_br = hamming_loss(y_val, y_pred_val_br_dense)
coverage_br = coverage_error(y_val, y_pred_val_br_dense)

print("Binary Relevance:")
for metric_name, metric_value in (
    ("Accuracy", accuracy_br),
    ("F1 Score", f1_br),
    ("Precision", precision_br),
    ("Recall", recall_br),
    ("Hamming Loss", hamming_br),
    ("Coverage", coverage_br),
):
    print("{}: {:.4f}".format(metric_name, metric_value))


Binary Relevance:
Accuracy: 0.5402
F1 Score: 0.7200
Precision: 0.8017
Recall: 0.6603
Hamming Loss: 0.1033
Coverage: 3.0217


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Classifier Chains: wraps the base classifier passed to it (a random forest).
# The base estimator is seeded so the reported metrics are reproducible.
classifier_cc = ClassifierChain(RandomForestClassifier(random_state=42))
classifier_cc.fit(X_balanced_word2vec, y_balanced)
y_pred_val_cc = classifier_cc.predict(X_val_word2vec)

# The prediction supports `.toarray()`; convert it once to a dense array and
# use that for every metric.
y_pred_val_cc_dense = y_pred_val_cc.toarray()

accuracy_cc = accuracy_score(y_val, y_pred_val_cc_dense)
# zero_division=0 keeps the score of undefined labels at 0 without a warning.
f1_cc = f1_score(y_val, y_pred_val_cc_dense, average='weighted', zero_division=0)
precision_cc = precision_score(y_val, y_pred_val_cc_dense, average='weighted', zero_division=0)
recall_cc = recall_score(y_val, y_pred_val_cc_dense, average='weighted', zero_division=0)
hamming_cc = hamming_loss(y_val, y_pred_val_cc_dense)
coverage_cc = coverage_error(y_val, y_pred_val_cc_dense)

print("Classifier Chains:")
for metric_name, metric_value in (
    ("Accuracy", accuracy_cc),
    ("F1 Score", f1_cc),
    ("Precision", precision_cc),
    ("Recall", recall_cc),
    ("Hamming Loss", hamming_cc),
    ("Coverage", coverage_cc),
):
    print("{}: {:.4f}".format(metric_name, metric_value))


Classifier Chains:
Accuracy: 0.5650
F1 Score: 0.7327
Precision: 0.8087
Recall: 0.6733
Hamming Loss: 0.1067
Coverage: 2.9361


In [None]:
# Label Powerset: wraps the base classifier passed to it (a random forest).
# The base estimator is seeded so the reported metrics are reproducible.
classifier_lp = LabelPowerset(RandomForestClassifier(random_state=42))
classifier_lp.fit(X_balanced_word2vec, y_balanced)
y_pred_val_lp = classifier_lp.predict(X_val_word2vec)

# The prediction supports `.toarray()`; convert it once to a dense array and
# use that for every metric.
y_pred_val_lp_dense = y_pred_val_lp.toarray()

accuracy_lp = accuracy_score(y_val, y_pred_val_lp_dense)
# zero_division=0 keeps the score of undefined labels at 0 without a warning.
f1_lp = f1_score(y_val, y_pred_val_lp_dense, average='weighted', zero_division=0)
precision_lp = precision_score(y_val, y_pred_val_lp_dense, average='weighted', zero_division=0)
recall_lp = recall_score(y_val, y_pred_val_lp_dense, average='weighted', zero_division=0)
hamming_lp = hamming_loss(y_val, y_pred_val_lp_dense)
coverage_lp = coverage_error(y_val, y_pred_val_lp_dense)

print("Label Powerset:")
for metric_name, metric_value in (
    ("Accuracy", accuracy_lp),
    ("F1 Score", f1_lp),
    ("Precision", precision_lp),
    ("Recall", recall_lp),
    ("Hamming Loss", hamming_lp),
    ("Coverage", coverage_lp),
):
    print("{}: {:.4f}".format(metric_name, metric_value))

Label Powerset:
Accuracy: 0.6086
F1 Score: 0.7291
Precision: 0.7843
Recall: 0.6927
Hamming Loss: 0.1083
Coverage: 2.8184
