In [None]:
!pip install scikit-multilearn
!pip install scikit-learn-extra

In [None]:
# Colab-only: mount Google Drive at /content/drive; the CSV read in the
# following cells is addressed under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, hamming_loss,coverage_error, confusion_matrix
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import coverage_error
from sklearn.neighbors import NearestNeighbors
from gensim.models import Word2Vec
import random

In [None]:
# NOTE(review): the next cell reads this same CSV again, so this load looks redundant.
train_data = pd.read_csv('/content/drive/MyDrive/practicum/data.csv')

In [None]:
# Load the dataset
train_data = pd.read_csv('/content/drive/MyDrive/practicum/data.csv')
# Treat a missing title/abstract as empty text: str + NaN yields NaN, and a NaN
# entry would make the per-document text.split() calls below fail.
train_data['TEXT'] = train_data['TITLE'].fillna('') + ' ' + train_data['ABSTRACT'].fillna('')

X = train_data['TEXT']
# One column per subject area; together they form the multi-label target matrix.
y = train_data[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

# Split the dataset (fixed random_state so the hold-out rows are the same on every run)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Word2Vec model on whitespace-tokenised training text only, so the
# validation documents do not influence the embeddings.
sentences = [text.split() for text in X_train]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [None]:
# Vectorize your text data using Word2Vec embeddings
def average_word_vectors(words, model, num_features):
    """Return the mean embedding of the tokens in ``words`` found in ``model.wv``.

    Tokens that are not in ``model.wv`` are ignored. When none of the tokens
    is known, the all-zero float64 vector of length ``num_features`` is returned.
    """
    known_vectors = [model.wv[token] for token in words if token in model.wv]
    total = np.zeros((num_features,), dtype="float64")
    # Accumulate in float64, one vector at a time, in document order.
    for vector in known_vectors:
        total = np.add(total, vector)
    if known_vectors:
        total = np.divide(total, len(known_vectors))
    return total

def get_average_vectors(data, model, num_features):
    """Stack one averaged embedding per tokenised document.

    Args:
        data: Iterable of token sequences, one per document.
        model: Embedding model forwarded to ``average_word_vectors``.
        num_features: Length of each embedding vector.

    Returns:
        Array with one row per document. An empty ``data`` yields shape
        ``(0, num_features)`` so downstream code always receives a 2-D array.
    """
    vectors = [average_word_vectors(words, model, num_features) for words in data]
    if not vectors:
        # np.array([]) would be 1-D with shape (0,); keep the column dimension.
        return np.zeros((0, num_features), dtype="float64")
    return np.array(vectors)

# Vectorize train and validation data. The embedding width is read from the
# trained model so it cannot drift from the vector_size used at training time.
embedding_dim = word2vec_model.wv.vector_size
X_train_word2vec = get_average_vectors([text.split() for text in X_train], word2vec_model, embedding_dim)
X_val_word2vec = get_average_vectors([text.split() for text in X_val], word2vec_model, embedding_dim)

In [None]:
# Helper function to identify minority labels
def get_tail_labels(y):
    """Return the column indices of labels whose sum is below half the row count.

    ``y`` is a 2-D label matrix; column ``i`` is reported as a tail (minority)
    label when ``sum(y[:, i])`` is strictly less than ``y.shape[0] / 2``.
    """
    half_rows = y.shape[0] / 2
    column_totals = (np.sum(y[:, col]) for col in range(y.shape[1]))
    return [col for col, total in enumerate(column_totals) if total < half_rows]

# Dynamic MLSMOTE function
def dynamic_MLSMOTE(X, y, target_balance=4500, random_state=None):
    """Oversample tail labels with SMOTE-style interpolation.

    For every label returned by ``get_tail_labels(y)`` whose positive count
    (measured on the original ``y``) is below ``target_balance``, synthetic
    rows are generated to make up the difference. Each synthetic row lies on
    the segment between a randomly chosen positive sample of that label and
    one of its nearest neighbours among the positives of the same label, and
    it inherits the seed sample's complete label row. Because the whole label
    row is copied, counts of co-occurring labels grow as well.

    Args:
        X: 2-D feature array supporting integer-array row indexing.
        y: 2-D label array; row ``r`` is positive for label ``i`` when
            ``y[r, i] == 1``.
        target_balance: Desired minimum number of positives per tail label.
        random_state: Optional integer seed for a private generator. ``None``
            (default) draws from the global ``random`` / ``np.random`` state.

    Returns:
        ``(X_balanced, y_balanced)`` with the synthetic rows appended, or the
        inputs ``(X, y)`` unchanged when no synthetic row was generated.
    """
    n_neighbors = min(5, len(X) - 1)
    if n_neighbors < 2:
        # Each query returns the seed itself first, so at least one further
        # neighbour is needed to interpolate towards.
        return X, y

    if random_state is None:
        draw_index = random.randrange   # global state
        draw_step = np.random.rand
    else:
        rng = np.random.RandomState(random_state)
        draw_index = rng.randint        # randint(n) -> integer in [0, n)
        draw_step = rng.rand

    synthetic_samples = []
    synthetic_labels = []

    for i in get_tail_labels(y):
        current_count = np.sum(y[:, i])
        # int() keeps range() valid even when y has a float dtype.
        n_samples = max(int(target_balance - current_count), 0)
        target_indices = np.where(y[:, i] == 1)[0]

        # Skip labels already at the target (avoids a pointless neighbour
        # search) and labels with too few positives to supply n_neighbors.
        if n_samples == 0 or len(target_indices) < n_neighbors:
            continue

        # Search neighbours only among samples carrying this label, so a
        # synthetic row is never pulled towards a sample that lacks it.
        bag = X[target_indices]
        neigh = NearestNeighbors(n_neighbors=n_neighbors).fit(bag)
        nn = neigh.kneighbors(bag, return_distance=False)

        for _ in range(n_samples):
            seed_pos = draw_index(len(target_indices))
            candidates = nn[seed_pos, 1:]  # column 0 is the seed itself
            neighbour_pos = candidates[draw_index(len(candidates))]
            step = draw_step()
            seed = bag[seed_pos]
            synthetic_samples.append(seed + step * (bag[neighbour_pos] - seed))
            synthetic_labels.append(y[target_indices[seed_pos]].copy())

    if len(synthetic_samples) > 0:
        X_synthetic = np.vstack(synthetic_samples)
        y_synthetic = np.vstack(synthetic_labels)
        X_balanced = np.vstack((X, X_synthetic))
        y_balanced = np.vstack((y, y_synthetic))
        return X_balanced, y_balanced
    else:
        return X, y

In [None]:
# Fix the global RNG state so the oversampling draws are repeatable across runs.
random.seed(42)
np.random.seed(42)

y_train_np = y_train
target_balance = 4500
X_balanced_word2vec, y_balanced = dynamic_MLSMOTE(X_train_word2vec, y_train_np, target_balance=target_balance)

# Seeded forest: without random_state the reported metrics change on every run.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_balanced_word2vec, y_balanced)

y_pred_val = rf_classifier.predict(X_val_word2vec)

accuracy = accuracy_score(y_val, y_pred_val)
f1 = f1_score(y_val, y_pred_val, average='weighted')
precision = precision_score(y_val, y_pred_val, average='weighted')
recall = recall_score(y_val, y_pred_val, average='weighted')
hamming = hamming_loss(y_val, y_pred_val)
# NOTE(review): coverage_error ranks labels by score; the hard predictions
# passed here presumably produce heavily tied rankings — confirm this is intended.
coverage = coverage_error(y_val, y_pred_val)
# Flatten both matrices so every (sample, label) cell counts as one binary decision.
tn, fp, fn, tp = confusion_matrix(y_val.ravel(), y_pred_val.ravel()).ravel()
# G-mean = sqrt(true-positive rate * true-negative rate)
g_mean = np.sqrt((tp / (tp + fn)) * (tn / (tn + fp)))

print("Accuracy: {:.4f}".format(accuracy))
print("F1 Score: {:.4f}".format(f1))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("Hamming Loss: {:.4f}".format(hamming))
print("Coverage: {:.4f}".format(coverage))
print("G-Mean: {:.4f}".format(g_mean))

In [None]:
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
# Binary Relevance wrapper around a random forest, fitted on the oversampled data.
classifier_br = BinaryRelevance(RandomForestClassifier())
classifier_br.fit(X_balanced_word2vec, y_balanced)
y_pred_val_br = classifier_br.predict(X_val_word2vec)

# Validation metrics; precision / recall / F1 use average='weighted'.
accuracy_br = accuracy_score(y_val, y_pred_val_br)
f1_br, precision_br, recall_br = (
    metric(y_val, y_pred_val_br, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
hamming_br = hamming_loss(y_val, y_pred_val_br)
# The predictions are converted with .toarray() before coverage_error.
coverage_br = coverage_error(y_val, y_pred_val_br.toarray())

print("Binary Relevance:")
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_br),
    ("F1 Score: {:.4f}", f1_br),
    ("Precision: {:.4f}", precision_br),
    ("Recall: {:.4f}", recall_br),
    ("Hamming Loss: {:.4f}", hamming_br),
    ("Coverage: {:.4f}", coverage_br),
):
    print(fmt.format(val))


In [None]:
# Classifier Chains wrapper around a random forest, fitted on the oversampled data.
classifier_cc = ClassifierChain(RandomForestClassifier())
classifier_cc.fit(X_balanced_word2vec, y_balanced)
y_pred_val_cc = classifier_cc.predict(X_val_word2vec)

# Validation metrics; precision / recall / F1 use average='weighted'.
accuracy_cc = accuracy_score(y_val, y_pred_val_cc)
f1_cc, precision_cc, recall_cc = (
    metric(y_val, y_pred_val_cc, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
hamming_cc = hamming_loss(y_val, y_pred_val_cc)
# The predictions are converted with .toarray() before coverage_error.
coverage_cc = coverage_error(y_val, y_pred_val_cc.toarray())

print("Classifier Chains:")
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_cc),
    ("F1 Score: {:.4f}", f1_cc),
    ("Precision: {:.4f}", precision_cc),
    ("Recall: {:.4f}", recall_cc),
    ("Hamming Loss: {:.4f}", hamming_cc),
    ("Coverage: {:.4f}", coverage_cc),
):
    print(fmt.format(val))


In [None]:
# Label Powerset wrapper around a random forest, fitted on the oversampled data.
classifier_lp = LabelPowerset(RandomForestClassifier())
classifier_lp.fit(X_balanced_word2vec, y_balanced)
y_pred_val_lp = classifier_lp.predict(X_val_word2vec)

# Validation metrics; precision / recall / F1 use average='weighted'.
accuracy_lp = accuracy_score(y_val, y_pred_val_lp)
f1_lp, precision_lp, recall_lp = (
    metric(y_val, y_pred_val_lp, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
hamming_lp = hamming_loss(y_val, y_pred_val_lp)
# The predictions are converted with .toarray() before coverage_error.
coverage_lp = coverage_error(y_val, y_pred_val_lp.toarray())

print("Label Powerset:")
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_lp),
    ("F1 Score: {:.4f}", f1_lp),
    ("Precision: {:.4f}", precision_lp),
    ("Recall: {:.4f}", recall_lp),
    ("Hamming Loss: {:.4f}", hamming_lp),
    ("Coverage: {:.4f}", coverage_lp),
):
    print(fmt.format(val))

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
# NOTE(review): this rebinds the name ClassifierChain, replacing the
# skmultilearn class imported in the Binary Relevance cell above. Cells that
# call ClassifierChain(...) after this point get the sklearn.multioutput
# implementation until the skmultilearn one is imported again.
from sklearn.multioutput import ClassifierChain
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [None]:
# Label Powerset wrapper around a class-weighted SVC, fitted on the oversampled data.
classifier_lp = LabelPowerset(SVC(class_weight='balanced'))
classifier_lp.fit(X_balanced_word2vec, y_balanced)
y_pred_val_lp = classifier_lp.predict(X_val_word2vec)

# Validation metrics; precision / recall / F1 use average='weighted'.
accuracy_lp = accuracy_score(y_val, y_pred_val_lp)
f1_lp, precision_lp, recall_lp = (
    metric(y_val, y_pred_val_lp, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
hamming_lp = hamming_loss(y_val, y_pred_val_lp)
# The predictions are converted with .toarray() before coverage_error.
coverage_lp = coverage_error(y_val, y_pred_val_lp.toarray())

print("Label Powerset:")
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_lp),
    ("F1 Score: {:.4f}", f1_lp),
    ("Precision: {:.4f}", precision_lp),
    ("Recall: {:.4f}", recall_lp),
    ("Hamming Loss: {:.4f}", hamming_lp),
    ("Coverage: {:.4f}", coverage_lp),
):
    print(fmt.format(val))

In [None]:
# Binary Relevance wrapper around a class-weighted SVC, fitted on the oversampled data.
classifier_br = BinaryRelevance(SVC(class_weight='balanced'))
classifier_br.fit(X_balanced_word2vec, y_balanced)
y_pred_val_br = classifier_br.predict(X_val_word2vec)

# Validation metrics; precision / recall / F1 use average='weighted'.
accuracy_br = accuracy_score(y_val, y_pred_val_br)
f1_br, precision_br, recall_br = (
    metric(y_val, y_pred_val_br, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
hamming_br = hamming_loss(y_val, y_pred_val_br)
# The predictions are converted with .toarray() before coverage_error.
coverage_br = coverage_error(y_val, y_pred_val_br.toarray())

print("Binary Relevance:")
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_br),
    ("F1 Score: {:.4f}", f1_br),
    ("Precision: {:.4f}", precision_br),
    ("Recall: {:.4f}", recall_br),
    ("Hamming Loss: {:.4f}", hamming_br),
    ("Coverage: {:.4f}", coverage_br),
):
    print(fmt.format(val))


In [None]:
# Classifier Chains SVM
# NOTE(review): an earlier cell imports ClassifierChain from sklearn.multioutput
# after the skmultilearn import, so presumably this is the scikit-learn chain — confirm.
classifier_cc = ClassifierChain(SVC(class_weight='balanced'))
classifier_cc.fit(X_balanced_word2vec, y_balanced)
y_pred_val_cc = classifier_cc.predict(X_val_word2vec)

# Validation metrics; precision / recall / F1 use average='weighted'.
accuracy_cc = accuracy_score(y_val, y_pred_val_cc)
f1_cc, precision_cc, recall_cc = (
    metric(y_val, y_pred_val_cc, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
hamming_cc = hamming_loss(y_val, y_pred_val_cc)

# Coverage is not computed in this cell; the following cell computes and prints it.
print("Classifier Chains:")
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_cc),
    ("F1 Score: {:.4f}", f1_cc),
    ("Precision: {:.4f}", precision_cc),
    ("Recall: {:.4f}", recall_cc),
    ("Hamming Loss: {:.4f}", hamming_cc),
):
    print(fmt.format(val))

In [None]:
print("Classifier Chains:")
# Here the prediction array goes to coverage_error as-is (no .toarray() call).
coverage_cc = coverage_error(y_val, y_pred_val_cc)
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_cc),
    ("F1 Score: {:.4f}", f1_cc),
    ("Precision: {:.4f}", precision_cc),
    ("Recall: {:.4f}", recall_cc),
    ("Hamming Loss: {:.4f}", hamming_cc),
    ("Coverage: {:.4f}", coverage_cc),
):
    print(fmt.format(val))

In [None]:

# from skmultilearn.adapt import MLkNN
# from sklearn.model_selection import GridSearchCV

# # parameters = {'k': range(1,3), 's': [0.5, 0.7, 1.0]}

# clf = MLkNN(k=10)
# clf.fit(X_balanced_word2vec, y_balanced)

# print (clf.best_params_, clf.best_score_)

# Pre-trained Word2Vec

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, hamming_loss,coverage_error, confusion_matrix
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import coverage_error
from sklearn.neighbors import NearestNeighbors
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import random

import os

# Load the dataset
#train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
train_data = pd.read_csv('data/train.csv')
# Treat a missing title/abstract as empty text: str + NaN yields NaN, and a NaN
# entry would make the per-document text.split() calls in later cells fail.
train_data['TEXT'] = train_data['TITLE'].fillna('') + ' ' + train_data['ABSTRACT'].fillna('')

X = train_data['TEXT']
# One column per subject area; together they form the multi-label target matrix.
y = train_data[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

# Split the dataset (fixed random_state so the hold-out rows are the same on every run)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# # Train a Word2Vec model
# sentences = [text.split() for text in X_train]
# word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Load Google's pre-trained Word2Vec vectors. Set the WORD2VEC_PATH environment
# variable to point at the downloaded binary on another machine; the fallback is
# the original machine-specific location.
word2vec_path = os.environ.get(
    'WORD2VEC_PATH',
    '/Users/deepika/Documents/Masters/Courses/Practicum/GoogleNews-vectors-negative300.bin',
)
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)


In [None]:
def get_average_vectors(data, model, num_features):
    """Stack one averaged embedding per tokenised document.

    Args:
        data: Iterable of token sequences, one per document.
        model: Embedding lookup forwarded to ``average_word_vectors``.
        num_features: Length of each embedding vector.

    Returns:
        Array with one row per document. An empty ``data`` yields shape
        ``(0, num_features)`` so downstream code always receives a 2-D array.
    """
    vectors = [average_word_vectors(words, model, num_features) for words in data]
    if not vectors:
        # np.array([]) would be 1-D with shape (0,); keep the column dimension.
        return np.zeros((0, num_features), dtype="float64")
    return np.array(vectors)

def average_word_vectors(words, model, num_features):
    """Return the mean embedding of the tokens in ``words`` found in ``model``.

    ``model`` is used directly as the lookup (``token in model``,
    ``model[token]``). Unknown tokens are ignored; when none is known, the
    all-zero float64 vector of length ``num_features`` is returned.
    """
    known_vectors = [model[token] for token in words if token in model]
    total = np.zeros((num_features,), dtype="float64")
    # Accumulate in float64, one vector at a time, in document order.
    for vector in known_vectors:
        total = np.add(total, vector)
    if known_vectors:
        total = np.divide(total, len(known_vectors))
    return total

# Use the dimensionality reported by the loaded vectors instead of a hard-coded
# 300, so a different embedding file cannot cause a silent size mismatch.
embedding_dim = word2vec_model.vector_size
X_train_word2vec = get_average_vectors([text.split() for text in X_train], word2vec_model, embedding_dim)
X_val_word2vec = get_average_vectors([text.split() for text in X_val], word2vec_model, embedding_dim)


In [None]:

# Helper function to identify minority labels
def get_tail_labels(y):
    """Return the column indices of labels whose sum is below half the row count.

    ``y`` is a 2-D label matrix; column ``i`` is reported as a tail (minority)
    label when ``sum(y[:, i])`` is strictly less than ``y.shape[0] / 2``.
    """
    half_rows = y.shape[0] / 2
    column_totals = (np.sum(y[:, col]) for col in range(y.shape[1]))
    return [col for col, total in enumerate(column_totals) if total < half_rows]

# Dynamic MLSMOTE function
def dynamic_MLSMOTE(X, y, target_balance=4500, random_state=None):
    """Oversample tail labels with SMOTE-style interpolation.

    For every label returned by ``get_tail_labels(y)`` whose positive count
    (measured on the original ``y``) is below ``target_balance``, synthetic
    rows are generated to make up the difference. Each synthetic row lies on
    the segment between a randomly chosen positive sample of that label and
    one of its nearest neighbours among the positives of the same label, and
    it inherits the seed sample's complete label row. Because the whole label
    row is copied, counts of co-occurring labels grow as well.

    Args:
        X: 2-D feature array supporting integer-array row indexing.
        y: 2-D label array; row ``r`` is positive for label ``i`` when
            ``y[r, i] == 1``.
        target_balance: Desired minimum number of positives per tail label.
        random_state: Optional integer seed for a private generator. ``None``
            (default) draws from the global ``random`` / ``np.random`` state.

    Returns:
        ``(X_balanced, y_balanced)`` with the synthetic rows appended, or the
        inputs ``(X, y)`` unchanged when no synthetic row was generated.
    """
    n_neighbors = min(5, len(X) - 1)
    if n_neighbors < 2:
        # Each query returns the seed itself first, so at least one further
        # neighbour is needed to interpolate towards.
        return X, y

    if random_state is None:
        draw_index = random.randrange   # global state
        draw_step = np.random.rand
    else:
        rng = np.random.RandomState(random_state)
        draw_index = rng.randint        # randint(n) -> integer in [0, n)
        draw_step = rng.rand

    synthetic_samples = []
    synthetic_labels = []

    for i in get_tail_labels(y):
        current_count = np.sum(y[:, i])
        # int() keeps range() valid even when y has a float dtype.
        n_samples = max(int(target_balance - current_count), 0)
        target_indices = np.where(y[:, i] == 1)[0]

        # Skip labels already at the target (avoids a pointless neighbour
        # search) and labels with too few positives to supply n_neighbors.
        if n_samples == 0 or len(target_indices) < n_neighbors:
            continue

        # Search neighbours only among samples carrying this label, so a
        # synthetic row is never pulled towards a sample that lacks it.
        bag = X[target_indices]
        neigh = NearestNeighbors(n_neighbors=n_neighbors).fit(bag)
        nn = neigh.kneighbors(bag, return_distance=False)

        for _ in range(n_samples):
            seed_pos = draw_index(len(target_indices))
            candidates = nn[seed_pos, 1:]  # column 0 is the seed itself
            neighbour_pos = candidates[draw_index(len(candidates))]
            step = draw_step()
            seed = bag[seed_pos]
            synthetic_samples.append(seed + step * (bag[neighbour_pos] - seed))
            synthetic_labels.append(y[target_indices[seed_pos]].copy())

    if len(synthetic_samples) > 0:
        X_synthetic = np.vstack(synthetic_samples)
        y_synthetic = np.vstack(synthetic_labels)
        X_balanced = np.vstack((X, X_synthetic))
        y_balanced = np.vstack((y, y_synthetic))
        return X_balanced, y_balanced
    else:
        return X, y

In [None]:
# Fix the global RNG state so the oversampling draws are repeatable across runs.
random.seed(42)
np.random.seed(42)

y_train_np = y_train
target_balance = 4500
X_balanced_word2vec, y_balanced = dynamic_MLSMOTE(X_train_word2vec, y_train_np, target_balance=target_balance)

In [None]:
# Label Powerset wrapper around a class-weighted SVC (pre-trained embeddings).
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
classifier_lp = LabelPowerset(SVC(class_weight='balanced'))
classifier_lp.fit(X_balanced_word2vec, y_balanced)
y_pred_val_lp = classifier_lp.predict(X_val_word2vec)

# Validation metrics; precision / recall / F1 use average='weighted'.
accuracy_lp = accuracy_score(y_val, y_pred_val_lp)
f1_lp, precision_lp, recall_lp = (
    metric(y_val, y_pred_val_lp, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
hamming_lp = hamming_loss(y_val, y_pred_val_lp)
# The predictions are converted with .toarray() before coverage_error.
coverage_lp = coverage_error(y_val, y_pred_val_lp.toarray())

print("Label Powerset:")
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_lp),
    ("F1 Score: {:.4f}", f1_lp),
    ("Precision: {:.4f}", precision_lp),
    ("Recall: {:.4f}", recall_lp),
    ("Hamming Loss: {:.4f}", hamming_lp),
    ("Coverage: {:.4f}", coverage_lp),
):
    print(fmt.format(val))

In [None]:
# Binary Relevance wrapper around a class-weighted SVC (pre-trained embeddings).
classifier_br = BinaryRelevance(SVC(class_weight='balanced'))
classifier_br.fit(X_balanced_word2vec, y_balanced)
y_pred_val_br = classifier_br.predict(X_val_word2vec)

# Validation metrics; precision / recall / F1 use average='weighted'.
accuracy_br = accuracy_score(y_val, y_pred_val_br)
f1_br, precision_br, recall_br = (
    metric(y_val, y_pred_val_br, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
hamming_br = hamming_loss(y_val, y_pred_val_br)
# The predictions are converted with .toarray() before coverage_error.
coverage_br = coverage_error(y_val, y_pred_val_br.toarray())

print("Binary Relevance:")
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_br),
    ("F1 Score: {:.4f}", f1_br),
    ("Precision: {:.4f}", precision_br),
    ("Recall: {:.4f}", recall_br),
    ("Hamming Loss: {:.4f}", hamming_br),
    ("Coverage: {:.4f}", coverage_br),
):
    print(fmt.format(val))


In [None]:
# Classifier Chains SVM (pre-trained embeddings)
classifier_cc = ClassifierChain(SVC(class_weight='balanced'))
classifier_cc.fit(X_balanced_word2vec, y_balanced)
y_pred_val_cc = classifier_cc.predict(X_val_word2vec)

# Validation metrics; precision / recall / F1 use average='weighted'.
accuracy_cc = accuracy_score(y_val, y_pred_val_cc)
f1_cc, precision_cc, recall_cc = (
    metric(y_val, y_pred_val_cc, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
hamming_cc = hamming_loss(y_val, y_pred_val_cc)
# The predictions are converted with .toarray() before coverage_error.
coverage_cc = coverage_error(y_val, y_pred_val_cc.toarray())

print("Classifier Chains:")
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_cc),
    ("F1 Score: {:.4f}", f1_cc),
    ("Precision: {:.4f}", precision_cc),
    ("Recall: {:.4f}", recall_cc),
    ("Hamming Loss: {:.4f}", hamming_cc),
    ("Coverage: {:.4f}", coverage_cc),
):
    print(fmt.format(val))

# TF-IDF

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, hamming_loss,coverage_error, confusion_matrix
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import coverage_error
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import random

# Load the dataset
#train_data = pd.read_csv('train.csv')
train_data = pd.read_csv('data/train.csv')
# Treat a missing title/abstract as empty text: str + NaN yields NaN, which the
# TF-IDF vectorizer cannot tokenise.
train_data['TEXT'] = train_data['TITLE'].fillna('') + ' ' + train_data['ABSTRACT'].fillna('')

X = train_data['TEXT']
# One column per subject area; together they form the multi-label target matrix.
y = train_data[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

# Split the dataset (fixed random_state so the hold-out rows are the same on every run)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorizer: the vocabulary is fitted on the training split only and
# then applied to the validation split. Both matrices are converted to dense
# arrays because the oversampling code indexes and stacks them with NumPy.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_val_tfidf = vectorizer.transform(X_val).toarray()

# Helper function to identify minority labels
def get_tail_labels(y):
    """Return the column indices of labels whose sum is below half the row count.

    ``y`` is a 2-D label matrix; column ``i`` is reported as a tail (minority)
    label when ``sum(y[:, i])`` is strictly less than ``y.shape[0] / 2``.
    """
    half_rows = y.shape[0] / 2
    column_totals = (np.sum(y[:, col]) for col in range(y.shape[1]))
    return [col for col, total in enumerate(column_totals) if total < half_rows]

# Per-label positive counts in the training split, before any oversampling.
subject_labels = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']
print("Class distribution before applying dynamic MLSMOTE:")
for i in range(len(subject_labels)):
    label = subject_labels[i]
    print(f"{label}: {np.sum(y_train[:, i])}")

# Dynamic MLSMOTE function
def dynamic_MLSMOTE(X, y, target_balance=4500, random_state=None):
    """Oversample tail labels with SMOTE-style interpolation.

    For every label returned by ``get_tail_labels(y)`` whose positive count
    (measured on the original ``y``) is below ``target_balance``, synthetic
    rows are generated to make up the difference. Each synthetic row lies on
    the segment between a randomly chosen positive sample of that label and
    one of its nearest neighbours among the positives of the same label, and
    it inherits the seed sample's complete label row. Because the whole label
    row is copied, counts of co-occurring labels grow as well.

    Args:
        X: 2-D feature array supporting integer-array row indexing.
        y: 2-D label array; row ``r`` is positive for label ``i`` when
            ``y[r, i] == 1``.
        target_balance: Desired minimum number of positives per tail label.
        random_state: Optional integer seed for a private generator. ``None``
            (default) draws from the global ``random`` / ``np.random`` state.

    Returns:
        ``(X_balanced, y_balanced)`` with the synthetic rows appended, or the
        inputs ``(X, y)`` unchanged when no synthetic row was generated.
    """
    n_neighbors = min(5, len(X) - 1)
    if n_neighbors < 2:
        # Each query returns the seed itself first, so at least one further
        # neighbour is needed to interpolate towards.
        return X, y

    if random_state is None:
        draw_index = random.randrange   # global state
        draw_step = np.random.rand
    else:
        rng = np.random.RandomState(random_state)
        draw_index = rng.randint        # randint(n) -> integer in [0, n)
        draw_step = rng.rand

    synthetic_samples = []
    synthetic_labels = []

    for i in get_tail_labels(y):
        current_count = np.sum(y[:, i])
        # int() keeps range() valid even when y has a float dtype.
        n_samples = max(int(target_balance - current_count), 0)
        target_indices = np.where(y[:, i] == 1)[0]

        # Skip labels already at the target (avoids a pointless neighbour
        # search) and labels with too few positives to supply n_neighbors.
        if n_samples == 0 or len(target_indices) < n_neighbors:
            continue

        # Search neighbours only among samples carrying this label, so a
        # synthetic row is never pulled towards a sample that lacks it.
        bag = X[target_indices]
        neigh = NearestNeighbors(n_neighbors=n_neighbors).fit(bag)
        nn = neigh.kneighbors(bag, return_distance=False)

        for _ in range(n_samples):
            seed_pos = draw_index(len(target_indices))
            candidates = nn[seed_pos, 1:]  # column 0 is the seed itself
            neighbour_pos = candidates[draw_index(len(candidates))]
            step = draw_step()
            seed = bag[seed_pos]
            synthetic_samples.append(seed + step * (bag[neighbour_pos] - seed))
            synthetic_labels.append(y[target_indices[seed_pos]].copy())

    if len(synthetic_samples) > 0:
        X_synthetic = np.vstack(synthetic_samples)
        y_synthetic = np.vstack(synthetic_labels)
        X_balanced = np.vstack((X, X_synthetic))
        y_balanced = np.vstack((y, y_synthetic))
        return X_balanced, y_balanced
    else:
        return X, y

# Fix the global RNG state so the oversampling draws are repeatable across runs.
random.seed(42)
np.random.seed(42)

# y_train is already a NumPy array (y was built with .values), so this is only an alias.
y_train_np = y_train

# Adjust this target balance
target_balance = 4500
X_balanced_tfidf, y_balanced = dynamic_MLSMOTE(X_train_tfidf, y_train_np, target_balance=target_balance)

# class distribution after applying dynamic MLSMOTE
print("\n")
print("Class distribution after applying dynamic MLSMOTE:")
for i, label in enumerate(['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']):
    print(f"{label}: {np.sum(y_balanced[:, i])}")


Class distribution before applying dynamic MLSMOTE:
Computer Science: 6902
Physics: 4787
Mathematics: 4468
Statistics: 4137
Quantitative Biology: 465
Quantitative Finance: 204


Class distribution after applying dynamic MLSMOTE:
Computer Science: 7576
Physics: 4799
Mathematics: 4598
Statistics: 5720
Quantitative Biology: 4570
Quantitative Finance: 4525


In [None]:
# NOTE(review): both assignments repeat values already set in the previous
# cell, and the resampling call below is commented out, so this cell has no new effect.
y_train_np = y_train
target_balance = 4500
#X_balanced_tfidf, y_balanced = dynamic_MLSMOTE(X_balanced_tfidf, y_train_np, target_balance=target_balance)

In [None]:
# Label Powerset wrapper around a class-weighted SVC (TF-IDF features).
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
classifier_lp = LabelPowerset(SVC(class_weight='balanced'))
classifier_lp.fit(X_balanced_tfidf, y_balanced)
y_pred_val_lp = classifier_lp.predict(X_val_tfidf)

# Validation metrics; precision / recall / F1 use average='weighted'.
accuracy_lp = accuracy_score(y_val, y_pred_val_lp)
f1_lp, precision_lp, recall_lp = (
    metric(y_val, y_pred_val_lp, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
hamming_lp = hamming_loss(y_val, y_pred_val_lp)
# The predictions are converted with .toarray() before coverage_error.
coverage_lp = coverage_error(y_val, y_pred_val_lp.toarray())

print("Label Powerset:")
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_lp),
    ("F1 Score: {:.4f}", f1_lp),
    ("Precision: {:.4f}", precision_lp),
    ("Recall: {:.4f}", recall_lp),
    ("Hamming Loss: {:.4f}", hamming_lp),
    ("Coverage: {:.4f}", coverage_lp),
):
    print(fmt.format(val))

In [None]:
# Binary Relevance wrapper around a class-weighted SVC (TF-IDF features).
classifier_br = BinaryRelevance(SVC(class_weight='balanced'))
classifier_br.fit(X_balanced_tfidf, y_balanced)
y_pred_val_br = classifier_br.predict(X_val_tfidf)

# Validation metrics; precision / recall / F1 use average='weighted'.
accuracy_br = accuracy_score(y_val, y_pred_val_br)
f1_br, precision_br, recall_br = (
    metric(y_val, y_pred_val_br, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
hamming_br = hamming_loss(y_val, y_pred_val_br)
# The predictions are converted with .toarray() before coverage_error.
coverage_br = coverage_error(y_val, y_pred_val_br.toarray())

print("Binary Relevance:")
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_br),
    ("F1 Score: {:.4f}", f1_br),
    ("Precision: {:.4f}", precision_br),
    ("Recall: {:.4f}", recall_br),
    ("Hamming Loss: {:.4f}", hamming_br),
    ("Coverage: {:.4f}", coverage_br),
):
    print(fmt.format(val))


In [None]:
# Classifier Chains wrapper around a class-weighted SVC (TF-IDF features).
classifier_cc = ClassifierChain(SVC(class_weight='balanced'))
classifier_cc.fit(X_balanced_tfidf, y_balanced)
y_pred_val_cc = classifier_cc.predict(X_val_tfidf)

# Validation metrics; precision / recall / F1 use average='weighted'.
accuracy_cc = accuracy_score(y_val, y_pred_val_cc)
f1_cc, precision_cc, recall_cc = (
    metric(y_val, y_pred_val_cc, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
hamming_cc = hamming_loss(y_val, y_pred_val_cc)
# The predictions are converted with .toarray() before coverage_error.
coverage_cc = coverage_error(y_val, y_pred_val_cc.toarray())

print("Classifier Chains:")
for fmt, val in (
    ("Accuracy: {:.4f}", accuracy_cc),
    ("F1 Score: {:.4f}", f1_cc),
    ("Precision: {:.4f}", precision_cc),
    ("Recall: {:.4f}", recall_cc),
    ("Hamming Loss: {:.4f}", hamming_cc),
    ("Coverage: {:.4f}", coverage_cc),
):
    print(fmt.format(val))