In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import random

# Load the dataset
train_data = pd.read_csv('data/train.csv')
train_data['TEXT'] = train_data['TITLE'] + ' ' + train_data['ABSTRACT']

X = train_data['TEXT']
y = train_data[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess text data for Word2Vec
X_train_preprocessed = [text.split() for text in X_train]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=X_train_preprocessed, vector_size=100, window=5, min_count=1, workers=4)

# Function to convert text to word embeddings
def text_to_wv(text, model):
    vectors = []
    for word in text.split():
        if word in model.wv:
            vectors.append(model.wv[word])
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Convert text data to word embeddings
X_train_wv = np.array([text_to_wv(text, word2vec_model) for text in X_train])
X_val_wv = np.array([text_to_wv(text, word2vec_model) for text in X_val])

# Helper function to identify minority labels
def get_tail_labels(y):
    tail_labels = [i for i in range(y.shape[1]) if np.sum(y[:, i]) < (y.shape[0] / 2)]
    return tail_labels

# class distribution before applying dynamic MLSMOTE
print("Class distribution before applying dynamic MLSMOTE:")
for i, label in enumerate(['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']):
    print(f"{label}: {np.sum(y_train[:, i])}")

# Dynamic MLSMOTE function
def dynamic_MLSMOTE(X, y, target_balance=4500):
    n_neighbors = min(5, len(X) - 1)
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(X)
    tail_labels = get_tail_labels(y)
    synthetic_samples = []
    synthetic_labels = []

    for i in tail_labels:
        current_count = np.sum(y[:, i])
        n_samples = max(target_balance - current_count, 0)  # Calculate the number of samples to generate
        target_indices = np.where(y[:, i] == 1)[0]
        
        if len(target_indices) >= n_neighbors:
            nn = neigh.kneighbors(X[target_indices], return_distance=False)
            for _ in range(n_samples):
                sample_index = random.choice(range(len(target_indices)))
                nn_indices = nn[sample_index, 1:]
                chosen_nn = random.choice(nn_indices)
                step = np.random.rand()
                synthetic_sample = X[target_indices[sample_index]] + step * (X[chosen_nn] - X[target_indices[sample_index]])
                synthetic_samples.append(synthetic_sample)
                synthetic_label = y[target_indices[sample_index]].copy()
                synthetic_labels.append(synthetic_label)
    
    if len(synthetic_samples) > 0:
        X_synthetic = np.vstack(synthetic_samples)
        y_synthetic = np.vstack(synthetic_labels)
        X_balanced = np.vstack((X, X_synthetic))
        y_balanced = np.vstack((y, y_synthetic))
        return X_balanced, y_balanced
    else:
        return X, y

# Convert y_train to numpy array for processing
y_train_np = y_train

# Adjust this target balance
target_balance = 4500  
X_balanced_wv, y_balanced = dynamic_MLSMOTE(X_train_wv, y_train_np, target_balance=target_balance)

# class distribution after applying dynamic MLSMOTE
print("\n")
print("Class distribution after applying dynamic MLSMOTE:")
for i, label in enumerate(['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']):
    print(f"{label}: {np.sum(y_balanced[:, i])}")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
from skmultilearn.problem_transform import BinaryRelevance, LabelPowerset, ClassifierChain
import numpy as np

# Define the additional evaluation metrics functions
def geometric_mean_score(y_true, y_pred):
    gmean = np.sqrt(accuracy_score(y_true, y_pred, normalize=True) *
                     accuracy_score(y_true, y_pred, normalize=True))
    return gmean

def balanced_accuracy_score(y_true, y_pred):
    return balanced_accuracy_score(y_true, y_pred)

def one_error(y_true, y_pred):
    incorrect = np.sum(np.logical_and((y_true == 0), (y_pred == 1)))
    one_err = incorrect / y_true.shape[0]
    return one_err


def coverage_error(y_true, y_pred):
    cov_err = np.mean(np.sum(y_pred, axis=1))
    return cov_err

In [None]:
# Define the base classifier
base_classifier = LogisticRegression(solver='liblinear')

# Binary Relevance
binary_relevance_classifier = BinaryRelevance(classifier=base_classifier)
binary_relevance_classifier.fit(X_balanced_wv, y_balanced)
y_pred_binary_relevance = binary_relevance_classifier.predict(X_val_wv)

# Label Powerset
label_powerset_classifier = LabelPowerset(classifier=base_classifier)
label_powerset_classifier.fit(X_balanced_wv, y_balanced)
y_pred_label_powerset = label_powerset_classifier.predict(X_val_wv)

# Classifier Chains
classifier_chain_classifier = ClassifierChain(classifier=base_classifier)
classifier_chain_classifier.fit(X_balanced_wv, y_balanced)
y_pred_classifier_chain = classifier_chain_classifier.predict(X_val_wv)

In [None]:
# Calculate evaluation metrics for Binary Relevance
accuracy_binary_relevance = accuracy_score(y_val, y_pred_binary_relevance)
hamming_loss_binary_relevance = hamming_loss(y_val, y_pred_binary_relevance)
f1_binary_relevance = f1_score(y_val, y_pred_binary_relevance, average='micro')
gmean_binary_relevance = geometric_mean_score(y_val, y_pred_binary_relevance)
one_error_binary_relevance = one_error(y_val, y_pred_binary_relevance)
coverage_binary_relevance = coverage_error(y_val, y_pred_binary_relevance)

# Calculate evaluation metrics for Label Powerset
accuracy_label_powerset = accuracy_score(y_val, y_pred_label_powerset)
hamming_loss_label_powerset = hamming_loss(y_val, y_pred_label_powerset)
f1_label_powerset = f1_score(y_val, y_pred_label_powerset, average='micro')
gmean_label_powerset = geometric_mean_score(y_val, y_pred_label_powerset)
one_error_label_powerset = one_error(y_val, y_pred_label_powerset)
coverage_label_powerset = coverage_error(y_val, y_pred_label_powerset)

# Calculate evaluation metrics for Classifier Chains
accuracy_classifier_chain = accuracy_score(y_val, y_pred_classifier_chain)
hamming_loss_classifier_chain = hamming_loss(y_val, y_pred_classifier_chain)
f1_classifier_chain = f1_score(y_val, y_pred_classifier_chain, average='micro')
gmean_classifier_chain = geometric_mean_score(y_val, y_pred_classifier_chain)
one_error_classifier_chain = one_error(y_val, y_pred_classifier_chain)
coverage_classifier_chain = coverage_error(y_val, y_pred_classifier_chain)

In [None]:
def ranking_loss(y_true, y_pred): 
    loss = 0
    for i in range(len(y_true)):
        diff = np.sum(np.abs(y_true[i] - y_pred[i]))
        loss += diff / (len(y_true[i]) * (len(y_true[i]) - 1))
    return loss / len(y_true)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate evaluation metrics for Binary Relevance
accuracy_binary_relevance = accuracy_score(pd.DataFrame(y_val), y_pred_binary_relevance)
precision_binary_relevance = precision_score(pd.DataFrame(y_val), y_pred_binary_relevance, average='micro')
recall_binary_relevance = recall_score(pd.DataFrame(y_val), y_pred_binary_relevance, average='micro')
f1_binary_relevance = f1_score(pd.DataFrame(y_val), y_pred_binary_relevance, average='micro')

# Calculate evaluation metrics for Label Powerset
accuracy_label_powerset = accuracy_score(pd.DataFrame(y_val), y_pred_label_powerset)
precision_label_powerset = precision_score(pd.DataFrame(y_val), y_pred_label_powerset, average='micro')
recall_label_powerset = recall_score(pd.DataFrame(y_val), y_pred_label_powerset, average='micro')
f1_label_powerset = f1_score(pd.DataFrame(y_val), y_pred_label_powerset, average='micro')

# Calculate evaluation metrics for Classifier Chains
accuracy_classifier_chain = accuracy_score(pd.DataFrame(y_val), y_pred_classifier_chain)
precision_classifier_chain = precision_score(pd.DataFrame(y_val), y_pred_classifier_chain, average='micro')
recall_classifier_chain = recall_score(pd.DataFrame(y_val), y_pred_classifier_chain, average='micro')
f1_classifier_chain = f1_score(pd.DataFrame(y_val), y_pred_classifier_chain, average='micro')


# # Display the evaluation metrics
# print("Binary Relevance:")
# print("Accuracy:", accuracy_binary_relevance)
# print("Precision:", precision_binary_relevance)
# print("Recall:", recall_binary_relevance)
# print("F1 Score:", f1_binary_relevance)
# print("Hamming Loss:", hamming_loss_binary_relevance)
# print("G-mean:", gmean_binary_relevance)
# #print("One-error:", one_error_binary_relevance)
# print("Coverage:", coverage_binary_relevance)

print("Binary Relevance:")
print("Accuracy:", round(accuracy_binary_relevance, 4))
print("Precision:", round(precision_binary_relevance, 4))
print("Recall:", round(recall_binary_relevance, 4))
print("F1 Score:", round(f1_binary_relevance, 4))
print("Hamming Loss:", round(hamming_loss_binary_relevance, 4))
print("G-mean:", round(gmean_binary_relevance, 4))
#print("One-error:", round(one_error_binary_relevance, 4))
print("Coverage:", round(coverage_binary_relevance, 4))



# print("\nLabel Powerset:")
# print("Accuracy:", accuracy_label_powerset)
# print("Precision:", precision_label_powerset)
# print("Recall:", recall_label_powerset)
# print("F1 Score:", f1_label_powerset)
# print("Hamming Loss:", hamming_loss_label_powerset)
# print("G-mean:", gmean_label_powerset)
# #print("One-error:", one_error_label_powerset)
# print("Coverage:", coverage_label_powerset)

print("\nLabel Powerset:")
print("Accuracy:", round(accuracy_label_powerset, 4))
print("Precision:", round(precision_label_powerset, 4))
print("Recall:", round(recall_label_powerset, 4))
print("F1 Score:", round(f1_label_powerset, 4))
print("Hamming Loss:", round(hamming_loss_label_powerset, 4))
print("G-mean:", round(gmean_label_powerset, 4))
#print("One-error:", one_error_label_powerset)
print("Coverage:", round(coverage_label_powerset, 4))

# print("\nClassifier Chains:")
# print("Accuracy:", accuracy_classifier_chain)
# print("Precision:", precision_classifier_chain)
# print("Recall:", recall_classifier_chain)
# print("F1 Score:", f1_classifier_chain)
# print("Hamming Loss:", hamming_loss_classifier_chain)
# print("G-mean:", gmean_classifier_chain)
# #print("One-error:", one_error_classifier_chain)
# print("Coverage:", coverage_classifier_chain)

print("\nClassifier Chains:")
print("Accuracy:", round(accuracy_classifier_chain, 4))
print("Precision:", round(precision_classifier_chain, 4))
print("Recall:", round(recall_classifier_chain, 4))
print("F1 Score:", round(f1_classifier_chain, 4))
print("Hamming Loss:", round(hamming_loss_classifier_chain, 4))
print("G-mean:", round(gmean_classifier_chain, 4))
#print("One-error:", one_error_classifier_chain)
print("Coverage:", round(coverage_classifier_chain, 4))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import random
from gensim.models import KeyedVectors

In [None]:
# Load the Google's pre-trained Word2Vec model
word2vec_path = '/Users/deepika/Documents/Masters/Courses/Practicum/GoogleNews-vectors-negative300.bin'  # Provide the path to the downloaded Word2Vec binary file
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
# Load the dataset
train_data = pd.read_csv('data/train.csv')
train_data['TEXT'] = train_data['TITLE'] + ' ' + train_data['ABSTRACT']

X = train_data['TEXT']
y = train_data[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to convert text to word embeddings
def text_to_wv(text, model):
    vectors = []
    for word in text.split():
        if word in model:
            vectors.append(model[word])
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Convert text data to word embeddings
X_train_wv = np.array([text_to_wv(text, word2vec_model) for text in X_train])
X_val_wv = np.array([text_to_wv(text, word2vec_model) for text in X_val])

# Helper function to identify minority labels
def get_tail_labels(y):
    tail_labels = [i for i in range(y.shape[1]) if np.sum(y[:, i]) < (y.shape[0] / 2)]
    return tail_labels

# class distribution before applying dynamic MLSMOTE
print("Class distribution before applying dynamic MLSMOTE:")
for i, label in enumerate(['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']):
    print(f"{label}: {np.sum(y_train[:, i])}")

# Dynamic MLSMOTE function
def dynamic_MLSMOTE(X, y, target_balance=4500):
    n_neighbors = min(5, len(X) - 1)
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(X)
    tail_labels = get_tail_labels(y)
    synthetic_samples = []
    synthetic_labels = []

    for i in tail_labels:
        current_count = np.sum(y[:, i])
        n_samples = max(target_balance - current_count, 0)  # Calculate the number of samples to generate
        target_indices = np.where(y[:, i] == 1)[0]
        
        if len(target_indices) >= n_neighbors:
            nn = neigh.kneighbors(X[target_indices], return_distance=False)
            for _ in range(n_samples):
                sample_index = random.choice(range(len(target_indices)))
                nn_indices = nn[sample_index, 1:]
                chosen_nn = random.choice(nn_indices)
                step = np.random.rand()
                synthetic_sample = X[target_indices[sample_index]] + step * (X[chosen_nn] - X[target_indices[sample_index]])
                synthetic_samples.append(synthetic_sample)
                synthetic_label = y[target_indices[sample_index]].copy()
                synthetic_labels.append(synthetic_label)
    
    if len(synthetic_samples) > 0:
        X_synthetic = np.vstack(synthetic_samples)
        y_synthetic = np.vstack(synthetic_labels)
        X_balanced = np.vstack((X, X_synthetic))
        y_balanced = np.vstack((y, y_synthetic))
        return X_balanced, y_balanced
    else:
        return X, y

# Convert y_train to numpy array for processing
y_train_np = y_train

# Adjust this target balance
target_balance = 4000 
X_balanced_wv, y_balanced = dynamic_MLSMOTE(X_train_wv, y_train_np, target_balance=target_balance)
#X_balanced_wv, y_balanced = dynamic_MLSMOTE(X_train, y_train_np, target_balance=target_balance)

# class distribution after applying dynamic MLSMOTE
print("\n")
print("Class distribution after applying dynamic MLSMOTE:")
for i, label in enumerate(['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']):
    print(f"{label}: {np.sum(y_balanced[:, i])}")

In [None]:
print(type(X_balanced_wv))
print(type(y_balanced))
print(type(X_val_wv))
print(type(y_val))


np.save('data/X_balanced_wv_Word2Vec.npy', X_balanced_wv)
np.save('data/y_balanced_Word2Vec.npy', y_balanced)
np.save('data/X_val_wv_Word2Vec.npy', X_val_wv)
np.save('data/y_val_Word2Vec.npy', y_val)

X_balanced_wv = np.load('data/X_balanced_wv_Word2Vec.npy')
y_balanced = np.load('data/y_balanced_Word2Vec.npy')
X_val_wv = np.load('data/X_val_wv_Word2Vec.npy')
y_val = np.load('data/y_val_Word2Vec.npy')


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
from skmultilearn.problem_transform import BinaryRelevance, LabelPowerset, ClassifierChain
import numpy as np

# Define the additional evaluation metrics functions
def geometric_mean_score(y_true, y_pred):
    gmean = np.sqrt(accuracy_score(y_true, y_pred, normalize=True) *
                     accuracy_score(y_true, y_pred, normalize=True))
    return gmean

def balanced_accuracy_score(y_true, y_pred):
    return balanced_accuracy_score(y_true, y_pred)

def one_error(y_true, y_pred):
    incorrect = np.sum(np.logical_and((y_true == 0), (y_pred == 1)))
    one_err = incorrect / y_true.shape[0]
    return one_err


def coverage_error(y_true, y_pred):
    cov_err = np.mean(np.sum(y_pred, axis=1))
    return cov_err

In [None]:
# Define the base classifier
base_classifier = LogisticRegression(solver='liblinear')

# Binary Relevance
binary_relevance_classifier = BinaryRelevance(classifier=base_classifier)
binary_relevance_classifier.fit(X_balanced_wv, y_balanced)
y_pred_binary_relevance = binary_relevance_classifier.predict(X_val_wv)

# Label Powerset
label_powerset_classifier = LabelPowerset(classifier=base_classifier)
label_powerset_classifier.fit(X_balanced_wv, y_balanced)
y_pred_label_powerset = label_powerset_classifier.predict(X_val_wv)

# Classifier Chains
classifier_chain_classifier = ClassifierChain(classifier=base_classifier)
classifier_chain_classifier.fit(X_balanced_wv, y_balanced)
y_pred_classifier_chain = classifier_chain_classifier.predict(X_val_wv)

In [None]:
# Calculate evaluation metrics for Binary Relevance
accuracy_binary_relevance = accuracy_score(y_val, y_pred_binary_relevance)
hamming_loss_binary_relevance = hamming_loss(y_val, y_pred_binary_relevance)
f1_binary_relevance = f1_score(y_val, y_pred_binary_relevance, average='micro')
gmean_binary_relevance = geometric_mean_score(y_val, y_pred_binary_relevance)
one_error_binary_relevance = one_error(y_val, y_pred_binary_relevance)
coverage_binary_relevance = coverage_error(y_val, y_pred_binary_relevance)

# Calculate evaluation metrics for Label Powerset
accuracy_label_powerset = accuracy_score(y_val, y_pred_label_powerset)
hamming_loss_label_powerset = hamming_loss(y_val, y_pred_label_powerset)
f1_label_powerset = f1_score(y_val, y_pred_label_powerset, average='micro')
gmean_label_powerset = geometric_mean_score(y_val, y_pred_label_powerset)
one_error_label_powerset = one_error(y_val, y_pred_label_powerset)
coverage_label_powerset = coverage_error(y_val, y_pred_label_powerset)

# Calculate evaluation metrics for Classifier Chains
accuracy_classifier_chain = accuracy_score(y_val, y_pred_classifier_chain)
hamming_loss_classifier_chain = hamming_loss(y_val, y_pred_classifier_chain)
f1_classifier_chain = f1_score(y_val, y_pred_classifier_chain, average='micro')
gmean_classifier_chain = geometric_mean_score(y_val, y_pred_classifier_chain)
one_error_classifier_chain = one_error(y_val, y_pred_classifier_chain)
coverage_classifier_chain = coverage_error(y_val, y_pred_classifier_chain)

In [None]:
def ranking_loss(y_true, y_pred): 
    loss = 0
    for i in range(len(y_true)):
        diff = np.sum(np.abs(y_true[i] - y_pred[i]))
        loss += diff / (len(y_true[i]) * (len(y_true[i]) - 1))
    return loss / len(y_true)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate evaluation metrics for Binary Relevance
accuracy_binary_relevance = accuracy_score(pd.DataFrame(y_val), y_pred_binary_relevance)
precision_binary_relevance = precision_score(pd.DataFrame(y_val), y_pred_binary_relevance, average='micro')
recall_binary_relevance = recall_score(pd.DataFrame(y_val), y_pred_binary_relevance, average='micro')
f1_binary_relevance = f1_score(pd.DataFrame(y_val), y_pred_binary_relevance, average='micro')

# Calculate evaluation metrics for Label Powerset
accuracy_label_powerset = accuracy_score(pd.DataFrame(y_val), y_pred_label_powerset)
precision_label_powerset = precision_score(pd.DataFrame(y_val), y_pred_label_powerset, average='micro')
recall_label_powerset = recall_score(pd.DataFrame(y_val), y_pred_label_powerset, average='micro')
f1_label_powerset = f1_score(pd.DataFrame(y_val), y_pred_label_powerset, average='micro')

# Calculate evaluation metrics for Classifier Chains
accuracy_classifier_chain = accuracy_score(pd.DataFrame(y_val), y_pred_classifier_chain)
precision_classifier_chain = precision_score(pd.DataFrame(y_val), y_pred_classifier_chain, average='micro')
recall_classifier_chain = recall_score(pd.DataFrame(y_val), y_pred_classifier_chain, average='micro')
f1_classifier_chain = f1_score(pd.DataFrame(y_val), y_pred_classifier_chain, average='micro')


# # Display the evaluation metrics
# print("Binary Relevance:")
# print("Accuracy:", accuracy_binary_relevance)
# print("Precision:", precision_binary_relevance)
# print("Recall:", recall_binary_relevance)
# print("F1 Score:", f1_binary_relevance)
# print("Hamming Loss:", hamming_loss_binary_relevance)
# print("G-mean:", gmean_binary_relevance)
# #print("One-error:", one_error_binary_relevance)
# print("Coverage:", coverage_binary_relevance)

print("Binary Relevance:")
print("Accuracy:", round(accuracy_binary_relevance, 4))
print("Precision:", round(precision_binary_relevance, 4))
print("Recall:", round(recall_binary_relevance, 4))
print("F1 Score:", round(f1_binary_relevance, 4))
print("Hamming Loss:", round(hamming_loss_binary_relevance, 4))
print("G-mean:", round(gmean_binary_relevance, 4))
#print("One-error:", round(one_error_binary_relevance, 4))
print("Coverage:", round(coverage_binary_relevance, 4))



# print("\nLabel Powerset:")
# print("Accuracy:", accuracy_label_powerset)
# print("Precision:", precision_label_powerset)
# print("Recall:", recall_label_powerset)
# print("F1 Score:", f1_label_powerset)
# print("Hamming Loss:", hamming_loss_label_powerset)
# print("G-mean:", gmean_label_powerset)
# #print("One-error:", one_error_label_powerset)
# print("Coverage:", coverage_label_powerset)

print("\nLabel Powerset:")
print("Accuracy:", round(accuracy_label_powerset, 4))
print("Precision:", round(precision_label_powerset, 4))
print("Recall:", round(recall_label_powerset, 4))
print("F1 Score:", round(f1_label_powerset, 4))
print("Hamming Loss:", round(hamming_loss_label_powerset, 4))
print("G-mean:", round(gmean_label_powerset, 4))
#print("One-error:", one_error_label_powerset)
print("Coverage:", round(coverage_label_powerset, 4))

# print("\nClassifier Chains:")
# print("Accuracy:", accuracy_classifier_chain)
# print("Precision:", precision_classifier_chain)
# print("Recall:", recall_classifier_chain)
# print("F1 Score:", f1_classifier_chain)
# print("Hamming Loss:", hamming_loss_classifier_chain)
# print("G-mean:", gmean_classifier_chain)
# #print("One-error:", one_error_classifier_chain)
# print("Coverage:", coverage_classifier_chain)

print("\nClassifier Chains:")
print("Accuracy:", round(accuracy_classifier_chain, 4))
print("Precision:", round(precision_classifier_chain, 4))
print("Recall:", round(recall_classifier_chain, 4))
print("F1 Score:", round(f1_classifier_chain, 4))
print("Hamming Loss:", round(hamming_loss_classifier_chain, 4))
print("G-mean:", round(gmean_classifier_chain, 4))
#print("One-error:", one_error_classifier_chain)
print("Coverage:", round(coverage_classifier_chain, 4))

# TF-IDF

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, hamming_loss,coverage_error, confusion_matrix
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import coverage_error
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import random

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import random

# Load the dataset
#train_data = pd.read_csv('train.csv')
train_data = pd.read_csv('data/train.csv')
train_data['TEXT'] = train_data['TITLE'] + ' ' + train_data['ABSTRACT']

X = train_data['TEXT']
y = train_data[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_val_tfidf = vectorizer.transform(X_val).toarray()

# Helper function to identify minority labels
def get_tail_labels(y):
    tail_labels = [i for i in range(y.shape[1]) if np.sum(y[:, i]) < (y.shape[0] / 2)]
    return tail_labels

# class distribution before applying dynamic MLSMOTE
print("Class distribution before applying dynamic MLSMOTE:")
for i, label in enumerate(['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']):
    print(f"{label}: {np.sum(y_train[:, i])}")

# Dynamic MLSMOTE function
def dynamic_MLSMOTE(X, y, target_balance=4500):
    n_neighbors = min(5, len(X) - 1)
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(X)
    tail_labels = get_tail_labels(y)
    synthetic_samples = []
    synthetic_labels = []

    for i in tail_labels:
        current_count = np.sum(y[:, i])
        n_samples = max(target_balance - current_count, 0)  # Calculate the number of samples to generate
        target_indices = np.where(y[:, i] == 1)[0]
        
        if len(target_indices) >= n_neighbors:
            nn = neigh.kneighbors(X[target_indices], return_distance=False)
            for _ in range(n_samples):
                sample_index = random.choice(range(len(target_indices)))
                nn_indices = nn[sample_index, 1:]
                chosen_nn = random.choice(nn_indices)
                step = np.random.rand()
                synthetic_sample = X[target_indices[sample_index]] + step * (X[chosen_nn] - X[target_indices[sample_index]])
                synthetic_samples.append(synthetic_sample)
                synthetic_label = y[target_indices[sample_index]].copy()
                synthetic_labels.append(synthetic_label)
    
    if len(synthetic_samples) > 0:
        X_synthetic = np.vstack(synthetic_samples)
        y_synthetic = np.vstack(synthetic_labels)
        X_balanced = np.vstack((X, X_synthetic))
        y_balanced = np.vstack((y, y_synthetic))
        return X_balanced, y_balanced
    else:
        return X, y

# Convert y_train to numpy array for processing
y_train_np = y_train

# Adjust this target balance
target_balance = 4500  
X_balanced_tfidf, y_balanced = dynamic_MLSMOTE(X_train_tfidf, y_train_np, target_balance=target_balance)

# class distribution after applying dynamic MLSMOTE
print("\n")
print("Class distribution after applying dynamic MLSMOTE:")
for i, label in enumerate(['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']):
    print(f"{label}: {np.sum(y_balanced[:, i])}")


Class distribution before applying dynamic MLSMOTE:
Computer Science: 6902
Physics: 4787
Mathematics: 4468
Statistics: 4137
Quantitative Biology: 465
Quantitative Finance: 204


Class distribution after applying dynamic MLSMOTE:
Computer Science: 7562
Physics: 4793
Mathematics: 4585
Statistics: 5727
Quantitative Biology: 4554
Quantitative Finance: 4526


In [3]:
print(type(X_balanced_tfidf))
print(type(y_balanced))
print(type(X_val_tfidf))
print(type(y_val))


np.save('data/X_balanced_tfidf.npy', X_balanced_tfidf)
np.save('data/y_balanced_tfidf.npy', y_balanced)
np.save('data/X_val_tfidf.npy', X_val_tfidf)
np.save('data/y_val_tfidf.npy', y_val)

X_balanced_tfidf = np.load('data/X_balanced_tfidf.npy')
y_balanced = np.load('data/y_balanced_tfidf.npy')
X_val_tfidf = np.load('data/X_val_tfidf.npy')
y_val = np.load('data/y_val_tfidf.npy')

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
from skmultilearn.problem_transform import BinaryRelevance, LabelPowerset, ClassifierChain
import numpy as np

# Define the additional evaluation metrics functions
def geometric_mean_score(y_true, y_pred):
    gmean = np.sqrt(accuracy_score(y_true, y_pred, normalize=True) *
                     accuracy_score(y_true, y_pred, normalize=True))
    return gmean

def balanced_accuracy_score(y_true, y_pred):
    return balanced_accuracy_score(y_true, y_pred)

def one_error(y_true, y_pred):
    incorrect = np.sum(np.logical_and((y_true == 0), (y_pred == 1)))
    one_err = incorrect / y_true.shape[0]
    return one_err


def coverage_error(y_true, y_pred):
    cov_err = np.mean(np.sum(y_pred, axis=1))
    return cov_err

In [5]:
# Define the base classifier
base_classifier = LogisticRegression(solver='liblinear')

# Binary Relevance
binary_relevance_classifier = BinaryRelevance(classifier=base_classifier)
binary_relevance_classifier.fit(X_balanced_tfidf, y_balanced)
y_pred_binary_relevance = binary_relevance_classifier.predict(X_val_tfidf)

# Label Powerset
label_powerset_classifier = LabelPowerset(classifier=base_classifier)
label_powerset_classifier.fit(X_balanced_tfidf, y_balanced)
y_pred_label_powerset = label_powerset_classifier.predict(X_val_tfidf)

# Classifier Chains
classifier_chain_classifier = ClassifierChain(classifier=base_classifier)
classifier_chain_classifier.fit(X_balanced_tfidf, y_balanced)
y_pred_classifier_chain = classifier_chain_classifier.predict(X_val_tfidf)

In [None]:
# Calculate evaluation metrics for Binary Relevance
accuracy_binary_relevance = accuracy_score(y_val, y_pred_binary_relevance)
hamming_loss_binary_relevance = hamming_loss(y_val, y_pred_binary_relevance)
f1_binary_relevance = f1_score(y_val, y_pred_binary_relevance, average='micro')
gmean_binary_relevance = geometric_mean_score(y_val, y_pred_binary_relevance)
one_error_binary_relevance = one_error(y_val, y_pred_binary_relevance)
coverage_binary_relevance = coverage_error(y_val, y_pred_binary_relevance)

# Calculate evaluation metrics for Label Powerset
accuracy_label_powerset = accuracy_score(y_val, y_pred_label_powerset)
hamming_loss_label_powerset = hamming_loss(y_val, y_pred_label_powerset)
f1_label_powerset = f1_score(y_val, y_pred_label_powerset, average='micro')
gmean_label_powerset = geometric_mean_score(y_val, y_pred_label_powerset)
one_error_label_powerset = one_error(y_val, y_pred_label_powerset)
coverage_label_powerset = coverage_error(y_val, y_pred_label_powerset)

# Calculate evaluation metrics for Classifier Chains
accuracy_classifier_chain = accuracy_score(y_val, y_pred_classifier_chain)
hamming_loss_classifier_chain = hamming_loss(y_val, y_pred_classifier_chain)
f1_classifier_chain = f1_score(y_val, y_pred_classifier_chain, average='micro')
gmean_classifier_chain = geometric_mean_score(y_val, y_pred_classifier_chain)
one_error_classifier_chain = one_error(y_val, y_pred_classifier_chain)
coverage_classifier_chain = coverage_error(y_val, y_pred_classifier_chain)

In [None]:
def ranking_loss(y_true, y_pred): 
    loss = 0
    for i in range(len(y_true)):
        diff = np.sum(np.abs(y_true[i] - y_pred[i]))
        loss += diff / (len(y_true[i]) * (len(y_true[i]) - 1))
    return loss / len(y_true)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate evaluation metrics for Binary Relevance
accuracy_binary_relevance = accuracy_score(pd.DataFrame(y_val), y_pred_binary_relevance)
precision_binary_relevance = precision_score(pd.DataFrame(y_val), y_pred_binary_relevance, average='micro')
recall_binary_relevance = recall_score(pd.DataFrame(y_val), y_pred_binary_relevance, average='micro')
f1_binary_relevance = f1_score(pd.DataFrame(y_val), y_pred_binary_relevance, average='micro')

# Calculate evaluation metrics for Label Powerset
accuracy_label_powerset = accuracy_score(pd.DataFrame(y_val), y_pred_label_powerset)
precision_label_powerset = precision_score(pd.DataFrame(y_val), y_pred_label_powerset, average='micro')
recall_label_powerset = recall_score(pd.DataFrame(y_val), y_pred_label_powerset, average='micro')
f1_label_powerset = f1_score(pd.DataFrame(y_val), y_pred_label_powerset, average='micro')

# Calculate evaluation metrics for Classifier Chains
accuracy_classifier_chain = accuracy_score(pd.DataFrame(y_val), y_pred_classifier_chain)
precision_classifier_chain = precision_score(pd.DataFrame(y_val), y_pred_classifier_chain, average='micro')
recall_classifier_chain = recall_score(pd.DataFrame(y_val), y_pred_classifier_chain, average='micro')
f1_classifier_chain = f1_score(pd.DataFrame(y_val), y_pred_classifier_chain, average='micro')


# # Display the evaluation metrics
# print("Binary Relevance:")
# print("Accuracy:", accuracy_binary_relevance)
# print("Precision:", precision_binary_relevance)
# print("Recall:", recall_binary_relevance)
# print("F1 Score:", f1_binary_relevance)
# print("Hamming Loss:", hamming_loss_binary_relevance)
# print("G-mean:", gmean_binary_relevance)
# #print("One-error:", one_error_binary_relevance)
# print("Coverage:", coverage_binary_relevance)

print("Binary Relevance:")
print("Accuracy:", round(accuracy_binary_relevance, 4))
print("Precision:", round(precision_binary_relevance, 4))
print("Recall:", round(recall_binary_relevance, 4))
print("F1 Score:", round(f1_binary_relevance, 4))
print("Hamming Loss:", round(hamming_loss_binary_relevance, 4))
print("G-mean:", round(gmean_binary_relevance, 4))
#print("One-error:", round(one_error_binary_relevance, 4))
print("Coverage:", round(coverage_binary_relevance, 4))



# print("\nLabel Powerset:")
# print("Accuracy:", accuracy_label_powerset)
# print("Precision:", precision_label_powerset)
# print("Recall:", recall_label_powerset)
# print("F1 Score:", f1_label_powerset)
# print("Hamming Loss:", hamming_loss_label_powerset)
# print("G-mean:", gmean_label_powerset)
# #print("One-error:", one_error_label_powerset)
# print("Coverage:", coverage_label_powerset)

print("\nLabel Powerset:")
print("Accuracy:", round(accuracy_label_powerset, 4))
print("Precision:", round(precision_label_powerset, 4))
print("Recall:", round(recall_label_powerset, 4))
print("F1 Score:", round(f1_label_powerset, 4))
print("Hamming Loss:", round(hamming_loss_label_powerset, 4))
print("G-mean:", round(gmean_label_powerset, 4))
#print("One-error:", one_error_label_powerset)
print("Coverage:", round(coverage_label_powerset, 4))

# print("\nClassifier Chains:")
# print("Accuracy:", accuracy_classifier_chain)
# print("Precision:", precision_classifier_chain)
# print("Recall:", recall_classifier_chain)
# print("F1 Score:", f1_classifier_chain)
# print("Hamming Loss:", hamming_loss_classifier_chain)
# print("G-mean:", gmean_classifier_chain)
# #print("One-error:", one_error_classifier_chain)
# print("Coverage:", coverage_classifier_chain)

print("\nClassifier Chains:")
print("Accuracy:", round(accuracy_classifier_chain, 4))
print("Precision:", round(precision_classifier_chain, 4))
print("Recall:", round(recall_classifier_chain, 4))
print("F1 Score:", round(f1_classifier_chain, 4))
print("Hamming Loss:", round(hamming_loss_classifier_chain, 4))
print("G-mean:", round(gmean_classifier_chain, 4))
#print("One-error:", one_error_classifier_chain)
print("Coverage:", round(coverage_classifier_chain, 4))