In [4]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import random
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
train_data = pd.read_csv('data/train.csv')
train_data['TEXT'] = train_data['TITLE'] + ' ' + train_data['ABSTRACT']

X = train_data['TEXT']
y = train_data[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

# Split the dataset
X_train, X_test, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Transform the input features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Helper function to identify minority labels
def get_tail_labels(y):
    tail_labels = [i for i in range(y.shape[1]) if np.sum(y[:, i]) < (y.shape[0] / 2)]
    return tail_labels

# class distribution before applying dynamic MLSMOTE
print("Class distribution before applying dynamic MLSMOTE:")
for i, label in enumerate(['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']):
    print(f"{label}: {np.sum(y_train[:, i])}")

# Dynamic MLSMOTE function
def dynamic_MLSMOTE(X, y, target_balance=4500):
    n_neighbors = min(5, X.shape[0] - 1)
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(X)
    tail_labels = get_tail_labels(y)
    synthetic_samples = []
    synthetic_labels = []

    for i in tail_labels:
        current_count = np.sum(y[:, i])
        n_samples = max(target_balance - current_count, 0)  # Calculate the number of samples to generate
        target_indices = np.where(y[:, i] == 1)[0]
        
        if len(target_indices) >= n_neighbors:
            nn = neigh.kneighbors(X[target_indices], return_distance=False)
            for _ in range(n_samples):
                sample_index = random.choice(range(len(target_indices)))
                nn_indices = nn[sample_index, 1:]
                chosen_nn = random.choice(nn_indices)
                step = np.random.rand()
                synthetic_sample = X[target_indices[sample_index]] + step * (X[chosen_nn] - X[target_indices[sample_index]])
                synthetic_samples.append(synthetic_sample)
                synthetic_label = y[target_indices[sample_index]].copy()
                synthetic_labels.append(synthetic_label)
    
    if len(synthetic_samples) > 0:
        X_synthetic = np.vstack(synthetic_samples)
        y_synthetic = np.vstack(synthetic_labels)
        X_balanced = np.vstack((X, X_synthetic))
        y_balanced = np.vstack((y, y_synthetic))
        return X_balanced, y_balanced
    else:
        return X, y

# Convert y_train to numpy array for processing
y_train_np = y_train

# Adjust this target balance
target_balance = 4500  
X_balanced_wv, y_balanced = dynamic_MLSMOTE(X_train_tfidf, y_train_np, target_balance=target_balance)

# class distribution after applying dynamic MLSMOTE
print("\n")
print("Class distribution after applying dynamic MLSMOTE:")
for i, label in enumerate(['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']):
    print(f"{label}: {np.sum(y_balanced[:, i])}")


Class distribution before applying dynamic MLSMOTE:
Computer Science: 6902
Physics: 4787
Mathematics: 4468
Statistics: 4137
Quantitative Biology: 465
Quantitative Finance: 204


Class distribution after applying dynamic MLSMOTE:
Computer Science: 7570
Physics: 4801
Mathematics: 4575
Statistics: 5683
Quantitative Biology: 4557
Quantitative Finance: 4536
