In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, hamming_loss
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from skmultilearn.adapt import MLkNN


train_data = pd.read_csv('train1.csv')
test_data = pd.read_csv('test.csv')

# Combine TITLE and ABSTRACT into a single 'Text' column
train_data['Text'] = train_data['TITLE'] + ' ' + train_data['ABSTRACT']

# Remove stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

train_data['Text'] = train_data['Text'].apply(lambda x: remove_stopwords(x))

# Stemming
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stem_sentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stem_sentence += stem
        stem_sentence += " "
    stem_sentence = stem_sentence.strip()
    return stem_sentence

train_data['Text'] = train_data['Text'].apply(stemming)

# Define categories
categories = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(train_data['Text'], train_data[categories], test_size=0.2, random_state=40, shuffle=True)

# TF-IDF Transformation
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

# Convert labels to NumPy array
y_train_np = np.array(y_train)
y_test_np = np.array(y_test)

# Multilabel Text Classification using MLkNN
# Initialize the MLkNN classifier
classifier = MLkNN(k=10)  # You can adjust the value of k as needed

# Fit the classifier on the TF-IDF transformed training data
classifier.fit(x_train_tfidf, y_train_np)

# Predict the labels for the test data
predictions = classifier.predict(x_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test_np, predictions)
hamming_loss_value = hamming_loss(y_test_np, predictions)

print(f"Accuracy: {accuracy}")
print(f"Hamming Loss: {hamming_loss_value}")


Accuracy: 0.6164481525625745
Hamming Loss: 0.09368295589988081
