In [94]:
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import string
import nltk
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score



In [95]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/espensele/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [96]:

data_path = 'data'
train_file = 'arxiv_train.csv'

train_path = f'{data_path}/{train_file}'

In [97]:
df_train = pd.read_csv(train_path)
df_train = df_train.dropna()
X = df_train["abstract"]
y = df_train["label"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [98]:
def clean_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()

    stop_words = set(stopwords.words('english'))

    text = ' '.join([word for word in text.split() if word not in stop_words])

    return text

In [99]:
def preprocess_text_series(text_series: pd.Series) -> pd.Series:
    return text_series.apply(clean_text)

In [100]:
X_train_clean = preprocess_text_series(X_train)
X_val_clean = preprocess_text_series(X_val)

In [101]:
print(X_train_clean.head())

15871    prototype largesized telescope lst cherenkov t...
77097    segment routing ipv6 srv6 short networking sol...
2817     consider problem identifying stable sets mutua...
64539    study rotor walk deterministic counterpart sim...
28536    conference paper present gambit global modular...
Name: abstract, dtype: object


In [102]:
print(f"\nTraining set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")


Training set size: 64000
Validation set size: 16000


In [103]:
count_vectorizer = CountVectorizer(max_features=5000)
X_train_count_vec = count_vectorizer.fit_transform(X_train_clean)
# X_val_count_vec = count_vectorizer.transform(X_val_clean) Not validated on the validation set, but test set


In [104]:
mlp_CountVectorizer_model = MLPClassifier(
    max_iter=200,              # max iterations at 50, 200 default
    random_state=42,
    verbose=True               # Enable progress messages
)

In [105]:
mlp_CountVectorizer_model.fit(X_train_count_vec, y_train)

Iteration 1, loss = 0.70701257
Iteration 2, loss = 0.41558451
Iteration 3, loss = 0.32354242
Iteration 4, loss = 0.25080606
Iteration 5, loss = 0.18888044
Iteration 6, loss = 0.13394026
Iteration 7, loss = 0.09111206
Iteration 8, loss = 0.05964045
Iteration 9, loss = 0.03794497
Iteration 10, loss = 0.02445235
Iteration 11, loss = 0.01649871
Iteration 12, loss = 0.01159696
Iteration 13, loss = 0.00866787
Iteration 14, loss = 0.00671864
Iteration 15, loss = 0.00537779
Iteration 16, loss = 0.00441007
Iteration 17, loss = 0.00368112
Iteration 18, loss = 0.00313037
Iteration 19, loss = 0.00270891
Iteration 20, loss = 0.00238832
Iteration 21, loss = 0.00213814
Iteration 22, loss = 0.00194031
Iteration 23, loss = 0.00178223
Iteration 24, loss = 0.00165673
Iteration 25, loss = 0.00155846
Iteration 26, loss = 0.00147637
Iteration 27, loss = 0.00140981
Iteration 28, loss = 0.00135418
Iteration 29, loss = 0.00130622
Iteration 30, loss = 0.00126456
Iteration 31, loss = 0.00122806
Iteration 32, los

# Reading the test set

In [106]:
#Load the test set
test_file = 'arxiv_test.csv'
test_path = f'{data_path}/{test_file}'
df_test = pd.read_csv(test_path)
df_test = df_test.dropna()

X_test = df_test["abstract"]
y_test = df_test["label"]

X_test_clean = preprocess_text_series(X_test)

In [107]:
X_test_count = count_vectorizer.transform(X_test_clean)
y_test_pred = mlp_CountVectorizer_model.predict(X_test_count)

In [108]:

accuracy_test_Count = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {accuracy_test_Count:.4f}")

Test Accuracy: 0.7844


In [109]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_clean)
# X_val_tfidf = tfidf_vectorizer.transform(X_val_clean) Not validated on the validation set, but test set

In [110]:
mlp_TFIDFVectorizer_model = MLPClassifier(
    max_iter=200,               # max iterations at 200 default
    random_state=42,
    verbose=True               # Enable progress messages
)

In [111]:
mlp_TFIDFVectorizer_model.fit(X_train_tfidf, y_train)  # Use TF-IDF features

Iteration 1, loss = 1.05343503
Iteration 2, loss = 0.48231288
Iteration 3, loss = 0.40716897
Iteration 4, loss = 0.36208713
Iteration 5, loss = 0.32904035
Iteration 6, loss = 0.30192126
Iteration 7, loss = 0.27832805
Iteration 8, loss = 0.25823136
Iteration 9, loss = 0.23921568
Iteration 10, loss = 0.22188867
Iteration 11, loss = 0.20475255
Iteration 12, loss = 0.18871222
Iteration 13, loss = 0.17330708
Iteration 14, loss = 0.15838485
Iteration 15, loss = 0.14398416
Iteration 16, loss = 0.12977791
Iteration 17, loss = 0.11644768
Iteration 18, loss = 0.10409646
Iteration 19, loss = 0.09239820
Iteration 20, loss = 0.08122795
Iteration 21, loss = 0.07124473
Iteration 22, loss = 0.06211287
Iteration 23, loss = 0.05402098
Iteration 24, loss = 0.04707852
Iteration 25, loss = 0.04092678
Iteration 26, loss = 0.03563773
Iteration 27, loss = 0.03114094
Iteration 28, loss = 0.02731591
Iteration 29, loss = 0.02417863
Iteration 30, loss = 0.02157519
Iteration 31, loss = 0.01928945
Iteration 32, los

In [112]:
X_test_idf = tfidf_vectorizer.transform(X_test_clean)
y_test_pred_tdIDF = mlp_TFIDFVectorizer_model.predict(X_test_idf)

In [113]:
accuracy_test_idf = accuracy_score(y_test, y_test_pred_tdIDF)
print(f"Test Accuracy: {accuracy_test_idf:.4f}")

Test Accuracy: 0.7816


In [114]:
#CountAccuracy: 0.7844, default layers
#TFAccuracy: 0.7816, default layers