# Project Publications

The objective of this experiment is to exploit the semantic information contained in the titles of research papers, together with their associated research fields. The goal is to map each paper title to a 2D point (i.e., to project it into a 2D research space), yielding a compact yet meaningful representation.


In [8]:
import ast
import numpy as np
import pandas as pd
from tqdm import tqdm

import nltk
import string
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

porter = PorterStemmer()

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

# Invariant across calls, so built once instead of once per title.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, loading it on first use only."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_title(title):
    """Normalise a title into a space-separated string of stemmed tokens.

    The text is tokenised with ``nltk.word_tokenize``; each token then has
    every ``string.punctuation`` character removed and is lower-cased.
    Tokens that end up empty or that are English stop words are dropped,
    and the remaining ones are stemmed with the module-level Porter stemmer.

    Args:
        title: Raw text to normalise.

    Returns:
        The surviving stems joined by single spaces (an empty string when
        no token survives).
    """
    stop_words = _english_stop_words()
    stems = []
    for token in nltk.word_tokenize(title):
        # Same order as before: strip punctuation, lower-case, filter, stem.
        token = token.translate(_PUNCTUATION_TABLE).lower()
        if token and token not in stop_words:
            stems.append(porter.stem(token))
    return ' '.join(stems)

# Build one text per paper: the title followed by its research fields
# (parsed from their literal string form and joined with ", "), then
# normalised with preprocess_title. Only the two columns that are read are
# iterated, which avoids creating a Series per row as iterrows() does.
titles = [
    preprocess_title(f"{title} {', '.join(ast.literal_eval(fields))}")
    for title, fields in zip(df['Title'], df['Research Fields'])
]

# Calculate TF-IDF matrix (rows are titles and columns are words)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(titles)

# Reduce dimensionality of each title (row) to 2 using TSNE.
# random_state is pinned to 42 so the same seed is used on every run.
# NOTE(review): X.toarray() materialises the whole TF-IDF matrix as a dense
# array before fitting; presumably fine for this dataset size - confirm if
# the number of papers or the vocabulary grows.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X.toarray())

# Add the projection of the papers to the 2D research space to a copy of df,
# both as a single [x, y] list column and as two separate float columns.
projected_publications_df = df.copy()
projected_publications_df['2D Points'] = X_tsne.tolist()
# Slice the coordinates directly from the array instead of round-tripping
# through a list-built DataFrame; astype(float) yields float64 columns.
projected_publications_df['x'] = X_tsne[:, 0].astype(float)
projected_publications_df['y'] = X_tsne[:, 1].astype(float)

# Save new data frame (the row index is not written)
projected_publications_df.to_csv('./../dat/ProjectedPublications.csv', index=False)