In [1]:
import pandas as pd
# pandas (imported above): dataframe manipulation and reading in data
import numpy as np # for array manipulation
import json # for reading in Data
from itertools import islice # for slicing and dicing JSON records
import os # for getting the filepath information
import re # to identify characters that are to be removed
import nltk # for preprocessing of textual data
from nltk.corpus import stopwords # for removing stopwords
from nltk.tokenize import word_tokenize # for tokenizing text
from nltk.stem import WordNetLemmatizer # for lemmatizing text
from sklearn.feature_extraction.text import TfidfVectorizer # for featurizing text
from sklearn.metrics.pairwise import cosine_similarity # for getting similarity score
from sklearn.decomposition import PCA #for dimensionality reduction
from sklearn.cluster import KMeans #for clustering
from sklearn.manifold import TSNE #For reducing to 2 dimensions for plotting

In [4]:
# Load the papers CSV into a dataframe and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path, so the notebook cannot be
# re-run on another machine without editing it; consider a path relative to the project.
df = pd.read_csv('/Users/mac/Desktop/research_paper_recommendation/cs_papers_api.csv')
df.head()

Unnamed: 0,paper_id,title,abstract,year,primary_category,categories
0,2301.02657v1,TarViS: A Unified Approach for Target-based Vi...,The general domain of video segmentation is cu...,2023-01-06 18:59:52+00:00,cs.CV,cs.CV cs.AI cs.LG
1,2301.02642v1,Triple-stream Deep Metric Learning of Great Ap...,We propose the first metric learning system fo...,2023-01-06 18:36:04+00:00,cs.CV,cs.CV cs.AI cs.LG
2,2301.02610v1,Feedback-Gated Rectified Linear Units,Feedback connections play a prominent role in ...,2023-01-06 17:14:11+00:00,cs.NE,cs.NE cs.AI
3,2301.02593v1,Multi-Agent Reinforcement Learning for Fast-Ti...,To integrate high amounts of renewable energy ...,2023-01-06 16:41:51+00:00,cs.MA,cs.MA cs.AI cs.LG cs.SY
4,2301.02561v1,Multi-Vehicle Trajectory Prediction at Interse...,Traditional approaches to prediction of future...,2023-01-06 15:13:23+00:00,cs.RO,cs.RO cs.AI


In [5]:
# check for null values

df.isnull().sum()

paper_id            0
title               0
abstract            0
year                0
primary_category    0
categories          0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200094 entries, 0 to 200093
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   paper_id          200094 non-null  object
 1   title             200094 non-null  object
 2   abstract          200094 non-null  object
 3   year              200094 non-null  object
 4   primary_category  200094 non-null  object
 5   categories        200094 non-null  object
dtypes: object(6)
memory usage: 9.2+ MB


In [7]:
# Build one free-text field per paper by joining title and abstract with a single space.
# The cleaning cells below (lowercasing, punctuation and stop-word removal) all
# operate on this 'text' column.
df['text'] = df['title'] + ' ' + df['abstract']
df.head()

Unnamed: 0,paper_id,title,abstract,year,primary_category,categories,text
0,2301.02657v1,TarViS: A Unified Approach for Target-based Vi...,The general domain of video segmentation is cu...,2023-01-06 18:59:52+00:00,cs.CV,cs.CV cs.AI cs.LG,TarViS: A Unified Approach for Target-based Vi...
1,2301.02642v1,Triple-stream Deep Metric Learning of Great Ap...,We propose the first metric learning system fo...,2023-01-06 18:36:04+00:00,cs.CV,cs.CV cs.AI cs.LG,Triple-stream Deep Metric Learning of Great Ap...
2,2301.02610v1,Feedback-Gated Rectified Linear Units,Feedback connections play a prominent role in ...,2023-01-06 17:14:11+00:00,cs.NE,cs.NE cs.AI,Feedback-Gated Rectified Linear Units Feedback...
3,2301.02593v1,Multi-Agent Reinforcement Learning for Fast-Ti...,To integrate high amounts of renewable energy ...,2023-01-06 16:41:51+00:00,cs.MA,cs.MA cs.AI cs.LG cs.SY,Multi-Agent Reinforcement Learning for Fast-Ti...
4,2301.02561v1,Multi-Vehicle Trajectory Prediction at Interse...,Traditional approaches to prediction of future...,2023-01-06 15:13:23+00:00,cs.RO,cs.RO cs.AI,Multi-Vehicle Trajectory Prediction at Interse...


In [8]:
# Remove the metadata columns that the rest of the notebook does not use.
# NOTE: inplace=True mutates df, so running this cell a second time raises a
# KeyError because the columns are already gone.
df.drop(columns=['year', 'primary_category', 'categories'], inplace=True)

In [9]:
df.head()

Unnamed: 0,paper_id,title,abstract,text
0,2301.02657v1,TarViS: A Unified Approach for Target-based Vi...,The general domain of video segmentation is cu...,TarViS: A Unified Approach for Target-based Vi...
1,2301.02642v1,Triple-stream Deep Metric Learning of Great Ap...,We propose the first metric learning system fo...,Triple-stream Deep Metric Learning of Great Ap...
2,2301.02610v1,Feedback-Gated Rectified Linear Units,Feedback connections play a prominent role in ...,Feedback-Gated Rectified Linear Units Feedback...
3,2301.02593v1,Multi-Agent Reinforcement Learning for Fast-Ti...,To integrate high amounts of renewable energy ...,Multi-Agent Reinforcement Learning for Fast-Ti...
4,2301.02561v1,Multi-Vehicle Trajectory Prediction at Interse...,Traditional approaches to prediction of future...,Multi-Vehicle Trajectory Prediction at Interse...


## Data Cleaning

In [10]:
# Lowercase the combined title+abstract text; this overwrites the 'text' column.
df['text'] = df['text'].str.lower()

In [11]:
# remove punctuations 
def remove_punctuation(text):
    """Return ``text`` without any character that is not an ASCII letter, a digit or whitespace.

    Matching characters are deleted, not replaced by a space, so hyphenated
    words are fused together (``"target-based"`` becomes ``"targetbased"``).
    """
    unwanted = re.compile(r'[^a-zA-Z0-9\s]')
    cleaned = unwanted.sub('', text)
    return cleaned

# Strip punctuation from every row, overwriting the 'text' column.
df['text'] = df['text'].apply(remove_punctuation)

In [12]:
# remove stop words
# Stop-word sets keyed by language; filled lazily so the NLTK corpus is read only once.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loading it on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words removed.

    The text is split with ``word_tokenize``; tokens found in ``stop_words``
    are dropped and the remaining tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.
    stop_words : collection of str, optional
        Words to drop. Defaults to the NLTK English stop-word list, which is
        loaded once and reused across calls instead of being rebuilt for
        every row the function is applied to.

    Returns
    -------
    str
        The filtered text.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

# Remove stop words from every row, overwriting the 'text' column.
df['text'] = df['text'].apply(remove_stopwords)

In [13]:
# Tokenize the cleaned text into a list of words per row (input to the stemming step).
# Note that remove_stopwords already tokenized each row once and re-joined it.
df['tokenized_text'] = df['text'].apply(word_tokenize)

In [14]:
df.head()

Unnamed: 0,paper_id,title,abstract,text,tokenized_text
0,2301.02657v1,TarViS: A Unified Approach for Target-based Vi...,The general domain of video segmentation is cu...,tarvis unified approach targetbased video segm...,"[tarvis, unified, approach, targetbased, video..."
1,2301.02642v1,Triple-stream Deep Metric Learning of Great Ap...,We propose the first metric learning system fo...,triplestream deep metric learning great ape be...,"[triplestream, deep, metric, learning, great, ..."
2,2301.02610v1,Feedback-Gated Rectified Linear Units,Feedback connections play a prominent role in ...,feedbackgated rectified linear units feedback ...,"[feedbackgated, rectified, linear, units, feed..."
3,2301.02593v1,Multi-Agent Reinforcement Learning for Fast-Ti...,To integrate high amounts of renewable energy ...,multiagent reinforcement learning fasttimescal...,"[multiagent, reinforcement, learning, fasttime..."
4,2301.02561v1,Multi-Vehicle Trajectory Prediction at Interse...,Traditional approaches to prediction of future...,multivehicle trajectory prediction intersectio...,"[multivehicle, trajectory, prediction, interse..."


In [15]:
# stem text 
from nltk.stem import PorterStemmer


def stem_tokens(tokens):
    """Stem every token with a ``PorterStemmer`` and return the stems joined by single spaces."""
    stemmer = PorterStemmer()
    return ' '.join(map(stemmer.stem, tokens))

# Stem each token list and store the result as one space-joined string per row;
# 'stemmed_sentence' is the column every TF-IDF vectorizer below is fitted on.
df['stemmed_sentence'] = df['tokenized_text'].apply(stem_tokens)

In [16]:
df.head()

Unnamed: 0,paper_id,title,abstract,text,tokenized_text,stemmed_sentence
0,2301.02657v1,TarViS: A Unified Approach for Target-based Vi...,The general domain of video segmentation is cu...,tarvis unified approach targetbased video segm...,"[tarvis, unified, approach, targetbased, video...",tarvi unifi approach targetbas video segment g...
1,2301.02642v1,Triple-stream Deep Metric Learning of Great Ap...,We propose the first metric learning system fo...,triplestream deep metric learning great ape be...,"[triplestream, deep, metric, learning, great, ...",triplestream deep metric learn great ape behav...
2,2301.02610v1,Feedback-Gated Rectified Linear Units,Feedback connections play a prominent role in ...,feedbackgated rectified linear units feedback ...,"[feedbackgated, rectified, linear, units, feed...",feedbackg rectifi linear unit feedback connect...
3,2301.02593v1,Multi-Agent Reinforcement Learning for Fast-Ti...,To integrate high amounts of renewable energy ...,multiagent reinforcement learning fasttimescal...,"[multiagent, reinforcement, learning, fasttime...",multiag reinforc learn fasttimescal demand res...
4,2301.02561v1,Multi-Vehicle Trajectory Prediction at Interse...,Traditional approaches to prediction of future...,multivehicle trajectory prediction intersectio...,"[multivehicle, trajectory, prediction, interse...",multivehicl trajectori predict intersect use s...


In [17]:
# Drop the intermediate token lists; only the joined 'stemmed_sentence' is used from here on.
# NOTE: inplace=True mutates df, so re-running this cell raises a KeyError.
df.drop(columns=['tokenized_text'], inplace=True)

In [18]:
# Fit a TF-IDF model (default settings, no vocabulary cap) on the stemmed text.
# This vectorizer/matrix pair is what get_recommendations is called with below.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['stemmed_sentence'])

In [19]:
# generate keywords for recommendation 

def extract_and_append_keywords(df, max_features=10, top_n_keywords=5):
    """Add a ``keywords`` column holding each document's highest-scoring TF-IDF terms.

    A fresh ``TfidfVectorizer`` (English stop words removed, ``max_features``
    passed through) is fitted on ``df['stemmed_sentence']``. For every row,
    the terms with a positive TF-IDF score are ranked by score and the best
    ``top_n_keywords`` are stored as one comma-separated string. Terms that
    do not occur in a document (score 0) are never reported for it, so a row
    may get fewer than ``top_n_keywords`` keywords, or an empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``stemmed_sentence`` column. Modified in place; any
        index is supported because keywords are assigned by row position.
    max_features : int, default 10
        Forwarded to ``TfidfVectorizer``; limits the vocabulary and therefore
        the set of terms that can ever appear as keywords.
    top_n_keywords : int, default 5
        Maximum number of keywords per document.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``keywords`` column added or overwritten.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)

    # Keep the matrix sparse (CSR) and walk it row by row instead of
    # materialising a dense documents x features array.
    tfidf_matrix = vectorizer.fit_transform(df['stemmed_sentence']).tocsr()
    feature_names = np.asarray(vectorizer.get_feature_names_out())

    indptr, indices, data = tfidf_matrix.indptr, tfidf_matrix.indices, tfidf_matrix.data
    keywords = []
    for row in range(tfidf_matrix.shape[0]):
        start, stop = indptr[row], indptr[row + 1]
        cols, vals = indices[start:stop], data[start:stop]
        # Only terms that actually occur in this document can be its keywords.
        present = vals > 0
        cols, vals = cols[present], vals[present]
        # Highest score first; a stable sort keeps ties in stored column order.
        best = np.argsort(-vals, kind='stable')[:top_n_keywords]
        keywords.append(", ".join(feature_names[cols[best]]))

    # One positional assignment: no per-row .loc writes and no reliance on
    # the index being 0..n-1.
    df['keywords'] = keywords
    return df

# Apply the function to the DataFrame (adds the 'keywords' column).
# NOTE(review): max_features=15 is forwarded to TfidfVectorizer, so every paper's keywords
# are drawn from the same 15-term vocabulary — which is why the keywords in the output
# below look generic. Confirm this cap is intended.
df = extract_and_append_keywords(df, max_features=15, top_n_keywords=10)


df.head()


Unnamed: 0,paper_id,title,abstract,text,stemmed_sentence,keywords
0,2301.02657v1,TarViS: A Unified Approach for Target-based Vi...,The general domain of video segmentation is cu...,tarvis unified approach targetbased video segm...,tarvi unifi approach targetbas video segment g...,"approach, model, gener, perform, network, meth..."
1,2301.02642v1,Triple-stream Deep Metric Learning of Great Ap...,We propose the first metric learning system fo...,triplestream deep metric learning great ape be...,triplestream deep metric learn great ape behav...,"learn, propos, result, comput, approach, perfo..."
2,2301.02610v1,Feedback-Gated Rectified Linear Units,Feedback connections play a prominent role in ...,feedbackgated rectified linear units feedback ...,feedbackg rectifi linear unit feedback connect...,"network, perform, propos, use, result, problem..."
3,2301.02593v1,Multi-Agent Reinforcement Learning for Fast-Ti...,To integrate high amounts of renewable energy ...,multiagent reinforcement learning fasttimescal...,multiag reinforc learn fasttimescal demand res...,"learn, perform, approach, gener, propos, resul..."
4,2301.02561v1,Multi-Vehicle Trajectory Prediction at Interse...,Traditional approaches to prediction of future...,multivehicle trajectory prediction intersectio...,multivehicl trajectori predict intersect use s...,"approach, use, network, result, propos, proble..."


In [29]:
def get_recommendations(value, tfidf_vectorizer, tfidf_matrix, df, num_recommendations):
    """Return the papers most similar to a paper id or to a free-text query.

    Parameters
    ----------
    value : str
        Either a ``paper_id`` present in ``df`` (its stored ``keywords`` are
        used as the query) or free text.
    tfidf_vectorizer : fitted vectorizer
        Used to turn the query into a TF-IDF vector.
    tfidf_matrix : matrix, shape (len(df), n_features)
        TF-IDF vectors of the papers, one row per row of ``df`` in the same
        order; must have been produced by ``tfidf_vectorizer``.
    df : pandas.DataFrame
        Needs ``paper_id``, ``title``, ``abstract`` and ``keywords`` columns.
    num_recommendations : int
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        ``paper_id``, ``title`` and ``abstract`` of the best matches, most
        similar first.

    Raises
    ------
    ValueError
        If the query vector and ``tfidf_matrix`` have different numbers of
        features, i.e. the vectorizer and matrix do not belong together.
    """
    if value in df['paper_id'].values:
        # Known paper: query with the keywords already stored for it.
        keywords = df.loc[df['paper_id'] == value, 'keywords'].values[0]
    else:
        # Free text: run it through the same cleaning steps as the corpus
        # (lowercase -> punctuation -> stop words -> stemming), using the helper
        # functions defined earlier in this notebook. Without stemming, a
        # query such as "networks" has no match in the stemmed vocabulary.
        query = remove_stopwords(remove_punctuation(value.lower()))
        keywords = stem_tokens(word_tokenize(query))

    # Transform the query into a TF-IDF vector (left sparse; no dense copy needed).
    keyword_vector = tfidf_vectorizer.transform([keywords])
    if keyword_vector.shape[1] != tfidf_matrix.shape[1]:
        # Truncating columns here would silently compare unrelated features.
        raise ValueError(
            "tfidf_vectorizer produces %d features but tfidf_matrix has %d columns"
            % (keyword_vector.shape[1], tfidf_matrix.shape[1])
        )

    # Cosine similarity between the query and every paper, flattened to 1-D.
    similarities = np.asarray(cosine_similarity(keyword_vector, tfidf_matrix)).ravel()

    # Row positions of the best matches, most similar first.
    top_n_idx = np.argsort(-similarities)[:num_recommendations]

    # These are positions in tfidf_matrix / df, not index labels, hence iloc.
    rec_paper_id_title = df.iloc[top_n_idx][['paper_id', 'title', 'abstract']]
    return rec_paper_id_title

# Example usage:
recommendations = get_recommendations('network', tfidf_vectorizer, tfidf_matrix, df, 5)
recommendations.head()

Unnamed: 0,paper_id,title,abstract
127577,2209.08294v1,A Survey on the Network Models applied in the ...,Network architecture design is very important ...
132102,1805.07746v2,Network Reconstruction and Controlling Based o...,"From the perspective of network analysis, the ..."
199704,1703.06231v1,Network Comparison: Embeddings and Interiors,This paper presents methods to compare network...
155770,1609.09739v1,"Graphs, Ideal Flow, and the Transportation Net...",This lecture discusses the mathematical relati...
131307,1904.08547v1,Deep Representation Learning for Social Networ...,Social network analysis is an important proble...


In [21]:
# Second TF-IDF model, fitted on the same stemmed text but with max_features=10000.
# Its matrix (tfidf_matrix2) supplies the features for the PCA / clustering cells below.
tfidf_vectorizer2 = TfidfVectorizer(max_features=10000)

# Generate the tf-idf vectors for the data
tfidf_matrix2 = tfidf_vectorizer2.fit_transform(df['stemmed_sentence'])

In [30]:
# Fetch 1000 recommendations for one paper, then pull the matching rows out of tfidf_matrix2.
rec = get_recommendations('2301.02401v1',tfidf_vectorizer, tfidf_matrix,df,1000)
# idxs holds df index labels in df order (not in recommendation-rank order).
idxs = list(df[df['paper_id'].isin(rec['paper_id'])].index)
# NOTE(review): the labels are used as row positions here, which assumes df still has its
# default 0..n-1 index — confirm if df is ever filtered or re-indexed upstream.
rec_matrix = tfidf_matrix2[idxs]


In [33]:
rec_matrix.shape

(1000, 10000)

In [31]:
# Reduce the recommended papers' TF-IDF features with PCA before clustering.
pca = PCA(n_components=0.95, random_state=42) #Keep 95% of the variance
# rec_matrix is converted to a dense array with .toarray() before it is passed to PCA.
reduced_matrix = pca.fit_transform(rec_matrix.toarray())

In [None]:
# Cluster the PCA-reduced papers into k groups; k is reused by the topic-modelling cells below.
k = 10 # selectable
kmeans = KMeans(n_clusters=k, random_state=42)
# y_pred[i] is the cluster label of row i of reduced_matrix.
y_pred = kmeans.fit_predict(reduced_matrix)



In [None]:
# Project the PCA-reduced papers with t-SNE; the first two output columns are the
# x/y coordinates of the scatter plot at the end of the notebook.
tsne = TSNE(perplexity=100, random_state=42)
two_dim_matrix = tsne.fit_transform(reduced_matrix)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

## Topic Modelling

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# One independent, unfitted CountVectorizer per cluster, so each cluster gets its own vocabulary.
# The token pattern is written as a raw string: in a normal string literal '\-' is an
# invalid escape sequence (it triggers a warning at compile time), while the raw
# string yields exactly the same pattern text.
vectorizers = [
    CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True,
                    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
    for _ in range(k)
]

In [None]:
# Rows of df that were recommended, selected once instead of re-evaluating the same
# isin() filter over the whole frame for every column.
rec_rows = df[df['paper_id'].isin(rec['paper_id'])]

topic_df = pd.DataFrame()
topic_df['id'] = rec_rows['paper_id']
topic_df['title'] = rec_rows['title']
# Raw (uncleaned) title + abstract: the per-cluster CountVectorizers do their own tokenizing.
topic_df['text'] = rec_rows['title'] + " " + rec_rows['abstract']
# y_pred comes from the matrix rows picked with the same filter, so it is expected to
# line up with rec_rows row for row.
topic_df['cluster'] = y_pred


In [None]:
topic_df.head()

Unnamed: 0,id,title,text,cluster
117,2301.00519v1,Holistic Network Virtualization and Pervasive ...,Holistic Network Virtualization and Pervasive ...,7
199,2212.14278v1,Industrial Scene Change Detection using Deep C...,Industrial Scene Change Detection using Deep C...,4
207,2301.00676v1,Multimodal Sequential Generative Models for Se...,Multimodal Sequential Generative Models for Se...,0
323,2212.12070v1,RouteNet-Fermi: Network Modeling with Graph Ne...,RouteNet-Fermi: Network Modeling with Graph Ne...,3
410,2212.10649v1,Inversion of Bayesian Networks,Inversion of Bayesian Networks Variational aut...,7


In [None]:
# Fit each cluster's CountVectorizer on that cluster's documents.
# vectorized_data[i] is the count matrix of cluster i, or None when it could not be built.
vectorized_data = []

for current_cluster, cvec in enumerate(vectorizers):
    cluster_text = topic_df.loc[topic_df['cluster'] == current_cluster, 'text']
    try:
        vectorized_data.append(cvec.fit_transform(cluster_text))
    except ValueError as e:
        # CountVectorizer raises ValueError when no terms are left for a cluster
        # (e.g. too few documents for min_df). Only that case is tolerated; any
        # other error is a real bug and is allowed to propagate. The underlying
        # message is printed so the reason is not hidden.
        print("Not enough instances in cluster: " + str(current_cluster) + " (" + str(e) + ")")
        vectorized_data.append(None)

In [None]:
NUM_TOPICS_PER_CLUSTER = 5 #choose

# One unfitted Latent Dirichlet Allocation model per cluster, all with identical settings.
lda_models = [
    LatentDirichletAllocation(n_components=NUM_TOPICS_PER_CLUSTER, max_iter=10, learning_method='online',verbose=False, random_state=42)
    for _ in range(0, k)
]

In [None]:
# Fit each cluster's LDA model on that cluster's count matrix and keep the
# document-topic output. Clusters without a count matrix are skipped, so this
# list can be shorter than lda_models.
clusters_lda_data = []

for current_cluster, lda in enumerate(lda_models):
    # Identity test: `!= None` would go through the sparse matrix's own
    # comparison operator instead of simply checking for the None placeholder.
    if vectorized_data[current_cluster] is not None:
        clusters_lda_data.append(lda.fit_transform(vectorized_data[current_cluster]))


In [None]:
def selected_topics(model, vectorizer, top_n=3):
    """Return the distinct top words of a fitted topic model as one space-separated string.

    For every topic in ``model.components_`` the ``top_n`` highest-weighted
    vocabulary entries are taken. A word is kept only the first time it is
    seen (with the weight from that topic), and the kept words are returned
    ordered from highest to lowest weight.

    Parameters
    ----------
    model : fitted model exposing ``components_`` (topics x vocabulary weights)
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()``
    top_n : int, default 3
        Number of words taken from each topic.

    Returns
    -------
    str
        The selected words joined by single spaces.
    """
    # Fetch the vocabulary once; it was previously rebuilt for every selected word.
    feature_names = vectorizer.get_feature_names_out()

    seen = set()
    keywords = []
    for topic in model.components_:
        # argsort is ascending, so this slice walks the top_n indices from the largest weight down.
        for i in topic.argsort()[:-top_n - 1:-1]:
            word = feature_names[i]
            if word not in seen:
                seen.add(word)
                keywords.append((word, topic[i]))

    # Ascending stable sort followed by reverse, kept as two steps so that
    # equally-weighted words come out in the same order as before.
    keywords.sort(key=lambda pair: pair[1])
    keywords.reverse()
    return " ".join(word for word, _ in keywords)


In [None]:
# Keyword string for every cluster that has a count matrix, in cluster order.
# Clusters whose vectorized_data entry is None are skipped, so this list can be
# shorter than lda_models.
all_keywords = []
for current_vectorizer, lda in enumerate(lda_models):
    # Identity test: `!= None` would go through the sparse matrix's own
    # comparison operator instead of simply checking for the None placeholder.
    if vectorized_data[current_vectorizer] is not None:
        all_keywords.append(selected_topics(lda, vectorizers[current_vectorizer]))


In [None]:
# all_keywords has one entry per cluster that was successfully vectorized, in cluster
# order. Pair it with exactly those cluster ids rather than with range(k), which
# raises IndexError as soon as one cluster was skipped.
fitted_clusters = [c for c, data in enumerate(vectorized_data) if data is not None]
cluster_keyword = dict(zip(fitted_clusters, all_keywords))
# Skipped clusters get a placeholder label so their points still carry a label in the plot.
for c in range(k):
    cluster_keyword.setdefault(c, "cluster " + str(c) + " (no topics)")
word_pred = list(map(cluster_keyword.get, y_pred))
topic_df['keywords'] = word_pred


In [None]:
# Scatter plot of the t-SNE coordinates, one point per recommended paper, coloured by
# the keyword string of the paper's cluster; id and title are shown on hover.
fig = px.scatter(topic_df, x=two_dim_matrix[:,0], y=two_dim_matrix[:,1], color='keywords',
                 hover_data=['id','title'],
                 height= 700, width=1200,
                title = "Clustered Papers",)

fig.update_layout(
    coloraxis_colorbar=dict(
        title_font=dict(size=20), # Set the font size of the color legend title to 20
        tickfont=dict(size=16) # Set the font size of the color legend tick labels to 16
    ),
    legend=dict(
        font=dict(size=11) # Set the font size of the marker legend to 11
    )
)
# Place the legend horizontally below the plot area.
fig.update_layout(
    legend=dict(orientation="h", yanchor="bottom", y=-0.3, xanchor="right", x=1)
)
fig.show()

In [None]:

# Save the cleaned dataframe used for the recommendation to the current working directory.
# index=True also writes the row index as an extra first column of the CSV.
df.to_csv('cleaned_data.csv', index=True)