# Content-Based Research Paper Recommender #

## Prepare Tools ##

In [1]:
import pandas as pd
import numpy as np

import os
from fnmatch import fnmatch
from pathlib import PurePath

import urllib.request # download files 
import subprocess # operating system command

import re

np.random.seed(0)

## Prepare Data ##

### Download & unzip research paper dataset ###

In [2]:
# set dataset path
working_dir = "/home/lee/Documents/Datasets for GitHub/scholarly_paper_recommendation/"

# download file
url = 'http://www.comp.nus.edu.sg/%7Esugiyama/SchRecData/20100825-SchPaperRecData.zip'  
urllib.request.urlretrieve(url, working_dir+'20100825-SchPaperRecData.zip')

# unzip file
subprocess.run(["unzip", working_dir+"20100825-SchPaperRecData.zip", "-d", working_dir])  

del url
os.remove( working_dir+'20100825-SchPaperRecData.zip')

### Load all the vectors that each represents a candidate paper to recommend ###

In [2]:
def all_file_names_in_list(folder_name, pattern):
    all_rec_name = []
    for path, subdirs, files in os.walk(folder_name):
        for name in files:
            if fnmatch(name, pattern):
                all_rec_name.append(PurePath(path, name))
                
    return all_rec_name

In [5]:
# list of the full path of all files in the paper directory
all_paper_files = all_file_names_in_list(working_dir+'20100825-SchPaperRecData/RecCandidatePapersFV/', "*_recfv.txt")
# list, convert PurePath object to string
all_paper_files = [str(x) for x in all_paper_files]
# list of tuples, [(paper id like "P00-1001", paper file full path), (paper id like "P00-1002", paper file full path)]
all_paper_files = list(zip([x[117:125] for x in all_paper_files], all_paper_files))

# structure of all_df_original (which is a list):
# all_df_original[0] is a dataframe with 2 columns
# token tfidf_weight
# AAA 0.01
all_df_original = [pd.read_csv(file, names=['token', 'tfidf_weight'], \
                 delim_whitespace=True, index_col=False, header=None) for file in list(zip(*all_paper_files))[1]]

# list of dataframes
# all_rec_paper[0] is dataframe with 1 column, token is column name
# AAA
# 0.01
all_rec_paper = [x.set_index('token').transpose().rename_axis('', axis="columns").reset_index(drop=True)\
                     for x in all_df_original]

# for i in range(len(all_rec_paper)):
#     if (all_rec_paper[i].columns[0] == 'a'):
#         if (all_rec_paper[i].loc[0, 'a'] > 0.5):
#             all_rec_paper[i].drop('a', axis=1, inplace=True)
        
del all_df_original

### Load all the vectors that each represents a researcher's interest ###

Here a researcher's past published papers represent their research interest.

In [7]:
# list of the full path of all files in the junior researcher directory
all_researcher_files = all_file_names_in_list(working_dir+'20100825-SchPaperRecData/JuniorR/', "*-?_fv.txt")
# list, convert PurePath object to string
all_researcher_files = [str(x) for x in all_researcher_files]
# list of tuples, the regex extracts the researcher id from path,
# then [(researcher id like "y1", paper file full path), (researcher id like "y2", paper file full path)]
all_researcher_files = list(zip([re.findall(re.escape(working_dir+"20100825-SchPaperRecData/JuniorR/")\
                                            +"([^/]+)"+re.escape("/"), x)[0] for x in all_researcher_files],\
                                all_researcher_files))

# list of dataframes
# all_researcher_original[0] is a dataframe with 2 columns
# token tf_weight
# AAA 0.01
all_researcher_original = [pd.read_csv(file, names=['token', 'tf_weight'], \
                 delim_whitespace=True, index_col=False, header=None) for file in list(zip(*all_researcher_files))[1]]
# list of dataframes
# all_researcher_paper[0] is dataframe with 1 column, token is column name
# AAA
# 0.01
all_researcher_paper = [x.set_index('token').transpose().rename_axis('', axis="columns").reset_index(drop=True)\
                     for x in all_researcher_original]

del all_researcher_original

## Make Recommendations ##

### Helper ###

In [8]:
def cosine_similarity(x, y):
    """calculate cosine similarity based on formula from wikipedia
    https://en.wikipedia.org/wiki/Cosine_similarity
    """
    try:
        numerator =  np.sqrt(x.dot(y))
    except AttributeError:
        numerator = 0
        
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    
    try:
        cosine_similarity = round(numerator/float(denominator), 3)
    except ZeroDivisionError:
        cosine_similarity = 0

    return cosine_similarity

### Calculate similarity scores between a researcher's interest and all candidate papers ###

In [19]:
def n_most_similar_papers_selected_researcher(n_most, researcher):
    rec_score = dict()
    
    # find where the selected researcher is in the list all_researcher_paper
    researcher_index = list(zip(*all_researcher_files))[0].index(researcher)
    
    for i in range(len(all_rec_paper)):
        together = None
        together = pd.concat([all_rec_paper[i], all_researcher_paper[researcher_index]], axis=0, join='inner')   
        if (len(together.iloc[0, :]) > 0) & (len(together.iloc[1, :]) > 0):
            rec_score[all_paper_files[i][0]] = cosine_similarity(together.iloc[0, :], together.iloc[1, :])
        else:             
            rec_score[all_paper_files[i][0]] = 0
        
    # multiple keys have the same value    
    result_unordered = list((key, value) for key, value in rec_score.items() \
            if value in sorted(set(rec_score.values()), reverse=True)[:n_most])
    result = sorted(result_unordered, key=lambda x: x[1], reverse=True)
    
    return result    

In [22]:
print("5 recommended papers for researcher y3: {}".format(n_most_similar_papers_selected_researcher(5, 'y3')))

5 recommended papers: [('P00-1017', 1509.163), ('P00-1042', 1375.749), ('P00-1027', 1225.655), ('P00-1011', 1025.755), ('P00-1004', 873.019)]
