# Researcher Paper Recommender #

## Prepare Tools ##

In [182]:
# utility
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import itertools

import os
from fnmatch import fnmatch
from pathlib import PurePath

import urllib.request # download files 
import subprocess # operating system command

########

from sklearn.model_selection import train_test_split
########

## Prepare Data ##

### Download & unzip research paper dataset ###

In [183]:
# set dataset path
working_dir = "/home/lee/Documents/Datasets for GitHub/scholarly_paper_recommendation/"

# download file
url = 'http://www.comp.nus.edu.sg/%7Esugiyama/SchRecData/20100825-SchPaperRecData.zip'  
urllib.request.urlretrieve(url, working_dir+'20100825-SchPaperRecData.zip')

# unzip file
subprocess.run(["unzip", working_dir+"20100825-SchPaperRecData.zip", "-d", working_dir])  

del url

KeyboardInterrupt: 

### Load all the vectors that each represents a candidate paper to recommend ###

In [386]:
def all_file_names_in_list(folder_name, pattern):
    all_rec_name = []
    for path, subdirs, files in os.walk(folder_name):
        for name in files:
            if fnmatch(name, pattern):
                all_rec_name.append(PurePath(path, name))
                
    return all_rec_name

In [None]:
# list of the full path of all files in the paper directory
all_paper_files = all_file_names_in_list(working_dir+'20100825-SchPaperRecData/RecCandidatePapersFV/', "*_recfv.txt")
# list, convert PurePath object to string
all_paper_files = [str(x) for x in all_paper_files]
# list of tuples, [(paper id like "P00-1001", paper file full path), (paper id like "P00-1002", paper file full path)]
all_paper_files = list(zip([x[117:125] for x in all_paper_files], all_paper_files))

"""structure of all_df_original (which is a list):
df[0] is a dataframe with 2 columns
token tfidf_weight
AAA 0.01
"""
all_df_original = [pd.read_csv(file, names=['token', 'tfidf_weight'], \
                 delim_whitespace=True, index_col=False, header=None) for file in list(zip(*all_paper_files))[1]]

""" list
dataframe with 1 column, token is column name
AAA
0.01
"""
all_rec_paper = [x.set_index('token').transpose().rename_axis('', axis="columns").reset_index(drop=True)\
                     for x in all_df_original]

del all_df_original

### Load all the vectors that each represents a researcher's interest ###

Here a researcher's past published papers represent his or her research interest.  

In [None]:
# list of the full path of all files in the junior researcher directory
all_researcher_files = all_file_names_in_list(working_dir+'20100825-SchPaperRecData/JuniorR/', "*-?_fv.txt")
# list, convert PurePath object to string
all_researcher_files = [str(x) for x in all_researcher_files]
# list of tuples, the regex extracts the researcher id from path,
# then [(researcher id like "y1", paper file full path), (researcher id like "y2", paper file full path)]
all_researcher_files = list(zip([re.findall(re.escape(working_dir+"20100825-SchPaperRecData/JuniorR/")\
                                            +"([^/]+)"+re.escape("/"), x)[0] for x in all_researcher_files],\
                                all_researcher_files))

"""structure of all_df_original (which is a list):
df[0] is a dataframe with 2 columns
token tfidf_weight
AAA 0.01
"""
all_researcher_original = [pd.read_csv(file, names=['token', 'tf_weight'], \
                 delim_whitespace=True, index_col=False, header=None) for file in list(zip(*all_researcher_files))[1]]
""" list
dataframe with 1 column, token is column name
AAA
0.01
"""
all_researcher_paper = [x.set_index('token').transpose().rename_axis('', axis="columns").reset_index(drop=True)\
                     for x in all_researcher_original]

del all_researcher_original

## Make Recommendations ##

### Helper ###

In [397]:
# def magnitude(v):
#     return round(np.sqrt(sum([a * a for a in v])), 3)

def cosine_similarity(x, y):
    """calculate cosine similarity based on formula from wikipedia
    https://en.wikipedia.org/wiki/Cosine_similarity
    """
#     numerator = sum(a * b for a, b in zip(x, y))
#     denominator = square_rooted(x) * square_rooted(y)
    numerator =  np.sqrt(x.dot(y))
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    
    try:
        cosine_similarity = round(numerator/float(denominator), 3)
    except ZeroDivisionError:
        cosine_similarity = 0

    return cosine_similarity

### Calculate similarity scores between a researcher's interest and all candidate papers ###

In [396]:
def n_most_similar_papers_selected_researcher(n_most, researcher):
    rec_score = dict()
    
    # find where the selected researcher is in the list all_researcher_paper
    researcher_index = list(zip(*all_researcher_files))[0].index(researcher)
    
    for i in range(len(all_rec_paper)):
        together = pd.concat([all_rec_paper[i], all_researcher_paper[researcher_index]], axis=0, join='inner')    
        rec_score[all_paper_files[i][0]] = cosine_similarity(together.iloc[0, :], together.iloc[1, :])
        
    # multiple keys have the same value    
    return {key: value for key, value in rec_score.items() \
            if value in sorted(set(rec_score.values()), reverse=True)[:n_most]}

{'P01-1063': 1.329, 'P00-1079': 1.271, 'P00-1049': 1.331, 'P01-1062': 1.258, 'P04-1056': 1.19}


Let's pick researcher "y3" and give 5 most similar papers:

In [None]:
print(n_most_similar_papers_selected_researcher(5, 'y3'))

Comment: 
The most similar paper may not be a good one to recommend as it might involve the targeted researcher already.

In [63]:
# "corpus" is a set of all the words that ever appear in any papers
# corpus = set(itertools.chain.from_iterable([list(a['token']) for a in df[0:10]]))

In [189]:
# """structure of all_tokens (a list):
# all_tokens[0]
# ['AAA', 'BBB']
# """
# each_tokens = [list(one_paper_token['token'].astype('str')) for one_paper_token in all_df_original]

# """structure of all_docs (a list):
# all_docs[0]
# ['AAA' 'BBB']
# """
# all_docs = [' '.join(x) for x in each_tokens]

In [252]:
# """structore of all_tokens_matrix (a dataframe):
# AAA BBB CCC (column names)
# 0 1 0 (whether that word shows in paper #1)
# 1 1 0 (whether that word shows in paper #2)
# """
# vec = CountVectorizer()
# corpus_matrix = vec.fit_transform(all_docs)
# all_tokens_matrix = pd.DataFrame(corpus_matrix.toarray(), columns=vec.get_feature_names()).rename_axis('', axis='rows') # note this is a dataframe