In [30]:
from fuzzywuzzy import process
import pickle
import pandas as pd
import numpy as np
import json
import datetime
import random
import unidecode

In [31]:
def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards.

    SECURITY NOTE: ``pickle.load`` can execute arbitrary code contained in the
    file. Only point this at pickles produced by this project's own pipeline.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


data_original = pd.read_csv('./pickles/data_pd.csv') #Original Data
topics = _load_pickle('./pickles/mallet_learnt_topics.pickle') #Words for each topic
data_cleaned = _load_pickle('./pickles/mallet_pruned.pickle') #Feature vector for each PaperID
user_map = _load_pickle('./pickles/user_mappings.pickle') #Author Names -> PaperIDs
user_map_name = _load_pickle('./pickles/auth_paper_title.pickle') #Author Names -> Paper Titles
user_research = _load_pickle('./pickles/user_research_data.pickle') #Feature vector for each UserID
user_id, id_user = _load_pickle('./pickles/user_id_maps.pickle') #Name -> UserID, UserID -> Name
paper_auth_id = _load_pickle('./pickles/paper_auth_id.pickle') #PaperID -> AuthorID
centers, k_ass, k_ass_rev = _load_pickle('./pickles/kmeans_save.pickle') #KMeans centres
paper_auth_name = _load_pickle('./pickles/paper_auth_name.pickle') #PaperID -> Author Name

In [32]:
# Persist the paper titles (positional index == paper ID). The context manager
# guarantees the file is flushed and closed even if pickling fails part-way.
with open('./pickles/scholarly_id_paper_title.pickle', 'wb') as handle:
    pickle.dump(obj=data_original['title'].values, file=handle)

In [33]:
# Persist the UserID -> author name mapping. The context manager guarantees
# the file is flushed and closed even if pickling fails part-way.
with open('./pickles/scholarly_id_user_name.pickle', 'wb') as handle:
    pickle.dump(obj=id_user, file=handle)

In [34]:
# Preview the first rows of the raw paper table read from data_pd.csv.
data_original.head()

Unnamed: 0.1,Unnamed: 0,title,summary,date,authors,tags,id
0,0,Minimax deviation strategies for machine learn...,The article is devoted to the problem of small...,1500196508,"['Michail Schlesinger', 'Evgeniy Vodolazskiy']",['cs.LG'],1707.04849v1
1,1,mlbench: How Good Are Machine Learning Clouds ...,We conduct an empirical study of machine learn...,1501365558,"['Hantian Zhang', 'Luyuan Zeng', 'Wentao Wu', ...","['cs.DC', 'cs.LG', 'stat.ML']",1707.09562v2
2,2,Introduction to Machine Learning: Class Notes ...,Introduction to Machine learning covering Stat...,1240486857,['Amnon Shashua'],['cs.LG'],0904.3664v1
3,3,AutoCompete: A Framework for Machine Learning ...,"In this paper, we propose AutoCompete, a highl...",1436368059,"['Abhishek Thakur', 'Artus Krohn-Grimberghe']","['stat.ML', 'cs.LG']",1507.02188v1
4,4,Joint Training of Deep Boltzmann Machines,We introduce a new method for training deep Bo...,1355277567,"['Ian Goodfellow', 'Aaron Courville', 'Yoshua ...","['stat.ML', 'cs.LG']",1212.2686v1


In [35]:
def transform_name_id(name):
    """Build an author identifier of the form ``<Surname>_<FirstInitial>``.

    The name is transliterated to ASCII with ``unidecode``; the surname is the
    last whitespace-separated token (hyphens become underscores) and the
    initial is the first character of the name,
    e.g. ``'Otte Heinävaara' -> 'Heinavaara_O'``.

    Parameters
    ----------
    name : str
        Full author name. Surrounding whitespace is ignored.

    Returns
    -------
    str
        ``surname + '_' + first_initial``.

    Raises
    ------
    ValueError
        If ``name`` is empty or contains only whitespace.
    """
    # Strip after transliteration: without this, a trailing space yields an
    # empty surname and a leading space yields ' ' as the initial.
    name = unidecode.unidecode(name).strip()
    if not name:
        raise ValueError("cannot build an author ID from an empty name")
    # split() treats any run of whitespace (tabs, doubled spaces) as one separator.
    surname = name.split()[-1].replace('-', '_')
    first_name = name[0]
    return surname + '_' + first_name

In [37]:
# Sanity check: the accented 'ä' is transliterated and the ID is '<surname>_<first initial>'.
transform_name_id('Otte Heinävaara')

'Heinavaara_O'

In [41]:
def _as_object_column(values):
    """Pack ``values`` into an ``(n, 1)`` object array, one Python object per row.

    ``np.array(list_of_lists)`` is unsafe for per-row lists of varying length:
    NumPy >= 1.24 raises ``ValueError`` for ragged input, and if every list
    happens to have the same length NumPy builds a 2-D array instead of one
    list per row. Filling a pre-allocated object array avoids both problems.
    """
    column = np.empty((len(values), 1), dtype=object)
    for position, value in enumerate(values):
        column[position, 0] = value
    return column


# --- Author rows -----------------------------------------------------------
user_id_names = list(user_id.keys())
user_id_type = ['author'] * len(user_id_names)
user_id_ids = [user_id[name] for name in user_id_names]
user_id_detail_id = [user_map[name] for name in user_id_names]         # paper IDs per author
user_id_detail_name = [user_map_name[name] for name in user_id_names]  # paper titles per author
user_arxiv_id = [transform_name_id(name) for name in user_id_names]

# --- Paper rows ------------------------------------------------------------
paper_titles = data_original['title'].values
paper_arxiv_id = data_original['id'].values
paper_type = ['paper'] * len(paper_titles)
paper_id = list(range(len(paper_titles)))  # paper ID == row position in data_original
paper_id_detail_id = paper_auth_id         # author IDs per paper
paper_id_detail_name = paper_auth_name     # author names per paper
paper_date = data_original['date'].values

# --- Combined (n, 1) columns: all author rows first, then all paper rows ----
col_names = np.array(list(user_id_names) + list(paper_titles)).reshape(-1, 1)
col_ids = np.array(user_id_ids + paper_id).reshape(-1, 1)
col_type = np.array(user_id_type + paper_type).reshape(-1, 1)
# The detail columns hold one list per row, so they must be object columns.
col_detail_id = _as_object_column(user_id_detail_id + paper_id_detail_id)
col_detail_name = _as_object_column(user_id_detail_name + paper_id_detail_name)
# Authors have no date; -1 is the placeholder used for them.
col_date = np.array([-1] * len(user_id_names) + list(paper_date)).reshape(-1, 1)
col_arxiv = np.array(list(user_arxiv_id) + list(paper_arxiv_id)).reshape(-1, 1)

# Display name / title -> row position in the combined table.
search_key = dict(zip(list(user_id_names) + list(paper_titles), range(len(col_names))))

In [42]:
# search_matrix = (number of author rows, (n, 7) object table).
# Table columns: 0 name/title, 1 type, 2 id, 3 detail ids, 4 detail names, 5 date, 6 arXiv id.
# The (n, 1) columns are joined along axis 1, which always yields a 2-D (n, 7)
# table. The previous stack + axis-less squeeze collapsed the table to 1-D
# when it contained exactly one row, which breaks `matrix[:, col]` indexing.
search_matrix = (
    len(user_id_names),
    np.concatenate(
        (col_names, col_type, col_ids, col_detail_id, col_detail_name, col_date, col_arxiv),
        axis=1,
    ),
)

In [43]:
# Persist the (author count, search table) tuple. The context manager
# guarantees the file is flushed and closed even if pickling fails part-way.
with open('./pickles/search_array.pickle', 'wb') as handle:
    pickle.dump(obj=search_matrix, file=handle)

In [44]:
import numpy as np
import random
import datetime

_TERM_MIN_LENGTH = 3          # shortest free-text term that triggers matching
_SURNAME_FLAG = "?surname="   # "?surname=<text>;" keeps only hits with a matching author surname
_PAPER_FLAG = "?paper"        # show papers only
_AUTHOR_FLAG = "?author"      # show authors only
_MAX_DESC_LENGTH = 70         # paper descriptions longer than this get shortened
_DESC_CUT_SEARCH_START = 40   # shorten at the first ', ' found at/after this offset
_MONTHS = ("Unknown", "January", "February", "March", "April", "May", "June",
           "July", "August", "September", "October", "November", "December")


def _parse_query(term):
    """Split a raw query into ``(term, surname, show_author, show_paper, filter_surname)``."""
    term = term.strip()
    show_author = True
    show_paper = True
    surname = ""
    filter_surname = False
    if _PAPER_FLAG in term:
        show_author = False
        term = term.replace(_PAPER_FLAG, '').strip()
    flag_start = term.find(_SURNAME_FLAG)
    if flag_start != -1:
        # The filter only counts when a ';' terminates it *after* the flag.
        flag_end = term.find(';', flag_start)
        if flag_end != -1:
            surname = term[flag_start + len(_SURNAME_FLAG):flag_end].strip().lower()
            term = term.replace(term[flag_start:flag_end + 1], "").strip()
            filter_surname = True
    if _AUTHOR_FLAG in term:
        show_paper = False
        term = term.replace(_AUTHOR_FLAG, '').strip()
    return term, surname, show_author, show_paper, filter_surname


def _last_token_lower(text):
    """Return the lower-cased last whitespace-separated token of ``text`` ('' if blank)."""
    tokens = text.split()
    return tokens[-1].lower() if tokens else ""


def _any_surname_matches(author_names, surname):
    """Return True if ``surname`` occurs in the last token of any name in ``author_names``."""
    return any(surname in _last_token_lower(author) for author in author_names)


def _author_entry(row):
    """Format one author row as a ``{"title", "description", "url"}`` result entry."""
    paper_titles = row[4]
    desc = str(len(row[3])) + " paper(s)"
    if len(paper_titles) > 0:  # random.choice raises IndexError on an empty sequence
        sample = random.choice(paper_titles).replace("\n", "").replace("  ", " ").strip()
        desc += " including: '" + sample + "'"
    return {"title": row[0].title(), "description": desc, "url": "/author/" + str(row[2])}


def _paper_entry(row):
    """Format one paper row as a ``{"title", "description", "url"}`` result entry."""
    # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp().
    date = datetime.datetime.fromtimestamp(row[5], tz=datetime.timezone.utc)
    desc = '(' + _MONTHS[date.month] + ' ' + str(date.year) + ') ' + ', '.join(row[4])
    if len(desc) > _MAX_DESC_LENGTH:
        cut = desc.find(', ', _DESC_CUT_SEARCH_START)
        # find() returns -1 when there is no later separator; slicing with -1
        # would drop the last character and wrongly claim "and others".
        if cut != -1:
            desc = desc[:cut] + " and others"
    title = row[0].replace("\n", "").replace("  ", " ")
    return {"title": title, "description": desc, "url": "/paper/" + str(row[2])}


def autocomplete_search(term, matrix, search_indx, limit = 100):
    """Search the combined author/paper table and build an autocomplete payload.

    Query syntax (flags may be combined with free text):
      * ``?paper``          - return papers only
      * ``?author``         - return authors only
      * ``?surname=<s>;``   - keep authors whose last name token contains ``<s>``
                              and papers with at least one such author. With no
                              other text, every paper by a matching author is
                              returned (unless ``?author`` was given).
    Matching is a case-insensitive substring test on column ``search_indx``
    after newlines are removed and doubled spaces are collapsed.

    Parameters
    ----------
    term : str
        Raw user query.
    matrix : numpy.ndarray
        2-D object table with columns: 0 name/title, 1 type ('author' or
        'paper'), 2 id, 3 detail ids, 4 detail names (paper titles for authors,
        author names for papers), 5 unix timestamp, 6 arXiv id.
    search_indx : int
        Column to match ``term`` against.
    limit : int, optional
        Maximum number of hits (authors + papers) to collect.

    Returns
    -------
    dict or list
        ``{}`` if the term is shorter than 3 characters and no surname filter
        is active; ``[]`` if nothing matched; otherwise
        ``{"results": {"category1": {...Authors...}, "category2": {...Papers...}}}``
        with papers sorted newest first. The two empty shapes are kept as they
        were so existing callers are unaffected.
    """
    term, surname, show_author, show_paper, filter_surname = _parse_query(term)
    if len(term) < _TERM_MIN_LENGTH and not filter_surname:
        return {}

    term_lower = term.lower()
    term_usable = len(term) >= _TERM_MIN_LENGTH
    surname_only = filter_surname and len(term) == 0

    authors = []
    papers = []
    for row_idx in range(matrix.shape[0]):
        # Stop once `limit` hits are collected (previously one extra hit slipped through).
        if len(authors) + len(papers) >= limit:
            break
        val = matrix[row_idx, search_indx].replace("\n", "").replace("  ", " ")
        kind = matrix[row_idx, 1]
        if term_usable and (term in val or term_lower in val.lower()):
            if kind == 'author' and show_author:
                keep, target = (not filter_surname or surname in _last_token_lower(val)), authors
            elif kind == 'paper' and show_paper:
                keep, target = (not filter_surname or _any_surname_matches(matrix[row_idx, 4], surname)), papers
            else:
                continue
        elif kind == 'paper' and show_paper and surname_only:
            # Surname-only query: list the papers written by matching authors.
            keep, target = _any_surname_matches(matrix[row_idx, 4], surname), papers
        else:
            continue
        if keep:
            # Copy the row so that storing the cleaned text never touches `matrix`.
            row = matrix[row_idx].copy()
            row[search_indx] = val.strip()
            target.append(row)

    if not authors and not papers:
        return []

    results = {"results":{"category1":{"name":"Authors", "results":[]}, "category2":{"name": "Papers", "results":[]}}}
    results['results']['category1']['results'].extend(_author_entry(row) for row in authors)
    # Newest papers first (stable sort keeps table order for equal dates).
    papers.sort(key=lambda row: row[5], reverse=True)
    results['results']['category2']['results'].extend(_paper_entry(row) for row in papers)
    return results

In [45]:
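# Scratch note (not executable): three copies of one paper title. The first
# contains a doubled space before "Analysis"; the other two use single spaces.
# Ensemble of Generative and Discriminative Techniques for Sentiment  Analysis of Movie Reviews
# Ensemble of Generative and Discriminative Techniques for Sentiment Analysis of Movie Reviews
# Ensemble of Generative and Discriminative Techniques for Sentiment Analysis of Movie Reviews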

SyntaxError: invalid syntax (<ipython-input-45-f43e13c6848f>, line 1)

In [46]:
# Smoke test: match a full paper title against column 0 of the search table
# (search_matrix[1] is the table; search_matrix[0] is the author-row count), limit 40.
autocomplete_search('Ensemble of Generative and Discriminative Techniques for Sentiment Analysis of Movie Reviews', search_matrix[1], 0, 40)

{'results': {'category1': {'name': 'Authors', 'results': []},
  'category2': {'name': 'Papers',
   'results': [{'description': '(December 2014) Grégoire Mesnil, Tomas Mikolov and others',
     'title': 'Ensemble of Generative and Discriminative Techniques for Sentiment Analysis of Movie Reviews',
     'url': '/paper/23709'}]}}}