In [None]:
# import necessary libraries
import numpy as np
import pandas as pd
import textdistance as td
import random

In [None]:
# read each CSV dataset into its own dataframe (same order as the names on the left)
auth, conf, papers, persons = map(
    pd.read_csv,
    (
        './datasets/authorship.csv',
        './datasets/conferences.csv',
        './datasets/papers.csv',
        './datasets/persons.csv',
    ),
)

In [None]:
# some changes to the dataframes

# normalise person names to lower case
persons['name'] = persons['name'].str.lower()

# drop every column whose label starts with "Unnamed"
# (presumably index columns left over from the CSV export - verify against the data)
cols = papers.columns
cols = [c for c in cols if c.startswith("Unnamed")]
papers = papers.drop(cols, axis=1)

# keep only persons that have a name; .copy() makes the result an independent
# frame, so assigning new columns to it later does not raise SettingWithCopyWarning
persons = persons[persons['name'].notna()].copy()


# the dataset doesn't have data about whether the user has published papers in a particular journal or not,
# so we add random 1s and 0s to the dataset, with a non-uniform probability

def oneorzero(z, margin):
    """Return 1 when ``z`` is greater than or equal to ``margin``, otherwise 0."""
    return 1 if z >= margin else 0

# draw one uniform sample in [0, 1] per conference row; samples >= 0.15 become 1
has_published = []
for _ in range(len(conf)):
    has_published.append(oneorzero(random.uniform(0, 1), 0.15))
conf['has_published'] = has_published

In [None]:
# filter based on impact factor

# -1 marks a bound that was not given (or was not a non-negative whole number)
lower_if, upper_if = -1, -1

print("Enter range of impact factor")

# str.isdecimal() only accepts digit strings that int() can parse; isnumeric()
# also accepts characters such as '²' or '½', on which int() raises ValueError
lower = input("Lower bound: ")
lower_if = int(lower) if lower.isdecimal() else -1

upper = input("Upper bound: ")
upper_if = int(upper) if upper.isdecimal() else -1

# start from every conference and narrow the selection once per supplied bound,
# so that both bounds are applied together when both are given
filter1 = conf
if lower_if != -1:
    filter1 = filter1[filter1['impact'] >= lower_if]
if upper_if != -1:
    filter1 = filter1[filter1['impact'] <= upper_if]

In [None]:
# filter based on conference name

# conf_id stays -1 unless exactly one conference has the entered acronym
conf_id = -1

conference_name = input("Enter conference name: ")
if conference_name:
    # acronyms are compared against the upper-cased input
    filter2 = conf[conf['acronym'] == conference_name.upper()]
    if filter2.shape[0] == 1:
        (conf_id,) = filter2['id'].tolist()

In [None]:
# filter based on author's name
auth_ids = []

author_name = input("Enter author name: ")

if author_name != "":
    # persons['name'] holds lower-cased names, so the query is lower-cased as well;
    # otherwise capital letters in the input would lower the similarity score
    author_query = author_name.lower()
    simfact = [td.sorensen.normalized_similarity(author_query, pp) for pp in persons['name'].tolist()]
    persons['similarity_factor'] = simfact
    # keep every person whose name is at least 50% similar to the query
    auth_ids = persons[persons['similarity_factor'] >= 0.5]['id'].tolist()
else:
    # no name entered: do not restrict by author
    auth_ids = persons['id'].tolist()

In [None]:
# filter based on title of the paper

title = input("Enter title of the paper(or parts of it): ")

if title == "":
    # nothing entered: every paper stays a candidate
    modpapers = papers
else:
    # score each paper title against the entered text and keep scores >= 0.3
    simfact = [
        td.sorensen.normalized_similarity(title, paper_title)
        for paper_title in papers['title'].tolist()
    ]
    papers['similarity_factor'] = simfact
    modpapers = papers[papers['similarity_factor'] >= 0.3]

In [None]:
# filter based on paper published year

# -1 marks a bound that was not given (or was not a non-negative whole number)
lower_yr, upper_yr = -1, -1
print("Enter the range (in years) of the paper published: ")

# str.isdecimal() only accepts digit strings that int() can parse; isnumeric()
# also accepts characters such as '²' or '½', on which int() raises ValueError
yrlower = input("Lower bound: ")
lower_yr = int(yrlower) if yrlower.isdecimal() else -1

yrupper = input("Upper bound: ")
upper_yr = int(yrupper) if yrupper.isdecimal() else -1

# narrow the candidate papers once per supplied bound (both bounds are inclusive)
if lower_yr != -1:
    modpapers = modpapers[modpapers['year'] >= lower_yr]
if upper_yr != -1:
    modpapers = modpapers[modpapers['year'] <= upper_yr]

In [None]:
# filter on pages

# -1 marks a bound that was not given (or was not a non-negative whole number)
lbp, ubp = -1, -1

print("Enter the range of pages in the paper: ")

# str.isdecimal() only accepts digit strings that int() can parse; isnumeric()
# also accepts characters such as '²' or '½', on which int() raises ValueError
lb = input("Lower bound: ")
lbp = int(lb) if lb.isdecimal() else -1

ub = input("Upper bound: ")
ubp = int(ub) if ub.isdecimal() else -1

# papers with num_pages == 0 are always kept when a page filter is applied
# (presumably 0 means the page count is unknown - verify against the dataset)
zero_pages = modpapers['num_pages'] == 0
tempdf = modpapers[zero_pages]

if lbp != -1 or ubp != -1:
    # exclude the zero-page rows from the range selection: they would otherwise
    # satisfy e.g. `num_pages <= ubp` and then be appended a second time below
    in_range = ~zero_pages
    if lbp != -1:
        in_range = in_range & (modpapers['num_pages'] >= lbp)
    if ubp != -1:
        in_range = in_range & (modpapers['num_pages'] <= ubp)
    modpapers = pd.concat([modpapers[in_range], tempdf])

In [None]:
# final modifications to the data

def finalfilter(conf_id, auth_ids, modpapers):
    """Restrict the candidate papers by conference and by author.

    conf_id   -- conference id to keep, or -1 to skip the conference filter
    auth_ids  -- person ids whose papers should be kept
    modpapers -- dataframe of candidate papers

    Returns a tuple ``(paperids, finaldf)``: the paper ids that the module-level
    ``auth`` dataframe links to ``auth_ids``, and the rows of ``modpapers`` whose
    ``id`` is among them.
    """
    if conf_id != -1:
        modpapers = modpapers[modpapers['conference_id'] == conf_id]

    written_by_selected = auth['person_id'].isin(auth_ids)
    paperids = auth[written_by_selected]['paper_id'].tolist()

    finaldf = modpapers[modpapers['id'].isin(paperids)]
    return paperids, finaldf

In [None]:
# get recommendations for the user, from the journals he/she has published in

def recommend_filter(finaldf):
    """Return the rows of ``finaldf`` whose conference is flagged ``has_published == 1``.

    The flags are read from the module-level ``conf`` dataframe.
    """
    published_mask = conf['has_published'] == 1
    conferences = conf[published_mask]['id'].tolist()
    return finaldf[finaldf['conference_id'].isin(conferences)]

In [None]:
# combine the conference, author and paper filters
pid, finaldf = finalfilter(conf_id, auth_ids, modpapers)

# apply the impact-factor selection (filter1), which finalfilter does not look at;
# only done when a bound was entered, so papers are not dropped needlessly
if lower_if != -1 or upper_if != -1:
    finaldf = finaldf[finaldf['conference_id'].isin(filter1['id'])]

recommdf = recommend_filter(finaldf)

In [None]:
# prints articles which satisfy the given criteria

if len(finaldf) == 0:
    # print, not printf: printf is not defined in Python and raised NameError here
    print("No articles are found for the criteria you have searched for")

else:
    print("The following " + str(len(finaldf)) + " entries are found relevant: ")
    # number the entries from 0 in the order they appear in finaldf
    for count, (index, row) in enumerate(finaldf.iterrows()):
        print(str(count) + " :::: " + row['title'] + " :::: published in " + str(row['year']))

In [None]:
# recommends articles which satisfy the criteria and come from those journals where the user has published previously

if len(recommdf) == 0:
    # print, not printf: printf is not defined in Python and raised NameError here
    print("You have not published in any of the conferences, so you don't have any recommendations")

else:
    print("These " + str(len(recommdf)) + " articles are recommended for you, since you have published in these journals: ")
    # number the entries from 0 in the order they appear in recommdf
    for count, (index, row) in enumerate(recommdf.iterrows()):
        print(str(count) + " :::: " + row['title'] + " :::: published in " + str(row['year']))

In [None]:
# print the filters used to generate the output

print("\033[1m The following rules were used to get the results:\n \033[0m")


print("\033[1m Impact factor: \033[0m")
# -1 marks a bound that was not supplied; "not given" needs BOTH bounds to be -1
if lower_if == -1 and upper_if == -1:
    print("Measures were not given - not considered for filtering")
elif lower_if == -1:
    print("Upper bound given: " + str(upper_if) + ". Entries lesser than the given value are considered")
elif upper_if == -1:
    print("Lower bound given: " + str(lower_if) + ". Entries greater than the given value are considered")
else:
    print("Both bounds given. Values range from " + str(lower_if) + " to " + str(upper_if))
print()
    
    
print("\033[1m Year of publishing: \033[0m")
# -1 marks a bound that was not supplied; "not given" needs BOTH bounds to be -1
if lower_yr == -1 and upper_yr == -1:
    print("Measures were not given - not considered for filtering")
elif lower_yr == -1:
    print("Upper bound given: " + str(upper_yr) + ". Papers published before the given year are considered")
elif upper_yr == -1:
    print("Lower bound given: " + str(lower_yr) + ". Papers published after the given year are considered")
else:
    print("Both bounds given. Papers published between the years " + str(lower_yr) + " to " + str(upper_yr) + " are considered")
print()


print("\033[1m Number of pages in the article: \033[0m")
# -1 marks a bound that was not supplied; "not given" needs BOTH bounds to be -1
if lbp == -1 and ubp == -1:
    print("Measures were not given - not considered for filtering")
elif lbp == -1:
    print("Upper bound given: " + str(ubp) + ". Papers having less than given number of pages are considered")
elif ubp == -1:  # test the page upper bound here, not the year upper bound
    print("Lower bound given: " + str(lbp) + ". Papers having more than given number of pages are considered")
else:
    print("Both bounds given. Papers having pages in the range " + str(lbp) + " to " + str(ubp) + " are considered")
print()


print("\033[1m Title of the paper: \033[0m")
# report the title rule only when some text was entered
if title != "":
    print("Title entered - " + str(title))
    print("All papers having similarity factor >= 0.3 with the given title are considered")
else:
    print("Title not entered - not considered for filtering")
print()


print("\033[1m Conference name: \033[0m")
# conf_id keeps the value -1 when no conference was selected
if conf_id != -1:
    print("Conference name given - " + str(conference_name))
    print("Papers published in the given conference are considered")
else:
    print("Conference name not mentioned, or is invalid - not considered")
print()


print("\033[1m Author name: \033[0m")
# test the entered name itself: the previous check compared len(auth_ids), an int,
# with "" and was therefore never true
if author_name == "":
    print("Author name not mentioned, or is invalid, or does not match the entries - not considered")
else:
    print("Author name given - " + str(author_name))
    print("All papers published by authors having similarity factor >= 0.5 with the entered name are considered by IDs")