In [1]:
import os
import pandas as pd
import glob
from typing import Union, Type

In [2]:
class Paper():

    def __init__(self, 
                 paper_loc: str, 
                 datasave_loc: str, 
                 notes_loc: str) -> None:
        self.paper_loc = paper_loc
        self.datasave_loc = datasave_loc
        self.notes_loc = notes_loc
        self.name = None # paper title
        self.bibtex = None
        self.category = None
        self.update_notes()
        self.keywords = []
        self.active_attr_list = [] # without notes for simplicity since self.notes contains many notes

    def update_notes(self) -> None:
        if not os.path.exists(self.notes_loc):
            with open(self.notes_loc, 'w'):
                pass
        with open(self.notes_loc, 'r') as file:
            notes = file.read()
        notes = notes.split("THIS IS A SPLIT LINE\n")
        self.notes = [note for note in notes if note.strip() != '']

    def set_name(self, name: str) -> None:
        # paper title
        self.name = name
        if not 'name' in self.active_attr_list:
            self.active_attr_list += ['name']

    def set_category(self, cat: str) -> None:
        # paper category like survey
        safe_cat = ['survey']
        if cat in safe_cat:
            self.category = cat
            if not 'category' in self.active_attr_list:
                self.active_attr_list += ['category']
        else:
            raise ValueError(f'{cat} is not a valid cateogry type. Eg, use survey')

    def add_keyword(self, keyword: str) -> None:
        if not keyword in self.keywords:
            self.keywords.append(keyword)
        else:
            print(f'No Action: the keyword {keyword} has already been added.')
        if not 'keywords' in self.active_attr_list:
            self.active_attr_list += ['keywords']

    def set_bibtex(self, bibtex: str) -> None:
        self.bibtex = bibtex
        if not 'bibtex' in self.active_attr_list:
            self.active_attr_list += ['bibtex']
        # self._bibtex2attr()

    def _bibtex2attr(self):
        raise NotImplementedError
    
    def add_relation(self, another_paper: str, relation: str, note: str) -> None:
        with open(self.notes_loc, 'a') as file:
            file.write("THIS IS A SPLIT LINE\n")
            file.write(f'RELATION {relation} to {another_paper}: {note}\n')
            file.write("THIS IS A SPLIT LINE\n")
        self.update_notes()
    
    def data_save(self) -> None:
        # replace local csv data with class data, the opposite of data_update
        df_data = []
        for attr in self.active_attr_list:
            if attr == 'keywords':
                df_data.append(['keywords', ','.join(self.keywords)])
            else:
                df_data.append([attr, getattr(self, attr)])
        df = pd.DataFrame(df_data, columns=['attribute name', 'attribute data'])
        df.to_csv(self.datasave_loc, index=False)
    
    def data_update(self) -> None:
        # replace class data with local csv data, the opposite of data_update
        if not os.path.exists(self.datasave_loc):
            df = pd.DataFrame([], columns=['attribute name', 'attribute data'])
            df.to_csv(self.datasave_loc, index=False)
        else:
            safe_attr_list = ['name', 'bibtex', 'keywords', 'category']
            df = pd.read_csv(self.datasave_loc)
            for _, row in df.iterrows():
                attr = row['attribute name']
                info = row['attribute data']
                assert attr in safe_attr_list, f'{attr} is not a legal property for a paper'
                if attr == 'keywords':
                    self.keywords = info.split(',')
                else:
                    setattr(self, attr, info)
                self.active_attr_list.append(attr)

    def show_notes(self) -> None:
        self.update_notes()
        if self.notes == []:
            print('No notes for this paper')
        for i, note in enumerate(self.notes):
            print(f'{i+1}: {note}')

    def show_keywords(self) -> None:
        print('Keywords: ' + ','.join(self.keywords))
    
    def __str__(self) -> str:
        info = f"""
                Paper Title: {'[' + self.category + ']' if self.category else ''}{self.name}\n\
                Paper Location: {self.paper_loc}\n
                """
        self.update_notes()
        if self.notes != []:
            info += "Notes: " + "\n".join(self.notes)
        if self.keywords != []:
            info += "Keywords: " + ', '.join(self.keywords) + '\n'
        return info

In [3]:
class CollectionOfPapers():

    def __init__(self, collection_loc: str) -> None:
        self.paper_no = 0
        self.paper_dict = {}

        if os.path.exists(collection_loc):
            self.collection_loc = collection_loc
            self.papers_loc = collection_loc + '\\papers\\'
            self.notes_loc = collection_loc + '\\notes\\'
            self.papers_data_loc = collection_loc + '\\data\\'
        else:
            raise ValueError('This path does not exist for CollectionOfPapers to set up.')
        
        if not os.path.exists(self.papers_loc):
            os.makedirs(self.papers_loc)
        if not os.path.exists(self.notes_loc):
            os.makedirs(self.notes_loc)
        if not os.path.exists(self.papers_data_loc):
            os.makedirs(self.papers_data_loc)

    def add_paper(self, pdf_name: str) -> None:
        if not pdf_name.endswith('.pdf'):
            pdf_name += '.pdf'
        if os.path.exists(self.papers_loc + pdf_name):
            self.paper_no += 1
            os.rename(self.papers_loc + pdf_name, self.papers_loc + f'{self.paper_no}.pdf')
            self.paper_dict[self.paper_no] = Paper(self.papers_loc + f'{self.paper_no}.pdf', 
                                                   self.papers_data_loc + f'{self.paper_no}.csv',
                                                   self.notes_loc + f'{self.paper_no}.txt')
            print(f'Added pdf with name {pdf_name} into the CollectionOfPapers dataset. Paper ID {self.paper_no}')
        else:
            raise ValueError(f'This pdf does not exist under the path {self.papers_loc}.')
        
    def auto_add_papers(self, summary: bool = False) -> None:
        pdf_names = [os.path.basename(file_path) for file_path in glob.glob(os.path.join(self.papers_loc, '*'))]

        while (str(self.paper_no + 1) + '.pdf') in pdf_names:
            pdf_names.remove(str(self.paper_no + 1) + '.pdf')
            self.paper_no += 1
            if not f'{self.paper_no}' in self.paper_dict:
                self.paper_dict[self.paper_no] = Paper(self.papers_loc + f'{self.paper_no}.pdf', 
                                                    self.papers_data_loc + f'{self.paper_no}.csv',
                                                    self.notes_loc + f'{self.paper_no}.txt')
                if summary:
                    print(f'Added pdf with name {self.paper_no + 1}.pdf into the CollectionOfPapers dataset.')
            
        for pdf_name in pdf_names:
            if pdf_name.endswith('.pdf'):
                self.paper_no += 1
                os.rename(self.papers_loc + pdf_name, self.papers_loc + f'{self.paper_no}.pdf')
                self.paper_dict[self.paper_no] = Paper(self.papers_loc + f'{self.paper_no}.pdf', 
                                                        self.papers_data_loc + f'{self.paper_no}.csv',
                                                        self.notes_loc + f'{self.paper_no}.txt')
                if summary:
                    print(f'Added pdf with name {pdf_name} into the CollectionOfPapers dataset.')

    def add_category(self, indices: Union[list[int], int], cat: str):
        if isinstance(indices, int):
            paper = self.paper_dict[indices]
            paper.set_category(cat)
        else:
            for index in indices:
                paper = self.paper_dict[index]
                paper.set_category(cat)

    def add_relation(self, relations: list[list[int, int, bool, str, str]]):
        for paper1_id, paper2_id, mutual, relation_type, note in relations:
            paper1: Type[Paper] = self.paper_dict[paper1_id]
            paper2: Type[Paper] = self.paper_dict[paper2_id]
            paper1.add_relation(paper2_id, relation_type, note)
            if mutual:
                paper2.add_relation(paper1_id, relation_type, note)
            else:
                paper2.add_relation(paper1_id, 'BE ' + relation_type, note)

    def add_keyword(self, indices: Union[list[int], int], keyword: str):
        if isinstance(indices, int):
            paper = self.paper_dict[indices]
            paper.add_keyword(keyword)
        else:
            for index in indices:
                paper = self.paper_dict[index]
                paper.add_keyword(keyword)

    def data_save(self):
        for paper in self.paper_dict.values():
            paper.data_save()

    def data_update(self):
        for paper in self.paper_dict.values():
            paper.data_update()

    def __str__(self, full: bool = False) -> str:
        info = ''
        for i in range(1, self.paper_no + 1):
            if full:
                info += f'{i}: \n{self.paper_dict[i].__str__()}'
            else:
                info += f'{i}: Paper Title: {self.paper_dict[i].name}\n'
            if not 'bibtex' in self.paper_dict[i].active_attr_list:
                info += f'Warning: This paper does not have a bibtex.\n'
        return info

In [4]:
collection_loc = 'c:\\Users\\sunsh\\OneDrive\\桌面\\MSc_master_project\\paper_organizer_database'
paper_collection = CollectionOfPapers(collection_loc)
paper_collection.auto_add_papers(summary=True)
paper_collection.data_update()
print(paper_collection)

Added pdf with name 2.pdf into the CollectionOfPapers dataset.
Added pdf with name 3.pdf into the CollectionOfPapers dataset.
Added pdf with name 4.pdf into the CollectionOfPapers dataset.
Added pdf with name 5.pdf into the CollectionOfPapers dataset.
Added pdf with name 6.pdf into the CollectionOfPapers dataset.
Added pdf with name 7.pdf into the CollectionOfPapers dataset.
Added pdf with name 8.pdf into the CollectionOfPapers dataset.
Added pdf with name 9.pdf into the CollectionOfPapers dataset.
Added pdf with name 10.pdf into the CollectionOfPapers dataset.
Added pdf with name 11.pdf into the CollectionOfPapers dataset.
Added pdf with name 12.pdf into the CollectionOfPapers dataset.
Added pdf with name 13.pdf into the CollectionOfPapers dataset.
1: Paper Title: Deep reinforcement and transfer learning for abstractive text summarization: A review
2: Paper Title: Automatic text summarization: A comprehensive survey
3: Paper Title: Automatic summarization of scientific articles: A sur

User guide:

`collection.add_paper(pdf_name: str)`

`paper.set_name(name: str)`

`paper.set_bibtex(bibtex: str)`

`paper.data_save()`

Note text split by 'THIS IS A SPLIT LINE\n'

In [5]:
paper_collection.add_category([1,2,3,4,5,6,7,8,12], 'survey')
paper_collection.add_keyword([9,10,11], 'graph-based')
paper_collection.add_keyword([10,11], 'biomedical')
paper_collection.add_keyword(9, 'SOTA')

No Action: the keyword graph-based has already been added.
No Action: the keyword graph-based has already been added.
No Action: the keyword graph-based has already been added.
No Action: the keyword biomedical has already been added.
No Action: the keyword biomedical has already been added.
No Action: the keyword SOTA has already been added.


In [6]:
paper_collection.add_relation([[7, 9, False, 'cite', 'cite in section 6.3'],
                               [7, 10, False, 'cite', 'cite in section 6.3'],
                               [7, 11, False, 'cite', 'cite in section 6.3']])

In [10]:
print(paper_collection.__str__(full=True))

1: 

                Paper Title: [survey]Deep reinforcement and transfer learning for abstractive text summarization: A review
                Paper Location: c:\Users\sunsh\OneDrive\桌面\MSc_master_project\paper_organizer_database\papers\1.pdf

                2: 

                Paper Title: [survey]Automatic text summarization: A comprehensive survey
                Paper Location: c:\Users\sunsh\OneDrive\桌面\MSc_master_project\paper_organizer_database\papers\2.pdf

                3: 

                Paper Title: [survey]Automatic summarization of scientific articles: A survey
                Paper Location: c:\Users\sunsh\OneDrive\桌面\MSc_master_project\paper_organizer_database\papers\3.pdf

                4: 

                Paper Title: [survey]Review of automatic text summarization techniques & methods
                Paper Location: c:\Users\sunsh\OneDrive\桌面\MSc_master_project\paper_organizer_database\papers\4.pdf

                5: 

                Paper Title: [survey]Mu

In [9]:
paper_collection.data_update()