In [7]:
import pickle
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
# import these modules
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
 
ps = PorterStemmer()

In [3]:
class Paper:
    def __init__(self, url, title, author, published, refs):
        self.url = url
        self.title = title
        self.author = author
        self.published = published
        self.refs = refs
    
with open('/home/admin/PSP/backend/pdfs/papers_stucture_recommender.pickle', 'rb') as handle:
    papers = pickle.load(handle)[:1000]

In [4]:
class PaperWithStems(Paper):
    def __init__(self, p):
        super().__init__(p.url, p.title, p.author, p.published, p.refs)
        self.stems = set(ps.stem(w) for w in p.title.split())
        self.refs = [
            set(ps.stem(w) for w in ref.split())
            for ref in p.refs
        ]

In [8]:
papers_with_stems = [PaperWithStems(p) for p in tqdm(papers)]


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

In [9]:
# def similar(stems, refs):
#     for ref in refs:
#         if len(stems & ref) / len(stems) > 0.98:
#             return True
#     return False

def similar(stems, refs):
    for ref in refs:
        if len(stems & ref) == len(stems):
            return True
    return False



In [17]:
import numpy as np
from scipy.sparse.linalg import svds
from annoy import AnnoyIndex


def build_similarity_matrix(papers):
    """Создает матрицу похожести статей."""
    n = len(papers)
    matrix = np.zeros((n, n), dtype=np.float32)
    
    for i in tqdm(range(n)):
        for j in range(n):
            if i != j:
                if similar(papers[i].stems, papers[j].refs):
                    matrix[i, j] = 1
    
    return matrix

def filter_empty_rows_and_cols(matrix, papers):
    """Удаляет пустые строки и столбцы из матрицы."""
    non_zero_rows = np.any(matrix > 0, axis=1)
    non_zero_cols = np.any(matrix > 0, axis=0)
    filtered_matrix = matrix[np.ix_(non_zero_rows, non_zero_cols)]
    filtered_papers = [p for i, p in enumerate(papers) if non_zero_rows[i]]
    paperid2filtered_paperid = {
        paperid: filtered_papers.index(paper) 
        for paperid, paper in enumerate(papers)
        if paper in filtered_papers}

    return filtered_matrix, filtered_papers, paperid2filtered_paperid

def build_annoy_index(matrix, num_trees=10):
    """Строит Annoy-индекс на основе SVD-разложения матрицы."""
    u, s, vt = svds(matrix, k=min(30, min(matrix.shape)-1))  # SVD
    embeddings = u @ np.diag(s)  # Проекция на уменьшенное пространство
    
    dim = embeddings.shape[1]
    annoy_index = AnnoyIndex(dim, metric='euclidean')
    
    for i, vec in enumerate(embeddings):
        annoy_index.add_item(i, vec)
    
    annoy_index.build(num_trees)
    return annoy_index

def get_similar_articles(annoy_index, article_index, top_n=5):
    """Получает индексы похожих статей из Annoy-индекса."""
    return annoy_index.get_nns_by_item(article_index, 1 + top_n)[1:]



In [None]:
# Пример использования
similarity_matrix = build_similarity_matrix(papers_with_stems)
similarity_matrix.sum().sum()


In [21]:
filtered_matrix, filtered_articles, paperid2filtered_paperid = filter_empty_rows_and_cols(similarity_matrix, papers_with_stems)
filtered_paper2idpaperid = {
    filtered_paperid: paperid for
    paperid, filtered_paperid in paperid2filtered_paperid.items()
}


In [23]:
annoy_index = build_annoy_index(filtered_matrix)

In [67]:
annoy_index.save('papers.ann')

True

In [26]:


# Получаем похожие статьи для первой статьи в отфильтрованном списке
paperid = 101
filtered_paperid = paperid2filtered_paperid[paperid]

print(filtered_articles[filtered_paperid].title)
similar_indices = get_similar_articles(annoy_index, filtered_paperid)
print('='*90)
print("Похожие статьи:", "\n".join([filtered_articles[i].title for i in similar_indices]))

Graph Factorization Machines for Cross-Domain Recommendation
Похожие статьи: The topological face of recommendation: models and application to bias detection
Large Language Models as Recommender Systems: A Study of Popularity Bias
Cross-domain recommendation via user interest alignment
Improving Rating and Relevance with Point-of-Interest Recommender System
Distilling Structured Knowledge into Embeddings for Explainable and Accurate Recommendation


In [27]:
print("Похожие статьи:", "\n".join([papers_with_stems[filtered_paper2idpaperid[i]].title for i in similar_indices]))

Похожие статьи: The topological face of recommendation: models and application to bias detection
Large Language Models as Recommender Systems: A Study of Popularity Bias
Cross-domain recommendation via user interest alignment
Improving Rating and Relevance with Point-of-Interest Recommender System
Distilling Structured Knowledge into Embeddings for Explainable and Accurate Recommendation


In [31]:
with open('/home/admin/PSP/backend/pdfs/filtered_paper2idpaperid.pickle', 'wb') as handle:
    pickle.dump(filtered_paper2idpaperid, handle)

In [36]:
papers_with_stems[0].title

'Overhead-free User-side Recommender Systems'

In [None]:
self.url = url
        self.title = title
        self.author = author
        self.published = published

In [42]:
import urllib.parse

def generate_citation_link(title: str, authors: list):
    formatted_authors = authors
    citation = f"{formatted_authors}. {title}."
    
    return f"https://example.com/citation?{urllib.parse.urlencode({'cite': citation})}"

In [41]:
generate_citation_link(papers_with_stems[0].title, papers_with_stems[0].author)

'https://example.com/citation?cite=%5Barxiv.Result.Author%28%27Ryoma+Sato%27%29%5D.+Overhead-free+User-side+Recommender+Systems.'

In [48]:
papers_with_stems[0].published.year

2024

In [55]:
f"{str(papers_with_stems[1].author[0])} et al. {papers_with_stems[1].published.year}. {papers_with_stems[0].title}"

'Kexin Yin et al. 2022. Overhead-free User-side Recommender Systems'

In [83]:
papers = [
    {
        'author': str(paper.author[0]),
        'year': paper.published.year,
        'title': paper.title,
        'stems': paper.stems
    }
    for paper in papers_with_stems
]

len(papers)
# with open('/home/admin/PSP/backend/pdfs/papers.pickle', 'wb') as handle:
#     pickle.dump(papers, handle)

In [86]:
papers[0]

{'author': 'Ryoma Sato',
 'year': 2024,
 'title': 'Overhead-free User-side Recommender Systems',
 'stems': {'overhead-fre', 'recommend', 'system', 'user-sid'}}

In [99]:
class PapersHandler:

    def __init__(self):

        with open('/home/admin/PSP/backend/pdfs/filtered_paper2idpaperid.pickle', 'rb') as handle:
            self.filtered_paper2idpaperid = pickle.load(handle)
        with open('/home/admin/PSP/backend/pdfs/paperid2filtered_paperid.pickle', 'rb') as handle:
            self.paperid2filtered_paperid = pickle.load(handle)
        with open('/home/admin/PSP/backend/pdfs/papers.pickle', 'rb') as handle:
            self.papers = pickle.load(handle)

        f = 30
        self.annoy_index = AnnoyIndex(f, 'euclidean')
        self.annoy_index.load('/home/admin/PSP/backend/pdfs/papers.ann') # super fast, will just mmap the file
    
    def get_similar_articles(self, paper_id, top_n=5):

        filtered_paperid = self.paperid2filtered_paperid[paper_id]
        filtered_paperids = self.annoy_index.get_nns_by_item(filtered_paperid, 1 + top_n)[1:]
        paperids = [
            self.filtered_paper2idpaperid[filtered_paperid] 
            for filtered_paperid in filtered_paperids]
        
        return [
            {
                'id': paperid,
                'title': self.papers[paperid]['title'],
                'link': self.generate_citation_link(self.papers[paperid])
            }
            for paperid in paperids
        ]
        
    def search(self, query: str):
        query_stems = set(ps.stem(w) for w in query.split())
        results = []
        for paperid, filtered_paperid in paperid2filtered_paperid.items():
            paper = self.papers[paperid]
            if len(paper['stems'] & query_stems)> 0:
                results.append({
                    'id': paperid,
                    'title': paper['title']
                })

        return results

    def generate_pdf_url(self, id):
        return f'http://localhost:5000/article/{id}'

    def generate_citation_link(self, paper):
        return f"{paper['author']} et al. {paper['year']}. {paper['title']}"
        



In [101]:
ph = PapersHandler()
ph.search('fair')

[{'id': 19, 'title': 'A Survey on the Fairness of Recommender Systems'},
 {'id': 23,
  'title': 'Multi-stakeholder Recommendation and its Connection to Multi-sided Fairness'},
 {'id': 108,
  'title': 'The Connection Between Popularity Bias, Calibration, and Fairness in Recommendation'},
 {'id': 121,
  'title': 'Towards Communication Efficient and Fair Federated Personalized Sequential Recommendation'},
 {'id': 148,
  'title': 'Personalized Counterfactual Fairness in Recommendation'},
 {'id': 157, 'title': 'Fairness of Exposure in Dynamic Recommendation'},
 {'id': 179, 'title': 'Towards Fair Conversational Recommender Systems'},
 {'id': 194, 'title': 'Towards Long-term Fairness in Recommendation'},
 {'id': 214,
  'title': 'Transparency, Privacy, and Fairness in Recommender Systems'},
 {'id': 259,
  'title': 'The Impact of Popularity Bias on Fairness and Calibration in Recommendation'},
 {'id': 262,
  'title': 'A General Framework for Fairness in Multistakeholder Recommendations'},
 {'id

In [81]:
# with open('/home/admin/PSP/backend/pdfs/ph.pickle', 'wb') as handle:
#     pickle.dump(ph, handle)