In [3]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import itertools
from abc import ABC, abstractmethod
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer 


In [4]:
class BaseRecommender(ABC):
    """Abstract interface shared by the recommenders defined in this notebook.

    Concrete subclasses must implement both ``fit`` and ``recommend``; because
    those methods are abstract, this class cannot be instantiated directly.
    """

    def __init__(self, name):
        # No training data is attached at construction time.
        self.train_df = None
        # Label stored on the instance (subclasses pass a fixed string).
        self.name = name

    @abstractmethod
    def fit(self, train_df):
        """Train the model on the training data."""

    @abstractmethod
    def recommend(self, author_id, top_n=10):
        """Return a list of recommended author_ids."""

In [5]:
class TopologyRecommender(BaseRecommender):
    """Recommend co-authors from the co-authorship graph (friends of friends).

    Candidates are authors exactly two hops away from the target author,
    ranked by how many of the target's current co-authors link to them. When
    that yields fewer than ``top_n`` authors, the list is padded with the
    authors that have the most distinct co-authors in the training graph.
    """

    def __init__(self):
        super().__init__("Topology (Graph Coauthor)")
        # author_id -> set of author_ids sharing at least one work with it.
        self.graph = defaultdict(set)
        # Author ids ordered by decreasing number of distinct co-authors.
        self.popular_authors = []

    def fit(self, train_df):
        """Build the co-authorship graph and the popularity ranking.

        Args:
            train_df: DataFrame read through its ``work_id`` and ``author_id``
                columns; rows sharing a ``work_id`` are treated as co-authors.
        """
        self.train_df = train_df
        print(f"[{self.name}] Construindo grafo...")

        # Start from an empty graph so that calling fit() a second time does
        # not keep edges learned from a previous training set.
        self.graph = defaultdict(set)

        for _, group in train_df.groupby('work_id'):
            # De-duplicate while keeping first-seen order: an author listed
            # twice on the same work would otherwise receive an edge to itself
            # (inflating its popularity and adding solo authors to the graph).
            authors = list(dict.fromkeys(group['author_id'].tolist()))
            if len(authors) > 1:
                for u, v in itertools.permutations(authors, 2):
                    self.graph[u].add(v)

        # Popularity (used for the fallback) = number of distinct co-authors.
        popularity_counter = Counter(
            {author: len(neighbors) for author, neighbors in self.graph.items()}
        )
        self.popular_authors = [auth for auth, _ in popularity_counter.most_common()]
        print(f"[{self.name}] Grafo construído com {len(self.graph)} autores.")

    def recommend(self, author_id, top_n=10):
        """Return up to ``top_n`` author ids the author has not worked with.

        Args:
            author_id: Author to recommend for; may be absent from the graph,
                in which case only the popularity fallback is used.
            top_n: Maximum number of recommendations; values <= 0 give ``[]``.

        Returns:
            List of author ids, two-hop candidates first (most shared
            co-authors first), then popular authors as padding. The author
            itself and its current co-authors are never included.
        """
        if top_n <= 0:
            return []

        current_coauthors = self.graph.get(author_id, set())

        # Two-hop candidates: each occurrence is one shared co-author.
        candidate_counts = Counter(
            candidate
            for neighbor in current_coauthors
            for candidate in self.graph.get(neighbor, ())
            if candidate != author_id and candidate not in current_coauthors
        )
        recommendations = [cand for cand, _ in candidate_counts.most_common(top_n)]

        # Fallback: pad with the most connected authors.
        if len(recommendations) < top_n:
            # Set mirror of the list so the membership test is O(1).
            already_chosen = set(recommendations)
            for pop in self.popular_authors:
                if pop == author_id or pop in already_chosen or pop in current_coauthors:
                    continue
                recommendations.append(pop)
                already_chosen.add(pop)
                if len(recommendations) >= top_n:
                    break

        return recommendations[:top_n]


In [6]:
class ContentSciBERTRecommender(BaseRecommender):
    """Recommend authors whose publications are textually similar.

    Each work is embedded from its title and abstract with a
    SentenceTransformer model; an author's profile is the mean of the
    embeddings of their works, and recommendations are the nearest author
    profiles under cosine distance.
    """

    def __init__(self, model_name='allenai/scibert_scivocab_uncased'):
        super().__init__("Content-Based (SciBERT)")
        self.model_name = model_name
        self.encoder = None
        # Set by fit(): 2-D array, row i is the profile of author_ids_index[i].
        self.author_embeddings = None
        self.knn_model = None
        # Maps a KNN row index -> author_id.
        self.author_ids_index = []
        # Reverse mapping author_id -> row index, for O(1) lookup in recommend().
        self._author_row = {}

    def fit(self, train_df):
        """Embed every work, average per author and index the profiles.

        Args:
            train_df: DataFrame read through its ``work_id``, ``author_id``,
                ``title`` and ``abstract`` columns.
        """
        # Keep a reference to the training data, as the sibling recommender does.
        self.train_df = train_df
        print(f"[{self.name}] Carregando SciBERT e gerando embeddings...")
        self.encoder = SentenceTransformer(self.model_name)

        unique_works = train_df[['work_id', 'title', 'abstract']].drop_duplicates()
        # Build the text as a separate Series instead of assigning a column on
        # a derived frame; missing titles/abstracts become empty strings so the
        # encoder always receives str inputs.
        work_texts = unique_works['title'].fillna('') + ". " + unique_works['abstract'].fillna('')

        work_embeddings = self.encoder.encode(work_texts.tolist(), show_progress_bar=True)

        work_id_to_emb = dict(zip(unique_works['work_id'], work_embeddings))

        # The author profile is the mean of the vectors of their works.
        print(f"[{self.name}] Criando perfis de autores...")

        author_groups = train_df.groupby('author_id')['work_id'].apply(list)

        author_vectors = []
        author_ids = []

        for author_id, work_ids in author_groups.items():
            vectors = [work_id_to_emb[wid] for wid in work_ids if wid in work_id_to_emb]

            if vectors:
                author_vectors.append(np.mean(vectors, axis=0))
                author_ids.append(author_id)

        self.author_ids_index = author_ids
        self._author_row = {aid: row for row, aid in enumerate(author_ids)}
        self.author_embeddings = np.array(author_vectors)
        self.knn_model = NearestNeighbors(n_neighbors=50, metric='cosine', n_jobs=-1)
        self.knn_model.fit(self.author_embeddings)
        print(f"[{self.name}] Treinamento concluído. {len(self.author_ids_index)} perfis criados.")

    def recommend(self, author_id, top_n=10):
        """Return up to ``top_n`` author ids with the closest profiles.

        Args:
            author_id: Author to recommend for.
            top_n: Maximum number of recommendations; values <= 0 give ``[]``.

        Returns:
            List of author ids ordered as returned by the KNN query, excluding
            ``author_id`` itself. Empty when the author has no profile.
        """
        author_idx = self._author_row.get(author_id)
        if author_idx is None or top_n <= 0:
            # TODO: return popular authors for unknown authors.
            return []

        author_vector = self.author_embeddings[author_idx].reshape(1, -1)

        # Ask for one extra neighbour because the query author is itself part
        # of the index, but never more than the number of fitted profiles:
        # kneighbors() raises ValueError when n_neighbors exceeds it.
        n_query = min(top_n + 1, len(self.author_ids_index))
        _, indices = self.knn_model.kneighbors(author_vector, n_neighbors=n_query)

        # Filter the query author explicitly; with identical profiles it is
        # not guaranteed to come back in first position.
        recommendations = []
        for idx in indices[0]:
            rec_author = self.author_ids_index[idx]
            if rec_author != author_id:
                recommendations.append(rec_author)

        return recommendations[:top_n]

In [7]:
def evaluate_models(models, test_ground_truth, train_graph_check, K_values=(5, 10)):
    """Compute mean Precision@K, Recall@K and their F1 for each model.

    Args:
        models: Iterable of objects exposing ``name`` and
            ``recommend(author_id, top_n=...)``.
        test_ground_truth: Mapping author_id -> collection of author ids that
            count as correct recommendations for that author.
        train_graph_check: Mapping author_id -> set of already known
            co-authors; these are removed from the recommendations before
            scoring.
        K_values: Cut-offs to evaluate. Any iterable of ints; the default is
            an immutable tuple so it cannot be altered between calls.

    Returns:
        ``{model.name: {k: {'P': ..., 'R': ..., 'F1': ...}}}`` where P and R
        are averaged over the authors in ``test_ground_truth`` and F1 is the
        harmonic mean of those two averages. With no authors to evaluate, all
        three values are 0 rather than NaN.
    """
    cutoffs = list(K_values)
    # Loop invariant: one recommend() call per author at the largest cut-off.
    max_k = max(cutoffs) if cutoffs else 0
    results = {}

    for model in models:
        print(f"\nAvaliando modelo: {model.name}...")
        model_metrics = {k: {'precision': [], 'recall': []} for k in cutoffs}

        for author_id, actual_new_coauthors in test_ground_truth.items():
            recs = model.recommend(author_id, top_n=max_k)

            # Known co-authors are not new predictions; drop them first.
            past_coauthors = train_graph_check.get(author_id, set())
            recs = [r for r in recs if r not in past_coauthors]

            relevant = set(actual_new_coauthors)

            for k in cutoffs:
                hits = len(relevant.intersection(recs[:k]))

                p = hits / k if k > 0 else 0
                r = hits / len(relevant) if relevant else 0

                model_metrics[k]['precision'].append(p)
                model_metrics[k]['recall'].append(r)

        # Final averages.
        results[model.name] = {}
        for k in cutoffs:
            precisions = model_metrics[k]['precision']
            recalls = model_metrics[k]['recall']
            # np.mean([]) would return NaN and emit a RuntimeWarning.
            avg_p = np.mean(precisions) if precisions else 0.0
            avg_r = np.mean(recalls) if recalls else 0.0
            f1 = 2 * (avg_p * avg_r) / (avg_p + avg_r) if (avg_p + avg_r) > 0 else 0.0

            results[model.name][k] = {'P': avg_p, 'R': avg_r, 'F1': f1}
            print(f"  K={k}: Precision={avg_p:.4f}, Recall={avg_r:.4f}, F1={f1:.4f}")

    return results

In [8]:
# Load the authorship rows and the work metadata from CSV.
authors_df = pd.read_csv('database/authorships.csv')
works_df = pd.read_csv('database/works.csv')

# Columns taken from the works table, and columns that must be non-null.
work_columns = ['id', 'publication_date', 'title', 'abstract', 'language']
required_columns = ['publication_date', 'author_id', 'title', 'abstract', 'language']

# Join, parse dates (unparseable -> NaT), drop incomplete rows and the
# redundant join key, then keep rows whose language is 'en'.
merged_df = (
    authors_df
    .merge(works_df[work_columns], left_on='work_id', right_on='id')
    .assign(
        publication_date=lambda frame: pd.to_datetime(frame['publication_date'], errors='coerce')
    )
    .dropna(subset=required_columns)
    .drop(columns=['id'])
    .loc[lambda frame: frame['language'] == 'en']
)

# Temporal split: works ordered by publication date, the earliest share is
# used for training and the remainder for testing.
train_fraction = 0.8
unique_works = (
    merged_df[['work_id', 'publication_date']]
    .drop_duplicates()
    .sort_values('publication_date')
)
split_idx = int(len(unique_works) * train_fraction)

ordered_work_ids = unique_works['work_id']
train_work_ids = set(ordered_work_ids.iloc[:split_idx])
test_work_ids = set(ordered_work_ids.iloc[split_idx:])

in_train = merged_df['work_id'].isin(train_work_ids)
in_test = merged_df['work_id'].isin(test_work_ids)
train_df = merged_df[in_train]
test_df = merged_df[in_test]

def build_graph(df):
    """Build an undirected co-authorship graph from authorship rows.

    Args:
        df: DataFrame read through its ``work_id`` and ``author_id`` columns;
            rows sharing a ``work_id`` are treated as co-authors.

    Returns:
        ``defaultdict(set)`` mapping each author_id to the set of other
        authors it shares at least one work with. Authors with no co-author
        do not appear as keys.
    """
    graph = defaultdict(set)
    for _, group in df.groupby('work_id'):
        # De-duplicate while keeping first-seen order: an author listed twice
        # on the same work would otherwise get an edge to itself, which later
        # puts the author into its own set of "new" co-authors.
        authors = list(dict.fromkeys(group['author_id'].tolist()))

        if len(authors) > 1:
            for u, v in itertools.permutations(authors, 2):
                graph[u].add(v)

    return graph

# Co-authorship graphs for the past (train) and the future (test) windows.
train_graph = build_graph(train_df)
test_graph_raw = build_graph(test_df)

# Ground truth: for each author in the test graph, the co-authors that were
# not already co-authors in the training graph. Authors with no new
# co-author are left out.
test_ground_truth = defaultdict(set)

for author, future_coauthors in test_graph_raw.items():
    new_links = future_coauthors - train_graph.get(author, set())
    if not new_links:
        continue
    test_ground_truth[author] = new_links

In [11]:
# Fit the graph-based recommender on the training split and score it.
topo_model = TopologyRecommender()
models = [topo_model]

for model in models:
    model.fit(train_df)

metrics = evaluate_models(
    models=models,
    test_ground_truth=test_ground_truth,
    train_graph_check=train_graph,
    K_values=[5, 10, 20, 50],
)

[Topology (Graph Coauthor)] Construindo grafo...
[Topology (Graph Coauthor)] Grafo construído com 5320 autores.

Avaliando modelo: Topology (Graph Coauthor)...
  K=5: Precision=0.0606, Recall=0.0295, F1=0.0397
  K=10: Precision=0.0426, Recall=0.0419, F1=0.0422
  K=20: Precision=0.0290, Recall=0.0583, F1=0.0387
  K=50: Precision=0.0177, Recall=0.0835, F1=0.0292


In [12]:
# Fit the content-based recommender on the training split and score it.
# Note: this rebinds `models` and `metrics`, replacing any earlier values.
content_model = ContentSciBERTRecommender(model_name='allenai/scibert_scivocab_uncased')
models = [content_model]

for model in models:
    model.fit(train_df)

metrics = evaluate_models(
    models=models,
    test_ground_truth=test_ground_truth,
    train_graph_check=train_graph,
    K_values=[5, 10, 20, 50],
)

[Content-Based (SciBERT)] Carregando SciBERT e gerando embeddings...


No sentence-transformers model found with name allenai/scibert_scivocab_uncased. Creating a new one with mean pooling.


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: allenai/scibert_scivocab_uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.decoder.bias               | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

[Content-Based (SciBERT)] Criando perfis de autores...
[Content-Based (SciBERT)] Treinamento concluído. 5320 perfis criados.

Avaliando modelo: Content-Based (SciBERT)...
  K=5: Precision=0.0070, Recall=0.0060, F1=0.0065
  K=10: Precision=0.0052, Recall=0.0077, F1=0.0062
  K=20: Precision=0.0036, Recall=0.0100, F1=0.0053
  K=50: Precision=0.0019, Recall=0.0128, F1=0.0033
