# Summarizing abstracts

In [1]:
pip install transformers


Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import pipeline

# Build the summarization pipeline a single time; later cells reuse `summarizer`.
# Constructing it fetches the facebook/bart-large-cnn files if they are not cached yet.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)


2025-04-02 18:28:45.940661: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from bertopic import BERTopic
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Path to the JSON file with the OpenAlex results (1000 papers).
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
json_path = "/Users/dionnespaltman/Desktop/Luiss /Data Science in Action/Project/openalex_results_clean.json"

# Open and load the JSON data. Decode explicitly as UTF-8 instead of relying on
# the platform's default text encoding.
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert to DataFrame and keep only the papers that have an abstract.
df = pd.DataFrame(data)
df_clean = df[df['abstract'].notna()].copy()

# One document per paper: "<title>. <abstract>". A missing title makes the
# concatenation NaN (a float), so those rows fall back to the abstract alone.
docs = (
    (df_clean['title'] + ". " + df_clean['abstract'])
    .fillna(df_clean['abstract'])
    .tolist()
)

# Fail early with a clear message instead of deep inside the models below.
if not docs:
    raise ValueError("No papers with an abstract were found in the JSON file.")

# Topic modeling: assign each document a topic using a pretrained BERTopic model.
topic_model = BERTopic.load("MaartenGr/BERTopic_Wikipedia")
topics, probs = topic_model.transform(docs)


def _topic_label(topic_id):
    """Return the model's label for ``topic_id``, or "Unknown" if it has none.

    Topic -1 is always reported as "Unknown". The lookup is done by key rather
    than by comparing the id against the number of labels, so ids that are
    absent from ``topic_labels_`` yield "Unknown" instead of raising.
    """
    if topic_id == -1:
        return "Unknown"
    try:
        return topic_model.topic_labels_[topic_id]
    except (KeyError, IndexError):
        return "Unknown"


df_clean['topic_id'] = topics
df_clean['topic_label'] = df_clean['topic_id'].apply(_topic_label)

# Embedding: encode every document with a sentence-transformer model and keep
# one vector per row in the DataFrame.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(docs, show_progress_bar=True)
df_clean['embedding'] = list(embeddings)

# Finalize main paper DataFrame and stack the per-row vectors into one 2-D array.
papers_df = df_clean.copy()
paper_embeddings = np.vstack(papers_df['embedding'].values)


In [None]:
# Use the first paper's abstract as the test input.
text_to_summarize = papers_df.iloc[0]['abstract']

# The model accepts at most 1024 input tokens; truncation=True asks the pipeline
# to cut longer abstracts down to that limit instead of failing on them.
summary = summarizer(
    text_to_summarize,
    max_length=80,
    min_length=30,
    do_sample=False,
    truncation=True,
)[0]['summary_text']

print("📄 Original abstract:\n", text_to_summarize)
print("\n📝 Summary:\n", summary)
