In [2]:
# Paper ID handed to the Semantic Scholar citation lookup in the cells below.
main_paper_id="87875a07976c26f82705de1fc70041169e5d652b"

In [1]:
import requests
import time
from collections import deque
import os
import zipfile
import pdfplumber
from tqdm import tqdm

In [3]:
def get_citations_from_semantic_scholar(paper_id):
    """Fetch the raw ``citations`` list of a paper from the Semantic Scholar v1 API.

    Args:
        paper_id: Identifier interpolated into the ``/v1/paper/{paper_id}`` URL.

    Returns:
        The value stored under ``"citations"`` in the JSON response (an empty
        list when the key is absent). For any HTTP status other than 200 the
        status code is printed and an empty list is returned.
    """
    response = requests.get(f"https://api.semanticscholar.org/v1/paper/{paper_id}")
    # Guard clause: anything but 200 is reported and treated as "no citations".
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []
    return response.json().get("citations", [])

In [4]:
# Try the lookup on the main paper; the cell output below is the returned citations list.
get_citations_from_semantic_scholar(main_paper_id)

[{'arxivId': '2412.16075',
  'authors': [{'authorId': '2336860219', 'name': 'Kaiyu Yang'},
   {'authorId': '2113249490', 'name': 'Gabriel Poesia'},
   {'authorId': '2337844852', 'name': 'Jingxuan He'},
   {'authorId': '2336828803', 'name': 'Wenda Li'},
   {'authorId': '2336739192', 'name': 'Kristin Lauter'},
   {'authorId': '2248225759', 'name': 'Swarat Chaudhuri'},
   {'authorId': '2336822895', 'name': 'Dawn Song'}],
  'doi': None,
  'intent': [],
  'isInfluential': True,
  'paperId': '7899f3ec633080ac9d9b6458f1e1c35e86e6ec5c',
  'title': 'Formal Mathematical Reasoning: A New Frontier in AI',
  'url': 'https://www.semanticscholar.org/paper/7899f3ec633080ac9d9b6458f1e1c35e86e6ec5c',
  'venue': '',
  'year': 2024},
 {'arxivId': '2412.14141',
  'authors': [{'authorId': '2335869617', 'name': 'Tianyang Gu'},
   {'authorId': '2336059081', 'name': 'Jingjin Wang'},
   {'authorId': '2336026962', 'name': 'Zhihao Zhang'},
   {'authorId': '2336144054', 'name': 'HaoHong Li'}],
  'doi': None,
  'in

In [14]:
import os
import requests
import time
from collections import deque
import csv
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Paper:
    """One citing paper, as listed in a Semantic Scholar ``citations`` entry.

    Instances are built by ``get_citations_from_semantic_scholar`` from the raw
    citation dictionaries; each field mirrors one key of such a dictionary.
    """
    arxiv_id: Optional[str]  # from 'arxivId'; None when the entry has no arXiv ID
    authors: List[dict]  # from 'authors'; the CSV writer reads each dict's 'name'
    doi: Optional[str]  # from 'doi'
    intent: List[str]  # from 'intent'
    is_influential: bool  # from 'isInfluential'
    paper_id: str  # from 'paperId'; used as the key for crawling and de-duplication
    title: str  # from 'title'
    url: Optional[str]  # from 'url'
    venue: Optional[str]  # from 'venue'
    year: Optional[int]  # from 'year'

def get_citations_from_semantic_scholar(paper_id: str) -> Optional[List["Paper"]]:
    """Fetch the papers citing ``paper_id`` from the Semantic Scholar v1 API.

    Args:
        paper_id: Identifier interpolated into the ``/v1/paper/{paper_id}`` URL.

    Returns:
        * a list of ``Paper`` objects (possibly empty) on HTTP 200;
        * an empty list for any other non-429 status (the error is printed and
          the paper is treated as having no citations);
        * ``None`` when the request could not be completed (HTTP 429 rate limit,
          timeout or connection error). ``find_all_citations`` checks for
          ``None`` to stop the crawl, so a throttled paper is not silently
          recorded as "no citations".
    """
    url = f"https://api.semanticscholar.org/v1/paper/{paper_id}"
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Request failed for Paper ID: {paper_id}: {exc}")
        return None

    if response.status_code == 429:
        print(f"Rate limit exceeded for Paper ID: {paper_id}. Exiting.")
        return None
    if response.status_code != 200:
        print(f"Error: {response.status_code} for Paper ID: {paper_id}")
        return []

    # `or []` also covers a JSON body that carries an explicit `"citations": null`.
    citations_data = response.json().get("citations") or []
    return [
        Paper(
            arxiv_id=citation.get('arxivId'),
            authors=citation.get('authors', []),
            doi=citation.get('doi'),
            intent=citation.get('intent', []),
            is_influential=citation.get('isInfluential', False),
            paper_id=citation['paperId'],
            title=citation.get('title', ''),
            url=citation.get('url'),
            venue=citation.get('venue', ''),
            year=citation.get('year')
        )
        for citation in citations_data
    ]

def save_queue(queue, filename="data/queue.txt"):
    """Write the pending crawl queue to ``filename``, one ``paper_id,depth`` per line.

    Args:
        queue: Iterable of ``(paper_id, depth)`` pairs, written in iteration order.
        filename: Destination path; an existing file is overwritten.
    """
    # Create the target directory first: failing here with FileNotFoundError
    # would throw away the crawl progress this function exists to preserve.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, 'w') as file:
        file.writelines(f"{paper_id},{depth}\n" for paper_id, depth in queue)

def load_queue(filename="data/queue.txt"):
    """Load the crawl queue written by ``save_queue``.

    Args:
        filename: Path of the queue file (``paper_id,depth`` per line).

    Returns:
        A ``deque`` of ``(paper_id, depth)`` tuples in file order; empty when
        the file does not exist. Blank lines are ignored and lines that are not
        of the form ``paper_id,<int>`` are reported and skipped, so a partially
        written file does not make the whole queue unreadable.
    """
    queue = deque()
    if not os.path.exists(filename):
        return queue

    with open(filename, 'r') as file:
        for line in file:
            entry = line.strip()
            if not entry:
                continue
            paper_id, separator, depth = entry.rpartition(',')
            try:
                if not separator or not paper_id:
                    raise ValueError(entry)
                queue.append((paper_id, int(depth)))
            except ValueError:
                print(f"Skipping malformed queue entry: {entry!r}")
    return queue

def save_visited(visited_papers, arxiv_ids, filename="data/visited.txt"):
    """Write the visited paper IDs and collected arXiv IDs to ``filename``.

    File layout: one visited paper ID per line, a ``---`` separator line, then
    one arXiv ID per line. An existing file is overwritten.

    Args:
        visited_papers: Iterable of paper IDs already crawled.
        arxiv_ids: Iterable of arXiv IDs found so far.
        filename: Destination path.
    """
    # Create the target directory so saving progress cannot fail just because
    # the data folder has not been created yet.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, 'w') as file:
        file.writelines(f"{paper_id}\n" for paper_id in visited_papers)
        file.write("---\n")
        file.writelines(f"{arxiv_id}\n" for arxiv_id in arxiv_ids)

def load_visited(filename="data/visited.txt"):
    """Load the state written by ``save_visited``.

    Args:
        filename: Path of the state file: visited paper IDs, a ``---``
            separator line, then arXiv IDs.

    Returns:
        ``(visited_papers, arxiv_ids)`` as two sets; both empty when the file
        does not exist. Blank lines are ignored. If the separator is missing
        (empty or truncated file) every entry is treated as a visited paper ID
        instead of raising ``ValueError``.
    """
    visited_papers = set()
    arxiv_ids = set()
    if not os.path.exists(filename):
        return visited_papers, arxiv_ids

    # Entries go to `visited_papers` until the separator is seen, then to `arxiv_ids`.
    target = visited_papers
    with open(filename, 'r') as file:
        for line in file:
            entry = line.strip()
            if entry == '---':
                target = arxiv_ids
            elif entry:
                target.add(entry)
    return visited_papers, arxiv_ids

def find_all_citations(paper_id: str, delay=0.1, queue_file="data/queue.txt", visited_file="data/visited.txt"):
    """Breadth-first crawl of the citation graph, resumable across runs.

    The queue and the visited / arXiv-ID sets are loaded from ``queue_file`` and
    ``visited_file`` and written back when the crawl finishes, is interrupted
    with Ctrl-C, stops because the lookup returned ``None``, or fails with an
    unexpected exception.

    Args:
        paper_id: Root paper; only used when the stored queue is empty.
        delay: Seconds to sleep after each processed paper.
        queue_file: Path used by ``load_queue`` / ``save_queue``.
        visited_file: Path used by ``load_visited`` / ``save_visited``.

    Returns:
        The ``Paper`` objects collected during this call (duplicates possible
        when several crawled papers share a citing paper).
    """
    papers = []

    # Restore the queue, the visited papers and the arXiv IDs found so far.
    queue = load_queue(queue_file)
    visited_papers, arxiv_ids = load_visited(visited_file)

    if not queue:
        queue.append((paper_id, 0))

    # Queue entry that was popped but whose citations were not fetched yet.
    # It goes back to the front of the queue when the crawl stops early, so the
    # next run retries it instead of treating it as done.
    in_flight = None
    try:
        while queue:
            in_flight = queue.popleft()
            current_paper_id, depth = in_flight
            if current_paper_id in visited_papers:
                in_flight = None
                continue

            citations = get_citations_from_semantic_scholar(current_paper_id)
            if citations is None:
                break

            # Mark as visited only after a successful fetch.
            visited_papers.add(current_paper_id)
            in_flight = None

            print(f"Depth: {depth}, Unique ArXiv IDs found: {len(arxiv_ids)}")
            for citation in citations:
                citation_id = citation.paper_id
                if citation.arxiv_id:
                    arxiv_ids.add(citation.arxiv_id)
                papers.append(citation)
                if citation_id not in visited_papers:
                    queue.append((citation_id, depth + 1))
            time.sleep(delay)
    except KeyboardInterrupt:
        print("\nProcess interrupted. Progress saved.")
    finally:
        # Persist progress on every exit path, not only on Ctrl-C.
        if in_flight is not None:
            queue.appendleft(in_flight)
        save_queue(queue, queue_file)
        save_visited(visited_papers, arxiv_ids, visited_file)
    return papers

def save_papers_to_csv(papers: List["Paper"], filename="data/arxiv_ids.csv"):
    """Append ``papers`` to a CSV file, writing the header row when needed.

    Args:
        papers: Objects exposing the ``Paper`` attributes. ``authors`` is a
            list of dicts with a ``name`` key and ``intent`` a list of strings;
            either may be ``None``, which is written as an empty cell.
        filename: CSV path. The parent directory is created if missing. The
            header is written when the file does not exist or is empty.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # An existing but empty file still needs the header row.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    with open(filename, mode="a", newline="") as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(["arxiv_id", "authors", "doi", "intent", "is_influential", "paper_id", "title", "url", "venue", "year"])
        for paper in papers:
            # Skip missing names so one incomplete author record cannot abort the export.
            author_names = [author.get('name') for author in (paper.authors or [])]
            authors = ", ".join(name for name in author_names if name)
            intent = "; ".join(paper.intent or [])
            writer.writerow([paper.arxiv_id, authors, paper.doi, intent, paper.is_influential, paper.paper_id, paper.title, paper.url, paper.venue, paper.year])

def load_existing_papers(filename="data/arxiv_ids.csv"):
    """Collect the ``paper_id`` values already stored in the papers CSV.

    Args:
        filename: CSV written by ``save_papers_to_csv``; ``paper_id`` is the
            sixth column.

    Returns:
        A set of paper IDs; empty when the file is missing or has no data
        rows. Blank or short rows are skipped instead of raising.
    """
    existing_papers = set()
    if not os.path.exists(filename):
        return existing_papers

    paper_id_column = 5
    with open(filename, mode="r", newline="") as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header; the default keeps an empty file from raising.
        for row in reader:
            if len(row) > paper_id_column:
                existing_papers.add(row[paper_id_column])
    return existing_papers

main_paper_id = "87875a07976c26f82705de1fc70041169e5d652b"
existing_papers = load_existing_papers("data/arxiv_ids.csv")
papers = find_all_citations(main_paper_id, delay=0.1)

# Append only papers that are not in the CSV yet, and only once each.
# Without this filter every re-run (and every paper cited from several crawled
# papers) adds duplicate rows, which the download step would then repeat.
seen_paper_ids = set(existing_papers)
new_papers = []
for paper in papers:
    if paper.paper_id not in seen_paper_ids:
        seen_paper_ids.add(paper.paper_id)
        new_papers.append(paper)

print(f"\nЗбереження інформації про {len(new_papers)} пейперів.")
save_papers_to_csv(new_papers, "data/arxiv_ids.csv")
print("Дані збережено у файл 'data/arxiv_ids.csv'. Прогрес збережено.")



Depth: 1, Unique ArXiv IDs found: 163
Depth: 1, Unique ArXiv IDs found: 173
Depth: 1, Unique ArXiv IDs found: 174
Depth: 1, Unique ArXiv IDs found: 178
Depth: 1, Unique ArXiv IDs found: 187
Depth: 1, Unique ArXiv IDs found: 187
Depth: 1, Unique ArXiv IDs found: 187
Depth: 1, Unique ArXiv IDs found: 188
Depth: 1, Unique ArXiv IDs found: 188
Depth: 1, Unique ArXiv IDs found: 189
Depth: 1, Unique ArXiv IDs found: 190
Depth: 1, Unique ArXiv IDs found: 200
Depth: 1, Unique ArXiv IDs found: 203
Depth: 1, Unique ArXiv IDs found: 203
Depth: 1, Unique ArXiv IDs found: 203
Depth: 1, Unique ArXiv IDs found: 204
Depth: 1, Unique ArXiv IDs found: 204
Depth: 1, Unique ArXiv IDs found: 208
Depth: 1, Unique ArXiv IDs found: 210
Depth: 1, Unique ArXiv IDs found: 214
Depth: 1, Unique ArXiv IDs found: 216
Depth: 1, Unique ArXiv IDs found: 219
Depth: 1, Unique ArXiv IDs found: 220
Depth: 1, Unique ArXiv IDs found: 221
Depth: 1, Unique ArXiv IDs found: 222
Depth: 1, Unique ArXiv IDs found: 222
Depth: 1, Un

In [15]:
import os
import requests
import zipfile
import time
import pdfplumber
from tqdm import tqdm
import csv

def download_paper(arxiv_id, save_dir="data/arxiv_papers"):
    """Download the PDF of an arXiv paper into ``save_dir``.

    Args:
        arxiv_id: arXiv identifier, interpolated into ``https://arxiv.org/pdf/{arxiv_id}.pdf``.
        save_dir: Target directory; created if it does not exist.

    Returns:
        Path of the saved PDF, or ``None`` when the request fails (network
        error, timeout) or the HTTP status is not 200.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_dir, exist_ok=True)

    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    try:
        # A timeout keeps one stalled download from blocking the whole batch.
        response = requests.get(url, timeout=60)
    except requests.RequestException as e:
        print(f"Помилка при завантаженні {arxiv_id}: {e}")
        return None

    if response.status_code != 200:
        return None

    # Old-style IDs such as "hep-th/9901001" contain a slash, which would
    # point into a sub-directory that does not exist; flatten it for the file name.
    safe_name = str(arxiv_id).replace("/", "_")
    file_path = os.path.join(save_dir, f"{safe_name}.pdf")
    with open(file_path, 'wb') as f:
        f.write(response.content)
    return file_path

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The page texts joined with newlines, or ``None`` when the PDF cannot be
        processed (the error is printed).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # A page may yield no text; substituting "" keeps one such page
            # from discarding the whole document via a TypeError.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    except Exception as e:
        # Deliberately broad: any parsing failure just skips this paper.
        print(f"Помилка при обробці PDF: {e}")
        return None
    # Join with newlines so the last word of a page does not fuse with the
    # first word of the next one.
    return "\n".join(page_texts)

def add_to_zip(zip_file, file_path):
    """Add ``file_path`` to the open archive ``zip_file`` under its base name only."""
    archive_name = os.path.basename(file_path)
    zip_file.write(file_path, arcname=archive_name)

def load_processed_ids(filename="data/processed_ids.txt"):
    """Return the set of arXiv IDs already processed (one per line in ``filename``).

    A missing file yields an empty set.
    """
    if not os.path.exists(filename):
        return set()
    with open(filename, 'r') as f:
        return {line.strip() for line in f}

def save_processed_id(arxiv_id, filename="data/processed_ids.txt"):
    """Append one processed arXiv ID, followed by a newline, to ``filename``."""
    with open(filename, 'a') as f:
        print(arxiv_id, file=f)

def download_and_extract_all_papers(arxiv_ids_file="data/arxiv_ids.csv", save_dir="data/arxiv_papers", zip_filename="data/papers_archive.zip", processed_ids_file="data/processed_ids.txt"):
    """Download every paper listed in the CSV, extract its text and archive it.

    For each arXiv ID that is not yet recorded as processed, the PDF is
    downloaded, its text is written to a ``.txt`` file next to the PDF and that
    file is added to the zip archive. IDs whose download succeeded are recorded
    in ``processed_ids_file`` so an interrupted run can resume.

    Args:
        arxiv_ids_file: CSV with an ``arxiv_id`` column.
        save_dir: Directory for the downloaded PDFs and extracted text files.
        zip_filename: Archive that receives the text files (opened in append mode).
        processed_ids_file: File tracking the IDs that were already handled.
    """
    # Read the arXiv IDs from the CSV, dropping empty cells and duplicates
    # (the CSV can list the same paper several times) while keeping order.
    with open(arxiv_ids_file, 'r', newline='') as f:
        arxiv_ids = list(dict.fromkeys(
            row['arxiv_id'] for row in csv.DictReader(f) if row['arxiv_id']
        ))

    total_papers = len(arxiv_ids)
    processed_ids = load_processed_ids(processed_ids_file)

    with zipfile.ZipFile(zip_filename, 'a', zipfile.ZIP_DEFLATED) as zip_file:
        with tqdm(total=total_papers, desc="Завантаження і обробка пейперів", unit="пейпер", ncols=100) as pbar:
            start_time = time.time()
            for i, arxiv_id in enumerate(arxiv_ids, 1):
                if arxiv_id in processed_ids:
                    # Skip papers handled by an earlier run.
                    pbar.update(1)
                    continue

                pdf_path = download_paper(arxiv_id, save_dir)
                if pdf_path:
                    # Extract the text from the PDF.
                    text = extract_text_from_pdf(pdf_path)
                    if text:
                        # Write the text next to the PDF (same base name) and archive it.
                        text_filename = os.path.splitext(pdf_path)[0] + ".txt"
                        with open(text_filename, 'w', encoding='utf-8') as text_file:
                            text_file.write(text)
                        add_to_zip(zip_file, text_filename)

                    # Record the ID on disk and in memory.
                    save_processed_id(arxiv_id, processed_ids_file)
                    processed_ids.add(arxiv_id)

                    pbar.set_postfix({'Залишилось': f"{total_papers - i} пейперів"})
                else:
                    pbar.set_postfix({'Залишилось': f"{total_papers - i} пейперів", 'Статус': 'Помилка'})
                pbar.update(1)

            elapsed_time = time.time() - start_time
            # The loop is finished here, so nothing remains. The old formula
            # also evaluated to 0 but read the loop variable, which is unbound
            # when the CSV contains no IDs.
            estimated_time_remaining = 0.0
            print(f"Час виконання: {elapsed_time:.2f} сек. Оцінка часу до завершення: {estimated_time_remaining:.2f} сек.")

    print(f"Всі файли архівовані в: {zip_filename}")

# Run the download, text-extraction and archiving pipeline for the IDs in the CSV.
download_and_extract_all_papers(arxiv_ids_file="data/arxiv_ids.csv", save_dir="data/arxiv_papers", zip_filename="data/papers_archive.zip")



Завантаження і обробка пейперів:   2%| | 21/1400 [03:07<3:24:41,  8.91s/пейпер, Залишилось=1379 пейп


KeyboardInterrupt: 

In [8]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Load GPT-Neo 1.3B and its tokenizer by model identifier.
model_name = "EleutherAI/gpt-neo-1.3B"
model = GPTNeoForCausalLM.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Save both under ./local_model; a later cell loads them from that directory.
model.save_pretrained("local_model")
tokenizer.save_pretrained("local_model")


('local_model\\tokenizer_config.json',
 'local_model\\special_tokens_map.json',
 'local_model\\vocab.json',
 'local_model\\merges.txt',
 'local_model\\added_tokens.json')

In [13]:
import zipfile
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
import os
from tqdm import tqdm

# Identifier of the original checkpoint; only referenced by the commented-out lines below.
model_name = "EleutherAI/gpt-neo-1.3B"
# model = GPTNeoForCausalLM.from_pretrained(model_name)
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Load the model and tokenizer from the copy saved under ./local_model.
model = GPTNeoForCausalLM.from_pretrained("local_model")
tokenizer = GPT2Tokenizer.from_pretrained("local_model")

# NOTE(review): the download cell writes "data/papers_archive.zip", while this
# path points at the working directory — confirm the archive was moved or fix the path.
zip_file_path = "papers_archive.zip"
extracted_files_dir = "extracted_papers"

# Unpack every archive member into extracted_files_dir.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_files_dir)
def summarize_text(text, max_length=200):
    """Generate a continuation of ``text`` with the loaded causal language model.

    Args:
        text: Input text; tokenized and truncated to at most 1024 tokens.
        max_length: Maximum number of new tokens to generate.

    Returns:
        Only the newly generated text. ``generate`` returns the prompt tokens
        followed by the new tokens, so the prompt is sliced off before
        decoding; otherwise the "summary" would start with a copy of the input.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    # Pass pad_token_id explicitly, as the other generation cell in this notebook does.
    outputs = model.generate(**inputs, max_new_tokens=max_length, pad_token_id=tokenizer.eos_token_id)
    prompt_length = len(inputs["input_ids"][0])
    summary = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return summary

# Only the extracted plain-text files are summarised.
file_list = [f for f in os.listdir(extracted_files_dir) if f.endswith(".txt")]

# One summarize_text result per file, in os.listdir order.
summaries = []
with tqdm(total=len(file_list), desc="Обробка файлів", unit="файл") as pbar:
    for file_name in file_list:
        file_path = os.path.join(extracted_files_dir, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
            # Generation runs while the file handle is still open.
            summary = summarize_text(text)
            summaries.append(summary)
        pbar.update(1)

# Join all per-file results and run summarize_text once more on the joined text.
# summarize_text truncates its input to 1024 tokens, so only the beginning of
# the joined text reaches the model.
full_summary = " ".join(summaries)
final_summary = summarize_text(full_summary)

print("Фінальний висновок про статті:")
print(final_summary)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Обробка файлів:   0%|          | 1/631 [1:53:15<1189:12:41, 6795.49s/файл]


KeyboardInterrupt: 

In [None]:
import os
import zipfile
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from tqdm import tqdm

# Path to the archive of extracted paper texts.
zip_file_path = "papers_archive.zip"

# Load the model and tokenizer; the model is moved to the GPU when CUDA is
# available and stays on the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "EleutherAI/gpt-neo-1.3B"
model = GPTNeoForCausalLM.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Generate a summary for a single archive member.
def process_file(file_name, content):
    """Generate text from the beginning of one file with the loaded model.

    Args:
        file_name: Name of the archive member (not used in the computation).
        content: Raw UTF-8 bytes of the file.

    Returns:
        Only the newly generated text (up to 150 new tokens). ``generate``
        returns the prompt tokens followed by the new ones, so the prompt is
        sliced off before decoding; otherwise every stored "summary" would
        start with a copy of the input.
    """
    input_text = content.decode("utf-8")[:1000]  # Only the first 1000 characters are used as the prompt.
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=150, pad_token_id=tokenizer.eos_token_id)
    prompt_length = len(inputs["input_ids"][0])
    summary = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return summary

# Read the archive members and summarise each text file.
def process_files_from_zip(zip_file_path):
    """Summarise every ``.txt`` member of the archive at ``zip_file_path``.

    Members are read straight from the archive; each one is passed to
    ``process_file`` and the result is stored with ``save_summary``. Members
    with any other extension are skipped.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        for member_name in tqdm(archive.namelist(), desc="Обробка файлів"):
            if not member_name.endswith('.txt'):
                continue
            raw_bytes = archive.read(member_name)
            save_summary(member_name, process_file(member_name, raw_bytes))

# Store one generated summary on disk.
def save_summary(file_name, summary):
    """Write ``summary`` to ``summary_<file_name>.txt`` in the working directory.

    Forward slashes in ``file_name`` are replaced with underscores so nested
    archive members map to a flat file name.
    """
    flat_name = file_name.replace('/', '_')
    with open(f"summary_{flat_name}.txt", "w", encoding="utf-8") as out_file:
        out_file.write(summary)

# Run the summarisation over every text file in the archive.
process_files_from_zip(zip_file_path)



Обробка файлів:   5%|▍         | 31/631 [1:55:27<36:24:35, 218.46s/it]