In [1]:
!pip -qqq install langchain-groq --progress-bar off
!pip -qqq install torch --progress-bar off
!pip -qqq install markdown --progress-bar off
!pip -qqq install beautifulsoup4 --progress-bar off
!pip install sentence-transformers
!pip install langchain
!pip install httpx==0.23.0
!pip install langchain-ollama

Collecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/8b/c8/990e22a465e4771338da434d799578865d6d7ef1fdb50bd844b7ecdcfa19/sentence_transformers-3.3.1-py3-none-any.whl.metadata
  Using cached sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Obtaining dependency information for transformers<5.0.0,>=4.41.0 from https://files.pythonhosted.org/packages/45/d6/a69764e89fc5c2c957aa473881527c8c35521108d553df703e9ba703daeb/transformers-4.48.0-py3-none-any.whl.metadata
  Using cached transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
Collecting tqdm (from sentence-transformers)
  Obtaining dependency information for tqdm from https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl.metadata
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
C

In [2]:
!pip install langchain-together

Collecting langchain-together
  Obtaining dependency information for langchain-together from https://files.pythonhosted.org/packages/a8/69/b3cbcf5b43acbc098c012ef75035fb0dc1e0f227f5161329ef9884a25ba4/langchain_together-0.3.0-py3-none-any.whl.metadata
  Using cached langchain_together-0.3.0-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain-openai<0.4,>=0.3 (from langchain-together)
  Obtaining dependency information for langchain-openai<0.4,>=0.3 from https://files.pythonhosted.org/packages/4a/9c/b38e308ac668f6db067b424a2a78e5b865753c144a119456f008a09230db/langchain_openai-0.3.0-py3-none-any.whl.metadata
  Using cached langchain_openai-0.3.0-py3-none-any.whl.metadata (2.7 kB)
Collecting openai<2.0.0,>=1.58.1 (from langchain-openai<0.4,>=0.3->langchain-together)
  Obtaining dependency information for openai<2.0.0,>=1.58.1 from https://files.pythonhosted.org/packages/70/45/6de8e5fd670c804b29c777e4716f1916741c71604d5c7d952eee8432f7d3/openai-1.59.6-py3-none-any.whl.metadata
  Using ca

In [6]:
import os
import pandas as pd
from langchain_groq import ChatGroq
import json
import markdown
import bs4
from typing import Dict, List, Any
import yaml
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache
import hashlib
from tqdm import tqdm
import time
import asyncio
from collections import deque
from datetime import datetime, timedelta
from langchain_ollama import ChatOllama
from langchain_together import ChatTogether

class TokenBucket:
    """Asynchronous token-bucket rate limiter.

    The bucket starts full, holds at most ``tokens_per_minute`` tokens and is
    refilled continuously at ``tokens_per_minute`` tokens per minute. Callers
    either try to take tokens without blocking (``get_tokens``) or wait until
    enough tokens are available (``wait_for_tokens``).
    """

    def __init__(self, tokens_per_minute: int = 5000):
        """Create a full bucket.

        Args:
            tokens_per_minute: Both the refill rate and the bucket capacity.
        """
        self.capacity = tokens_per_minute
        self.tokens = tokens_per_minute
        self.last_updated = datetime.now()
        self.tokens_per_minute = tokens_per_minute
        self.lock = asyncio.Lock()

    async def get_tokens(self, requested_tokens: int) -> bool:
        """Try to take ``requested_tokens`` from the bucket without waiting.

        The bucket is refilled for the time elapsed since the previous call
        before the request is evaluated.

        Returns:
            True if the tokens were deducted, False if the bucket currently
            holds fewer than ``requested_tokens``.
        """
        async with self.lock:
            now = datetime.now()
            # datetime.now() is wall-clock time; clamp at zero so a clock
            # adjustment backwards can never *remove* tokens from the bucket.
            minutes_elapsed = max(
                0.0, (now - self.last_updated).total_seconds() / 60.0
            )

            # Refill tokens based on time passed, never above capacity.
            self.tokens = min(
                self.capacity,
                self.tokens + (self.tokens_per_minute * minutes_elapsed)
            )
            self.last_updated = now

            if self.tokens >= requested_tokens:
                self.tokens -= requested_tokens
                return True
            return False

    async def wait_for_tokens(self, requested_tokens: int):
        """Block (cooperatively) until ``requested_tokens`` have been taken.

        A request larger than the bucket capacity could never be satisfied and
        would previously loop forever, so it is capped at ``capacity``: such a
        request drains a full bucket and then proceeds.
        """
        requested_tokens = max(0, min(requested_tokens, self.capacity))
        while not await self.get_tokens(requested_tokens):
            await asyncio.sleep(1)

class PaperContentExtractor:
    """Extract structured research features from a markdown paper using an LLM.

    A paper is split into sections on markdown headings, each section is sent
    to the Groq-hosted chat model with a JSON-extraction prompt, per-section
    results are cached on disk (keyed by a hash of the section text) and the
    section results are merged into one feature dict per paper.
    """

    # Maximum number of times a rate-limited request is retried before the
    # section is given up on (the original retried without any bound).
    MAX_RATE_LIMIT_RETRIES = 5

    def __init__(self, groq_api_key: str, max_workers: int = 3, cache_dir: str = None):
        """Create the extractor.

        Args:
            groq_api_key: Groq API key. If empty, the ``GROQ_API_KEY``
                environment variable is used instead.
            max_workers: Maximum number of LLM requests in flight at once.
            cache_dir: Directory for the per-section JSON cache. Defaults to
                ``./feature_cache`` under the current working directory.

        Raises:
            ValueError: If no API key is supplied and none is in the environment.
        """
        # Never embed credentials in the notebook: use the caller-supplied key
        # (or the environment) instead of a hard-coded literal.
        api_key = groq_api_key or os.environ.get("GROQ_API_KEY")
        if not api_key:
            raise ValueError("Please provide GROQ_API_KEY")

        # NOTE(review): confirm this model id is still served by Groq.
        self.llm = ChatGroq(
            temperature=0,
            api_key=api_key,
            model_name="llama-3.1-70b-versatile"
        )
        self.max_workers = max_workers
        # The LLM client call is blocking; running it on a bounded thread pool
        # keeps the event loop responsive and caps concurrency at max_workers.
        self._executor = ThreadPoolExecutor(max_workers=max(1, max_workers))
        self.token_bucket = TokenBucket(tokens_per_minute=5000)
        self.cache_dir = cache_dir or os.path.join(os.getcwd(), 'feature_cache')
        os.makedirs(self.cache_dir, exist_ok=True)

        self.section_keywords = {
            'introduction': ['introduction', 'background', 'overview', 'abstract'],
            'methodology': ['method', 'approach', 'implementation', 'proposed', 'architecture', 'system', 'model'],
            'results': ['result', 'evaluation', 'experiment', 'performance', 'analysis', 'finding'],
            'conclusion': ['conclusion', 'discussion', 'future', 'summary']
        }

    @staticmethod
    def _get_section_hash(content: str) -> str:
        """Return the hex MD5 digest of ``content`` (used only as a cache key)."""
        return hashlib.md5(content.encode()).hexdigest()

    @staticmethod
    def _coerce_text(value) -> str:
        """Convert an arbitrary JSON value into a stripped string.

        ``None`` becomes ``""``, lists/tuples are joined with ``", "`` and any
        other non-string value is passed through ``str``.
        """
        if value is None:
            return ""
        if isinstance(value, str):
            return value.strip()
        if isinstance(value, (list, tuple)):
            parts = [str(item).strip() for item in value if item is not None]
            return ", ".join(part for part in parts if part)
        return str(value).strip()

    def _split_by_sections(self, content: str) -> List[tuple]:
        """Split markdown text into ``(heading, body)`` tuples.

        A line matching ``#+ <title>`` starts a new section. Non-empty lines
        seen before the first heading are collected under the synthetic
        heading ``"Introduction"``. Body lines are stripped, blank lines are
        dropped, and sections without any body text are omitted.
        """
        sections = []
        current_section = None
        current_content = []

        for raw_line in content.split('\n'):
            line = raw_line.strip()

            # Check for markdown headers (# style)
            header_match = re.match(r'^#+\s+(.+)$', line)
            if header_match:
                if current_section and current_content:
                    sections.append((current_section, '\n'.join(current_content)))
                current_section = header_match.group(1).strip()
                current_content = []
            elif line and current_section is not None:
                current_content.append(line)
            elif line and not sections:  # Content before first heading
                current_section = "Introduction"
                current_content.append(line)

        # Add the last section
        if current_section and current_content:
            sections.append((current_section, '\n'.join(current_content)))

        print(f"Found sections: {[section[0] for section in sections]}")
        return sections

    def _get_cache_path(self, section_hash: str) -> str:
        """Return the path of the JSON cache file for ``section_hash``."""
        return os.path.join(self.cache_dir, f"{section_hash}.json")

    def _load_from_cache(self, section_hash: str) -> Dict:
        """Load cached features for ``section_hash``.

        Returns:
            The cached dict, or None when the file is missing, unreadable,
            not valid JSON, or does not contain a JSON object.
        """
        cache_path = self._get_cache_path(section_hash)
        try:
            with open(cache_path, 'r') as f:
                cached = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file or corrupt JSON: treat as a cache miss.
            return None
        return cached if isinstance(cached, dict) else None

    def _save_to_cache(self, section_hash: str, features: Dict):
        """Write ``features`` as JSON to the cache file for ``section_hash``."""
        cache_path = self._get_cache_path(section_hash)
        with open(cache_path, 'w') as f:
            json.dump(features, f)

    def _get_empty_features(self) -> Dict:
        """Return a fresh feature dict with every expected key set to ``""``."""
        return {
            "Topic of Research": "",
            "Research Objective": "",
            "Methodology": "",
            "Results": "",
            "Novelty Claims": "",
            "Evaluation Metrics": "",
            "Category of Research": ""
        }

    def _normalize_features(self, raw: Dict) -> Dict:
        """Project ``raw`` onto the expected keys with string values."""
        features = self._get_empty_features()
        for key in features:
            features[key] = self._coerce_text(raw.get(key))
        return features

    def _parse_llm_json(self, text) -> Dict:
        """Parse the model reply into a normalized feature dict.

        The reply is first parsed as-is; if that fails, the outermost
        ``{...}`` span is tried so replies wrapped in code fences or
        surrounded by extra prose are still accepted.

        Returns:
            The normalized feature dict, or None if no JSON object is found.
        """
        if not isinstance(text, str):
            return None

        candidates = [text]
        embedded = re.search(r'\{.*\}', text, re.DOTALL)
        if embedded:
            candidates.append(embedded.group(0))

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except ValueError:
                continue
            if isinstance(parsed, dict):
                return self._normalize_features(parsed)
        return None

    async def _extract_section_features(self, section: str) -> Dict:
        """Ask the LLM to extract features from one section.

        Returns the normalized feature dict, or an all-empty feature dict when
        the call fails, the reply cannot be parsed, or the rate limit is still
        exceeded after ``MAX_RATE_LIMIT_RETRIES`` retries.
        """
        # Estimate tokens (rough approximation). Cap at the bucket capacity so
        # a very long section cannot request more than the bucket can hold.
        estimated_tokens = min(
            len(section.split()) * 1.5,
            self.token_bucket.capacity
        )

        prompt = f"""Analyze this research paper section and extract key information.
        Return a JSON object with these keys (use empty string if information is not found):
        {{
            "Topic of Research": "The main research topic or focus area",
            "Research Objective": "The specific goals or objectives",
            "Methodology": "Methods, approaches, or techniques used",
            "Results": "Key findings or outcomes",
            "Novelty Claims": "Claims about new contributions or innovations",
            "Evaluation Metrics": "Metrics used to evaluate results",
            "Category of Research": "Type of research (e.g., empirical, theoretical, applied)"
        }}
        STRICTLY return in JSON format only. No other text
        Section content:
        {section}"""

        loop = asyncio.get_running_loop()
        executor = getattr(self, "_executor", None)
        retry_delay = 2

        for attempt in range(self.MAX_RATE_LIMIT_RETRIES + 1):
            await self.token_bucket.wait_for_tokens(estimated_tokens)
            try:
                # llm.invoke is synchronous; run it off the event loop so
                # sibling sections can make progress concurrently.
                response = await loop.run_in_executor(executor, self.llm.invoke, prompt)
            except Exception as e:
                if "rate_limit_exceeded" in str(e) and attempt < self.MAX_RATE_LIMIT_RETRIES:
                    print("Rate limit exceeded, waiting before retry...")
                    await asyncio.sleep(retry_delay)
                    retry_delay = min(retry_delay * 2, 60)  # exponential backoff
                    continue
                print(f"Error in LLM call: {str(e)}")
                return self._get_empty_features()

            features = self._parse_llm_json(response.content)
            if features is None:
                print(f"Failed to parse JSON response: {str(response.content)[:200]}...")
                return self._get_empty_features()
            return features

        return self._get_empty_features()

    async def _process_section(self, section_tuple: tuple) -> Dict:
        """Return features for one ``(heading, body)`` section, using the cache.

        Sections whose body is shorter than 50 characters are skipped and
        yield empty features. Only non-empty results are cached, so a failed
        LLM call is retried on the next run instead of being remembered.
        """
        header, content = section_tuple
        if len(content.strip()) < 50:
            return self._get_empty_features()

        section_hash = self._get_section_hash(content)

        # Try to load from cache first; ignore all-empty entries.
        cached_features = self._load_from_cache(section_hash)
        if cached_features and any(cached_features.values()):
            return cached_features

        features = await self._extract_section_features(content)
        if any(features.values()):
            self._save_to_cache(section_hash, features)

        print(f"Processed section: {header[:50]}...")
        return features

    def _merge_features(self, features_list: List[Dict]) -> Dict:
        """Merge per-section features into a single dict.

        For every expected key the longest non-empty value across all sections
        is kept. Missing keys and non-string values are tolerated.
        """
        merged = self._get_empty_features()

        for key in merged:
            candidates = [
                self._coerce_text(features.get(key))
                for features in (features_list or [])
                if isinstance(features, dict)
            ]
            candidates = [text for text in candidates if text]
            if candidates:
                merged[key] = max(candidates, key=len)

        return merged

    async def extract_paper_features(self, file_path: str) -> Dict[str, Any]:
        """Read a markdown paper and return its merged feature dict.

        The file is read as UTF-8, falling back to Latin-1 when it is not
        valid UTF-8. All sections are processed concurrently.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read()

        sections = self._split_by_sections(content)
        print(f"Found {len(sections)} sections in {file_path}")

        tasks = [self._process_section(section) for section in sections]
        features_list = await asyncio.gather(*tasks)

        merged_features = self._merge_features([f for f in features_list if any(f.values())])
        return merged_features

async def process_papers(papers_dir: str, output_dir: str, groq_api_key: str = None, max_workers: int = 3):
    """Extract features from every ``.md`` paper in ``papers_dir``.

    For each paper a ``<name>_features.yaml`` file is written to
    ``output_dir`` and ``paper_analysis_results.csv`` (in the current working
    directory) is rewritten with all papers processed so far, so partial
    progress survives an interruption. A failure on one paper is reported and
    does not stop the run.

    Args:
        papers_dir: Directory containing the markdown papers.
        output_dir: Directory for the per-paper YAML files (created if needed).
        groq_api_key: API key handed to ``PaperContentExtractor``.
        max_workers: Passed through to ``PaperContentExtractor``.

    Returns:
        A DataFrame with one row per successfully processed paper. It is empty
        when there are no markdown files or when every paper failed.

    Raises:
        ValueError: If ``groq_api_key`` is empty.
    """
    if not groq_api_key:
        raise ValueError("Please provide GROQ_API_KEY")

    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): "Paper Code" is never filled in by the extractor, so that
    # column is always empty in the result -- confirm whether it should be
    # derived from the file name.
    columns = [
        "Paper Code", "Topic of Research", "Research Objective", "Methodology",
        "Results", "Novelty Claims", "Evaluation Metrics",
        "Category of Research"
    ]

    # Sorted so the row order does not depend on os.listdir's arbitrary order.
    md_files = sorted(f for f in os.listdir(papers_dir) if f.endswith('.md'))

    if not md_files:
        print(f"No markdown files found in {papers_dir}")
        return pd.DataFrame(columns=columns)

    extractor = PaperContentExtractor(groq_api_key, max_workers)

    result_columns = columns + ['filename']
    records = []
    # Defined up front so the function still returns a frame if every paper fails.
    df_paper = pd.DataFrame(columns=result_columns)

    for filename in tqdm(md_files, desc="Processing papers"):
        paper_path = os.path.join(papers_dir, filename)
        output_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_features.yaml")

        try:
            features = await extractor.extract_paper_features(paper_path)

            with open(output_path, 'w') as f:
                yaml.dump(features, f)

            # Record features and file name together, only after the paper was
            # fully handled, so the two can never get out of step.
            records.append({**features, 'filename': filename})
            df_paper = pd.DataFrame(records, columns=result_columns)
            df_paper.to_csv('paper_analysis_results.csv', index=False)

            print(f"Successfully processed {filename}")
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

    return df_paper

In [None]:
PAPERS_DIR = "Parsed_Docs/Papers"  # Directory containing markdown papers
OUTPUT_DIR = "target"  # Directory for output features
df = await process_papers(
    PAPERS_DIR, 
    OUTPUT_DIR, 
    groq_api_key="<GROQ_API_KEY>"
)

Processing papers: 100%|██████████| 7/7 [00:00<00:00, 106.52it/s]

Found sections: ['Abstract', '1 Introduction', '2 Related Work', '3 Methodology', '1. Introduction', '4. Experiments', '4.1 Comparing Different Basis Expansions', '4. Key Takeaways', '4.2 The Necessity of Ensembles of Dynamic Ensembles', 'Key Takeaway', '4.3 E-DOEBE Outperforms Other Methods', 'Key Takeaway', '5 Conclusion', '6 Tables']
Found 14 sections in /Users/avinandan/Desktop/KDSH Task 1/P009_parsed.md
Successfully processed P009_parsed.md
Found sections: ['Abstract', '1 Introduction', '2 Related Work', 'Theoretical Models of Entropy', 'Methodology', 'Research Findings on Entropy and Complex Systems', 'Research Insights on Entropy and Its Applications', 'Experiments', 'Research Journey', '5 Results', 'Table 3: Entropy levels in various systems', 'Conclusion']
Found 12 sections in /Users/avinandan/Desktop/KDSH Task 1/P119_parsed.md
Successfully processed P119_parsed.md
Found sections: ['Abstract', '1 Introduction', '2 Related Work', '3 Baseline Parsing Pipeline', '3.1 Segmentation


