In [23]:
# ============================================================================
# ABSTRA Framework: Automated Scientific Discovery Pipeline
# Phase 1-4 Implementation (Literature Processing → Hypothesis Refinement)
# ============================================================================

In [24]:
!pip install captum==0.7.0 transformers accelerate -q

In [25]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import pandas as pd
from captum.attr import FeatureAblation, ShapleyValueSampling, LLMAttribution, TextTemplateInput
import nltk
import json
import gc
import os
import time
import re
from tqdm import tqdm
import logging
from google.colab import drive

In [26]:
# Cell 3: Mount Drive and Setup
# Mount Google Drive at /content/drive; the input CSV and the output
# directory configured in Config both live under this mount point.
drive.mount('/content/drive')
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
# `device` is read as a module-level global by the model helpers below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Download the NLTK 'punkt' resource quietly (presumably the data that
# nltk.sent_tokenize relies on in the segmentation step).
nltk.download('punkt', quiet=True)
print(f"Using device: {device}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda


In [27]:
# Cell 4: Configuration
class Config:
    """Single home for the tunable settings of the ABSTRA pipeline."""

    # --- Locations on the mounted Google Drive ---------------------------
    # Input CSV; main() reads it with pandas.
    CSV_PATH = "/content/drive/My Drive/llm_hyp/Data/raw_data_2.csv"
    # Directory that receives the log file, the JSON dump and the result CSV.
    OUTPUT_DIR = "/content/drive/My Drive/llm_hyp/results/pre_final"

    # --- Model -----------------------------------------------------------
    # Hugging Face identifier used as the default by load_model_and_tokenizer.
    MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    # --- Sizing knobs ----------------------------------------------------
    # NOTE(review): neither value is referenced in the visible part of the
    # notebook; confirm whether later cells use them.
    NUM_HYPOTHESES = 3
    BATCH_SIZE = 16

In [28]:
# Cell 5: Utility Functions
def setup_logging(output_dir):
    """Configure root logging to write to a file and to the console.

    Creates ``output_dir`` if needed, replaces every handler currently
    attached to the root logger, and installs a fresh file handler
    (``abstra_log.txt``, truncated on each call) plus a stream handler.

    Args:
        output_dir: Directory in which ``abstra_log.txt`` is created.

    Returns:
        The logger named after this module, already configured through the
        root logger.
    """
    os.makedirs(output_dir, exist_ok=True)
    log_path = os.path.join(output_dir, 'abstra_log.txt')

    # Detach AND close the existing handlers. Removing a handler without
    # closing it leaves its log file open, so every re-run of this cell
    # would leak one more file handle.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
        handler.close()

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            # Explicit UTF-8 so non-ASCII characters in logged titles do not
            # depend on the platform's default encoding.
            logging.FileHandler(log_path, mode='w', encoding='utf-8'),
            logging.StreamHandler()
        ]
    )
    logger = logging.getLogger(__name__)
    logger.info(f"Logging to {log_path}")
    return logger

def clear_memory():
    """Release Python garbage and, when CUDA is available, cached GPU memory.

    The garbage collector runs first so that tensors which are only kept
    alive by reference cycles are freed before the CUDA caching allocator is
    asked to hand its unused blocks back; emptying the cache first would
    leave those blocks cached.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


In [29]:
# Cell 6: Model Management
def load_model_and_tokenizer(model_name=Config.MODEL_NAME):
    """Load a causal language model and its tokenizer onto the global device.

    The tokenizer is configured to pad with the EOS token on the left.
    The model is moved to ``device`` and switched to eval mode.

    Args:
        model_name: Hugging Face model identifier; defaults to
            ``Config.MODEL_NAME``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'

    # Half precision is only a safe default on the GPU. When the notebook
    # falls back to the CPU, load in float32 instead of forcing float16.
    model_dtype = torch.float16 if device.type == 'cuda' else torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=model_dtype,
        low_cpu_mem_usage=True
    ).to(device)
    model.eval()
    print("Model loaded successfully")
    return model, tokenizer

def generate_response(prompt, model, tokenizer, max_length=3000):
    """Generate a sampled completion for ``prompt`` using the chat template.

    The prompt is wrapped as a single user message, rendered with the
    tokenizer's chat template and passed to ``model.generate``. The full
    decoded sequence (``outputs[0]``) is returned, so callers are
    responsible for separating the echoed prompt from the completion.

    Args:
        prompt: Text of the user message.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The decoded text, or an empty string if anything raised (the error
        is logged rather than propagated).
    """
    try:
        messages = [{"role": "user", "content": prompt}]
        # add_generation_prompt=True appends the assistant-turn header so the
        # model answers the user message instead of having to open the
        # assistant turn on its own.
        chat = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = tokenizer(chat, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                temperature=0.7,
                do_sample=True,
                repetition_penalty=1.2,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True
            )

        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Best effort by design: a failed generation yields "" so the
        # pipeline can continue with the next prompt.
        logging.error(f"Error generating response: {str(e)}")
        return ""
    finally:
        # Free memory on the failure path as well as on success.
        clear_memory()


In [30]:
# Cell 7: Phase 1 - Segmentation
def segment_abstract(abstract):
    """Split an abstract into five positional sections.

    Sentences are assigned purely by their relative position in the
    abstract (first fifth -> background, second -> objective, and so on).
    Abstracts with three sentences or fewer are placed entirely under
    ``'methods'``.

    Args:
        abstract: The abstract text. Anything that is not a non-blank string
            (e.g. ``None`` or a NaN from an empty CSV cell) yields empty
            sections instead of raising.

    Returns:
        Dict with keys ``background``, ``objective``, ``methods``,
        ``results`` and ``conclusion``, each mapping to a list of sentences.
    """
    features = {
        'background': [],
        'objective': [],
        'methods': [],
        'results': [],
        'conclusion': []
    }

    # Missing CSV cells arrive as None/NaN; there is nothing to segment.
    if not isinstance(abstract, str) or not abstract.strip():
        return features

    try:
        sentences = nltk.sent_tokenize(abstract)
    except Exception:
        # Tokenizer unavailable (e.g. missing NLTK data): fall back to a
        # naive split on full stops. `Exception` rather than a bare except
        # so KeyboardInterrupt/SystemExit are not swallowed.
        sentences = [s.strip() for s in abstract.split('.') if s.strip()]

    if len(sentences) <= 3:
        features['methods'] = sentences
    else:
        for i, sentence in enumerate(sentences):
            position = i / len(sentences)
            if position < 0.2:
                features['background'].append(sentence)
            elif position < 0.4:
                features['objective'].append(sentence)
            elif position < 0.6:
                features['methods'].append(sentence)
            elif position < 0.8:
                features['results'].append(sentence)
            else:
                features['conclusion'].append(sentence)

    return features

In [31]:
# Cell 8: Phase 2 - Hypothesis Generation
def generate_hypotheses(abstract, title, model, tokenizer):
    """Ask the model for the paper's hypothesis using three prompt phrasings.

    Each phrasing is sent through ``generate_response``. When the returned
    text contains the ``<|assistant|>`` marker, only the stripped segment
    following the first marker (up to the next marker, if any) is kept;
    otherwise the text is stored unchanged.

    Returns:
        List of dicts with ``hypothesis_id`` (1-based) and
        ``hypothesis_text``, one per prompt phrasing.
    """
    assistant_marker = "<|assistant|>"
    prompt_variants = [
        f"Read this scientific paper abstract and identify its main hypothesis.\n\nTitle: {title}\nAbstract: {abstract}\n\nWhat is the main hypothesis?",
        f"Based on this abstract titled '{title}', what is the central hypothesis being tested?\n\nAbstract: {abstract}",
        f"Scientific Abstract: {abstract}\nTitle: {title}\n\nExtract the primary research hypothesis. Be specific."
    ]

    collected = []
    for number, variant in enumerate(prompt_variants, start=1):
        raw_text = generate_response(variant, model, tokenizer)
        if assistant_marker in raw_text:
            # Piece between the first and second marker (or the end).
            raw_text = raw_text.split(assistant_marker, 2)[1].strip()
        collected.append({'hypothesis_id': number, 'hypothesis_text': raw_text})

    return collected


In [32]:
# Cell 9: Phase 3 - Feature Ablation
def compute_feature_ablation(features, hypothesis, model, tokenizer):
    """Score each abstract section against a hypothesis via feature ablation.

    All section sentences are flattened into one template input (one ``{}``
    placeholder per sentence) and attributed with Captum's
    ``FeatureAblation`` wrapped in ``LLMAttribution``. A section's score is
    the mean of ``seq_attr`` over its sentences, or 0.0 for an empty section.

    Returns:
        Dict mapping each of the five section names to a float. All zeros
        are returned when there are no sentences or when attribution raises
        (the error is logged).
    """
    section_names = ('background', 'objective', 'methods', 'results', 'conclusion')

    # Flatten the sentences and remember each section's [begin, finish) slice.
    flat_sentences = []
    spans = {}
    for name in section_names:
        begin = len(flat_sentences)
        flat_sentences.extend(features[name])
        spans[name] = (begin, len(flat_sentences))

    if len(flat_sentences) == 0:
        return dict.fromkeys(section_names, 0.0)

    try:
        placeholder_template = " ".join("{}" for _ in flat_sentences)
        template_input = TextTemplateInput(placeholder_template, values=flat_sentences)

        ablation = FeatureAblation(model)
        attributor = LLMAttribution(ablation, tokenizer)

        with torch.amp.autocast('cuda'):
            attribution = attributor.attribute(
                template_input,
                target=hypothesis,
                skip_tokens=torch.tensor([1])
            )

        scores = {}
        for name, (begin, finish) in spans.items():
            if finish > begin:
                scores[name] = attribution.seq_attr[begin:finish].mean().item()
            else:
                scores[name] = 0.0

        clear_memory()
        return scores
    except Exception as e:
        logging.error(f"FA error: {str(e)}")
        return dict.fromkeys(section_names, 0.0)


In [33]:
# Cell 10: Phase 3 - Shapley Values
def compute_shapley_values(features, hypothesis, model, tokenizer):
    """Score each abstract section against a hypothesis via sampled Shapley values.

    All section sentences are flattened into one template input (one ``{}``
    placeholder per sentence) and attributed with Captum's
    ``ShapleyValueSampling`` (``n_samples=10``) wrapped in
    ``LLMAttribution``. A section's score is the mean of ``seq_attr`` over
    its sentences, or 0.0 for an empty section.

    Returns:
        Dict mapping each of the five section names to a float. All zeros
        are returned when there are no sentences or when attribution raises
        (the error is logged).
    """
    section_names = ('background', 'objective', 'methods', 'results', 'conclusion')

    # Flatten the sentences and remember each section's [begin, finish) slice.
    flat_sentences = []
    spans = {}
    for name in section_names:
        begin = len(flat_sentences)
        flat_sentences.extend(features[name])
        spans[name] = (begin, len(flat_sentences))

    if len(flat_sentences) == 0:
        return dict.fromkeys(section_names, 0.0)

    try:
        placeholder_template = " ".join("{}" for _ in flat_sentences)
        template_input = TextTemplateInput(placeholder_template, values=flat_sentences)

        sampler = ShapleyValueSampling(model)
        attributor = LLMAttribution(sampler, tokenizer)

        with torch.amp.autocast('cuda'):
            attribution = attributor.attribute(
                template_input,
                target=hypothesis,
                n_samples=10
            )

        scores = {}
        for name, (begin, finish) in spans.items():
            if finish > begin:
                scores[name] = attribution.seq_attr[begin:finish].mean().item()
            else:
                scores[name] = 0.0

        clear_memory()
        return scores
    except Exception as e:
        logging.error(f"Shapley error: {str(e)}")
        return dict.fromkeys(section_names, 0.0)

In [34]:
# Cell 11: Phase 3 - Self-Evaluation
def self_evaluate_hypothesis(title, abstract, hypothesis, model, tokenizer):
    """Ask the model to grade a hypothesis and parse the score it reports.

    Args:
        title: Paper title inserted into the prompt.
        abstract: Paper abstract inserted into the prompt.
        hypothesis: Hypothesis text to be graded.
        model, tokenizer: Forwarded to ``generate_response``.

    Returns:
        ``(score, response)`` where ``score`` is a float in [0, 1] rounded
        to one decimal and ``response`` is the full text returned by
        ``generate_response``. The score is taken from a
        ``FINAL SCORE: x`` statement if present (clamped to [0, 1]),
        otherwise from the first decimal number in [0, 1], otherwise 0.5.
    """
    prompt = f"""<|system|>
You are an expert scientific evaluator. Assess how well this hypothesis represents the paper abstract.

<|user|>
PAPER TITLE: {title}

ABSTRACT:
{abstract}

HYPOTHESIS:
{hypothesis}

Evaluate this hypothesis on a scale of 0.0 to 1.0 based on:
1. Clarity and specificity
2. Alignment with abstract content
3. Scientific validity
4. Testability

Provide your score as "FINAL SCORE: X.X" (one decimal place).

<|assistant|>
"""

    response = generate_response(prompt, model, tokenizer, max_length=1024)

    # The returned text echoes the prompt, which itself contains numbers
    # ("0.0 to 1.0", plus anything in the abstract or hypothesis). Parse only
    # what follows the prompt's own assistant marker so the fallback below
    # cannot pick up "0.0" from the instructions. If the marker is missing,
    # fall back to the whole response.
    _, marker, completion = response.partition("<|assistant|>")
    generated = completion if marker else response

    # Preferred: the explicitly requested "FINAL SCORE: x" statement.
    match = re.search(r"FINAL SCORE:\s*(\d+\.?\d*)", generated, re.IGNORECASE)
    if match:
        score = float(match.group(1))
        return round(min(max(score, 0), 1), 1), response

    # Fallback: first decimal number in the valid range.
    for candidate in re.findall(r"(\d+\.\d+)", generated):
        num = float(candidate)
        if 0 <= num <= 1:
            return round(num, 1), response

    # Nothing parseable: neutral default.
    return 0.5, response


In [35]:
# Cell 12: Main Processing
def process_single_abstract(row, model, tokenizer, logger):
    """Run segmentation, hypothesis generation and scoring for one CSV row.

    Args:
        row: Mapping with ``'Title'`` and ``'Abstract'`` entries.
        model, tokenizer: Forwarded to the generation/attribution helpers.
        logger: Object with an ``info`` method used for progress messages.

    Returns:
        Dict with ``title``, ``abstract``, ``hypotheses``,
        ``attribution_results`` (one entry per hypothesis holding the FA
        scores, Shapley scores and self-evaluation) and ``features``.
    """
    paper_title, paper_abstract = row['Title'], row['Abstract']
    logger.info(f"Processing: {paper_title}")

    record = {
        'title': paper_title,
        'abstract': paper_abstract,
        'hypotheses': [],
        'attribution_results': []
    }

    # Phase 1: positional segmentation of the abstract.
    sections = segment_abstract(paper_abstract)
    record['features'] = sections
    sentence_total = sum(len(sentences) for sentences in sections.values())
    logger.info(f"  Segmented into {sentence_total} sentences")

    # Phase 2: candidate hypotheses.
    candidates = generate_hypotheses(paper_abstract, paper_title, model, tokenizer)
    record['hypotheses'] = candidates
    logger.info(f"  Generated {len(candidates)} hypotheses")

    # Phase 3: attribution scores and self-evaluation for every candidate.
    for candidate in candidates:
        candidate_id = candidate['hypothesis_id']
        candidate_text = candidate['hypothesis_text']
        logger.info(f"  Processing hypothesis {candidate_id}")

        ablation_scores = compute_feature_ablation(sections, candidate_text, model, tokenizer)
        shap_scores = compute_shapley_values(sections, candidate_text, model, tokenizer)
        graded_score, graded_text = self_evaluate_hypothesis(
            paper_title, paper_abstract, candidate_text, model, tokenizer
        )

        record['attribution_results'].append({
            'hypothesis_id': candidate_id,
            'fa_scores': ablation_scores,
            'shapley_scores': shap_scores,
            'self_eval_score': graded_score,
            'self_eval_text': graded_text
        })

        clear_memory()

    return record

In [36]:
# Cell 13: Create Output CSV
def create_output_csv(results, output_path):
    """Flatten pipeline results into one row per hypothesis and save as CSV.

    Every row carries the paper title/abstract, the hypothesis, its
    self-evaluation, the five segmented sections joined with spaces, and one
    ``fa_<section>`` / ``shapley_<section>`` column per score.

    Args:
        results: List of dicts as produced by ``process_single_abstract``.
        output_path: Destination of the CSV file (written without index).

    Returns:
        The DataFrame that was written.
    """
    section_names = ('background', 'objective', 'methods', 'results', 'conclusion')
    records = []

    for paper in results:
        paper_title = paper['title']
        paper_abstract = paper['abstract']
        sections = paper['features']

        for hypothesis in paper['hypotheses']:
            hyp_id = hypothesis['hypothesis_id']
            hyp_text = hypothesis['hypothesis_text']

            # First attribution entry recorded for this hypothesis id.
            matched = next(filter(
                lambda entry: entry['hypothesis_id'] == hyp_id,
                paper['attribution_results']
            ))

            record = {
                'title': paper_title,
                'abstract': paper_abstract,
                'hypothesis_id': hyp_id,
                'hypothesis': hyp_text,
                'model_self_evaluated_score': matched['self_eval_score'],
                'model_response': matched['self_eval_text'],
            }
            # Segmented abstract text, one column per section.
            record.update(
                (f'abstract_{name}', ' '.join(sections[name])) for name in section_names
            )
            # Attribution scores, prefixed by the method that produced them.
            record.update(
                (f'fa_{name}', value) for name, value in matched['fa_scores'].items()
            )
            record.update(
                (f'shapley_{name}', value) for name, value in matched['shapley_scores'].items()
            )

            records.append(record)

    frame = pd.DataFrame(records)
    frame.to_csv(output_path, index=False)
    return frame


In [37]:
# Cell 14: Main Pipeline
def main():
    """Run the ABSTRA pipeline end to end.

    Reads the abstracts CSV named in ``Config``, loads the model, processes
    every row (a failing row is logged with its traceback and skipped),
    then writes ``complete_results.json`` and ``abstra_results.csv`` into
    ``Config.OUTPUT_DIR``.

    Returns:
        The output DataFrame, or ``None`` when the CSV cannot be loaded or
        no row was processed successfully.
    """
    import traceback

    logger = setup_logging(Config.OUTPUT_DIR)
    banner = "="*50
    for line in (banner, "Starting ABSTRA Pipeline", banner):
        logger.info(line)

    # Load the input abstracts.
    logger.info(f"Loading CSV from: {Config.CSV_PATH}")
    try:
        df = pd.read_csv(Config.CSV_PATH)
        logger.info(f"Loaded {len(df)} abstracts")
        logger.info(f"Columns: {list(df.columns)}")
    except Exception as e:
        logger.error(f"Failed to load CSV: {str(e)}")
        return None

    model, tokenizer = load_model_and_tokenizer()

    # Process row by row; one bad abstract must not abort the whole run.
    results = []
    for idx, row in df.iterrows():
        try:
            logger.info(f"\n--- Processing abstract {idx+1}/{len(df)} ---")
            results.append(process_single_abstract(row, model, tokenizer, logger))
        except Exception as e:
            logger.error(f"Error processing row {idx}: {str(e)}")
            logger.error(traceback.format_exc())

    if len(results) == 0:
        logger.error("No results generated!")
        return None

    # Persist the raw nested results.
    json_path = os.path.join(Config.OUTPUT_DIR, 'complete_results.json')
    with open(json_path, 'w') as f:
        json.dump(results, f, indent=2)
    logger.info(f"Saved JSON to {json_path}")

    # Persist the flattened per-hypothesis table.
    csv_path = os.path.join(Config.OUTPUT_DIR, 'abstra_results.csv')
    output_df = create_output_csv(results, csv_path)
    logger.info(f"Saved CSV to {csv_path}")
    logger.info(f"CSV shape: {output_df.shape}")

    for line in (banner, "Pipeline Complete!", banner):
        logger.info(line)

    return output_df


In [38]:
# Cell 15: Execute
if __name__ == "__main__":
    # Run the full pipeline. main() returns None when the input CSV cannot
    # be loaded or when no abstract was processed successfully.
    results_df = main()
    if results_df is not None:
        # Print a short preview of the per-hypothesis table together with
        # its row count and column names.
        print("\n" + "="*50)
        print("SUCCESS! Sample output:")
        print("="*50)
        print(results_df.head())
        print(f"\nTotal rows: {len(results_df)}")
        print(f"Columns: {list(results_df.columns)}")

2025-09-22 01:53:37,402 - INFO - Logging to /content/drive/My Drive/llm_hyp/results/pre_final/abstra_log.txt
2025-09-22 01:53:37,411 - INFO - Starting ABSTRA Pipeline
2025-09-22 01:53:37,413 - INFO - Loading CSV from: /content/drive/My Drive/llm_hyp/Data/raw_data_2.csv
2025-09-22 01:53:37,419 - INFO - Loaded 2 abstracts
2025-09-22 01:53:37,420 - INFO - Columns: ['Title', 'Abstract']


Loading model: TinyLlama/TinyLlama-1.1B-Chat-v1.0


2025-09-22 01:53:45,502 - INFO - 
--- Processing abstract 1/2 ---
2025-09-22 01:53:45,503 - INFO - Processing: Automating the practice of science: Opportunities, challenges, and implications
2025-09-22 01:53:45,507 - INFO -   Segmented into 4 sentences


Model loaded successfully


2025-09-22 01:54:28,644 - INFO -   Generated 3 hypotheses
2025-09-22 01:54:28,646 - INFO -   Processing hypothesis 1
2025-09-22 01:59:14,509 - INFO -   Processing hypothesis 2
2025-09-22 02:08:57,745 - INFO -   Processing hypothesis 3
2025-09-22 02:16:58,682 - INFO - 
--- Processing abstract 2/2 ---
2025-09-22 02:16:58,683 - INFO - Processing: Random Forests for Heteroscedastic Data
2025-09-22 02:16:58,686 - INFO -   Segmented into 8 sentences
2025-09-22 02:17:17,221 - INFO -   Generated 3 hypotheses
2025-09-22 02:17:17,222 - INFO -   Processing hypothesis 1
2025-09-22 02:30:01,847 - INFO -   Processing hypothesis 2
2025-09-22 02:41:02,724 - INFO -   Processing hypothesis 3
2025-09-22 02:55:37,991 - INFO - Saved JSON to /content/drive/My Drive/llm_hyp/results/pre_final/complete_results.json
2025-09-22 02:55:38,019 - INFO - Saved CSV to /content/drive/My Drive/llm_hyp/results/pre_final/abstra_results.csv
2025-09-22 02:55:38,021 - INFO - CSV shape: (6, 21)
2025-09-22 02:55:38,022 - INFO 


SUCCESS! Sample output:
                                               title  \
0  Automating the practice of science: Opportunit...   
1  Automating the practice of science: Opportunit...   
2  Automating the practice of science: Opportunit...   
3            Random Forests for Heteroscedastic Data   
4            Random Forests for Heteroscedastic Data   

                                            abstract  hypothesis_id  \
0  Automation transformed various aspects of our ...              1   
1  Automation transformed various aspects of our ...              2   
2  Automation transformed various aspects of our ...              3   
3  Random forests are a popular machine learning ...              1   
4  Random forests are a popular machine learning ...              2   

                                          hypothesis  \
0  The main hypothesis of the scientific paper is...   
1  The central hypothesis of this abstract is tha...   
2  The potential impact of automation on sc