<a href="https://colab.research.google.com/github/Ashrith3456/-report-on-the-impact-of-Generative-AI-on-software-development-and-low-code-no-code-platforms.-/blob/main/AI_Literature_Review_Assistant_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import nltk, os

# Make sure NLTK has a place to store data. Register the directory on the
# search path only once so that re-running the cell does not add duplicates.
NLTK_DATA_DIR = "/root/nltk_data"
os.makedirs(NLTK_DATA_DIR, exist_ok=True)
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Download both tokenizer packages (some environments require punkt_tab too).
nltk.download("punkt")

# nltk.download() reports a failed or unknown package through its boolean
# return value rather than by raising, so the result has to be checked for the
# fallback message to ever be shown. The except clause keeps the step
# best-effort in case a particular NLTK build does raise.
try:
    have_punkt_tab = nltk.download("punkt_tab")
except Exception as exc:
    have_punkt_tab = False
    print(f"punkt_tab download raised {exc!r}")
if not have_punkt_tab:
    print("punkt_tab not available in this NLTK build; safe to ignore.")

print("✅ NLTK tokenizers ready")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


✅ NLTK tokenizers ready


In [7]:
# === FINAL ONE-CELL LITERATURE REVIEW ASSISTANT (no API needed) ===
# Install deps (and pin requests to avoid Colab conflicts)
with open("constraints.txt","w") as f: f.write("requests==2.32.4\n")
!pip -q install -c constraints.txt pypdf nltk sumy sentence-transformers

# Imports & setup
import os, re
from collections import Counter
from google.colab import files
import nltk
# Fetch the NLTK tokenizer data. Some environments require punkt_tab in
# addition to punkt, so request both here to keep this cell usable on its own.
# nltk.download() returns False instead of raising when a package is missing,
# so the extra call is harmless on builds that do not ship punkt_tab.
nltk.download("punkt")
nltk.download("punkt_tab")

# Working directories: uploaded PDFs go in pdfs/, the report in outputs/.
os.makedirs("pdfs", exist_ok=True)
os.makedirs("outputs", exist_ok=True)

# --- Ensure we have paper1.pdf & paper2.pdf ---
# The upload prompt is skipped when both PDFs already exist (e.g. from an
# earlier run of this cell).
have1 = os.path.exists("pdfs/paper1.pdf")
have2 = os.path.exists("pdfs/paper2.pdf")
if not (have1 and have2):
    print("📥 Upload two PDFs (any names). They’ll be saved as paper1.pdf & paper2.pdf")
    uploaded = files.upload()  # pick two files
    # Keep only the uploads whose name ends in .pdf (case-insensitive).
    picked = [n for n in uploaded if n.lower().endswith(".pdf")]
    # Validate with an explicit raise: an assert would be stripped under -O.
    if len(picked) < 2:
        raise ValueError("Please upload at least TWO PDF files.")
    # The uploaded names are used as paths relative to the working directory.
    # os.replace overwrites an existing destination on every platform.
    os.replace(picked[0], "pdfs/paper1.pdf")
    os.replace(picked[1], "pdfs/paper2.pdf")

# --- Helpers (extract, summarize, keywords, compare, gaps) ---
from pypdf import PdfReader
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

def extract_text(pdf_path, max_pages=12):
    """Return the text of the first ``max_pages`` pages of a PDF.

    Args:
        pdf_path: Path of the PDF file handed to ``PdfReader``.
        max_pages: Upper bound on the number of leading pages to read.

    Returns:
        The per-page texts joined with newlines. A page whose extraction
        yields nothing contributes an empty string.
    """
    leading_pages = PdfReader(pdf_path).pages[:max_pages]
    chunks = []
    for page in leading_pages:
        content = page.extract_text()
        chunks.append(content if content else "")
    return "\n".join(chunks)

def summarize_text(text, n_sentences=8):
    """Summarize ``text`` with sumy's LexRank summarizer.

    Args:
        text: Text to summarize; ``None`` is treated like an empty string.
        n_sentences: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences, one per line, or ``"No content found."``
        when ``text`` is empty or whitespace-only.
    """
    cleaned = (text or "").strip()
    if cleaned == "":
        return "No content found."
    parser = PlaintextParser.from_string(cleaned, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    selected = summarizer(parser.document, n_sentences)
    return "\n".join(str(sentence) for sentence in selected)

def extract_keywords(text, top_k=15):
    """Return the ``top_k`` most frequent non-stopword terms in ``text``.

    A term is a letter followed by at least two more letters or hyphens,
    lowercased before counting. Ties keep first-occurrence order.

    Args:
        text: Text to scan; ``None`` is treated like an empty string.
        top_k: Maximum number of terms to return.

    Returns:
        A list of terms ordered from most to least frequent.
    """
    stopwords = frozenset("""the a an and or of in on for to from with by as that this these those is are was were be being been it its
    at we you they he she him her their our your not into over under out more less most least about using use
    method methods approach approaches result results data model models paper study studies system systems""".split())
    tally = Counter()
    for match in re.finditer(r"[A-Za-z][A-Za-z\-]{2,}", text or ""):
        token = match.group(0).lower()
        if token in stopwords:
            continue
        tally[token] += 1
    return [term for term, _count in tally.most_common(top_k)]

def compare_summaries(sum1, sum2, max_tokens=30):
    """Compare two summaries by their lowercase, whitespace-separated tokens.

    Args:
        sum1: First summary; ``None`` is treated like an empty string.
        sum2: Second summary; ``None`` is treated like an empty string.
        max_tokens: Maximum number of tokens listed in each returned string.

    Returns:
        A tuple ``(overlap, only1, only2)`` of space-joined token strings:
        tokens present in both summaries, only in the first, and only in the
        second. Each string lists at most ``max_tokens`` tokens in sorted
        order.
    """
    tokens1 = set((sum1 or "").lower().split())
    tokens2 = set((sum2 or "").lower().split())

    def _sample(tokens):
        # Sort before truncating: set iteration order for strings varies
        # between interpreter runs, which would otherwise make the selected
        # tokens (and therefore the report) differ from run to run.
        return " ".join(sorted(tokens)[:max_tokens])

    overlap = _sample(tokens1 & tokens2)
    only1 = _sample(tokens1 - tokens2)
    only2 = _sample(tokens2 - tokens1)
    return overlap, only1, only2

def infer_gaps(*texts):
    """Return a fixed checklist of research gaps plus keyword focus cues.

    Args:
        *texts: Strings that are joined with newlines and passed to
            ``extract_keywords``.

    Returns:
        A multi-line string: five fixed bullet points followed by a
        "Focus cues" line listing up to six extracted keywords, or
        "the topic" when no keywords are found.
    """
    keywords = extract_keywords("\n".join(texts), 12)
    if keywords:
        focus = ", ".join(keywords[:6])
    else:
        focus = "the topic"
    return f"""- Limited standardized benchmarks & head-to-head comparisons
- Weak generalization across datasets/projects; need cross-domain evaluation
- Sparse error analysis & failure taxonomy
- Reproducibility issues (datasets, seeds, exact configs)
- Limited reporting on computational cost / flakiness
Focus cues: {focus}"""

# --- Process both PDFs ---
# Input locations of the two papers.
P1 = "pdfs/paper1.pdf"
P2 = "pdfs/paper2.pdf"

# Raw text extracted from each paper.
t1 = extract_text(P1)
t2 = extract_text(P2)

# Extractive summary of each paper.
s1 = summarize_text(t1)
s2 = summarize_text(t2)

# Keywords taken from the full extracted text, not from the summaries.
k1 = extract_keywords(t1)
k2 = extract_keywords(t2)

# Token comparison and gap heuristics are computed from the summaries.
overlap, u1, u2 = compare_summaries(s1, s2)
gaps = infer_gaps(s1, s2)

# --- Build report + download ---
# Assemble the Markdown report from the values computed above: paper paths,
# summaries, keyword lists, token comparison and gap heuristics.
report = f"""# Literature Review Assistant – Report

## Papers
- **Paper 1:** {P1}
- **Paper 2:** {P2}

---

## Summary – Paper 1
{s1}

**Keywords:** {", ".join(k1)}

---

## Summary – Paper 2
{s2}

**Keywords:** {", ".join(k2)}

---

## Comparison (token overlap heuristic)
- **Overlap (examples):** {overlap}
- **Unique to Paper 1:** {u1}
- **Unique to Paper 2:** {u2}

---

## Research Gaps (heuristic)
{gaps}
"""
out_path = "outputs/lit_review_report.md"
# Write as UTF-8 explicitly: the report contains non-ASCII punctuation.
with open(out_path, "w", encoding="utf-8") as f:
    f.write(report)

print("✅ Done. Downloading report…")
# Hand the written file to Colab's download helper.
files.download(out_path)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ Done. Downloading report…


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>