<a href="https://colab.research.google.com/github/Aishwarya-s-m/sumsung_project/blob/main/Research_paper3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:

# Install the web-app and tunnelling packages (streamlit, pyngrok, cloudflared)
# plus the libraries used for reading documents and extracting keywords
# (PyPDF2, python-docx, openpyxl, rake-nltk).
!pip install streamlit pyngrok cloudflared PyPDF2 python-docx openpyxl rake-nltk




In [5]:
# Install the summarisation, plotting and PDF-report packages (sumy, nltk,
# pandas, matplotlib, reportlab, requests). streamlit, cloudflared and
# rake-nltk repeat the previous install cell; transformers and PyMuPDF are not
# imported by any code visible in this notebook.
!pip install streamlit transformers PyMuPDF sumy nltk pandas matplotlib reportlab cloudflared rake-nltk requests




In [6]:
import nltk

# Newer NLTK releases look up the sentence-tokenizer data under "punkt_tab"
# rather than the pickled "punkt" package, so fetch both: whichever NLTK
# version is installed will then find the data it expects. "stopwords" is
# needed for keyword extraction.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
code = '''
import re
import pandas as pd
import PyPDF2
import docx
import os
import requests
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
import matplotlib.pyplot as plt
from rake_nltk import Rake
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet

# ---------- File Extractors ----------
def extract_from_pdf(file_path):
    """Return the text of every page of a PDF as one string.

    Words hyphenated across a line break are re-joined, remaining line
    breaks become spaces, and each page's text is followed by one space.
    Pages for which no text is extracted are skipped.
    """
    chunks = []
    with open(file_path, "rb") as pdf_handle:
        for pdf_page in PyPDF2.PdfReader(pdf_handle).pages:
            extracted = pdf_page.extract_text()
            if not extracted:
                continue
            # Undo end-of-line hyphenation before flattening the line breaks.
            dehyphenated = re.sub(r'(\\w+)-\\s*\\n(\\w+)', r'\\1\\2', extracted)
            chunks.append(re.sub(r'\\n', ' ', dehyphenated) + " ")
    return "".join(chunks)

def extract_from_docx(file_path):
    """Return the text of the document's paragraphs joined by single spaces."""
    paragraphs = docx.Document(file_path).paragraphs
    return " ".join(paragraph.text for paragraph in paragraphs)

def extract_from_excel(file_path):
    """Return every cell of a spreadsheet as one space-separated string.

    Reads whatever sheet pd.read_excel() loads by default and walks the
    cells row by row. Empty cells contribute an empty string.
    """
    df = pd.read_excel(file_path)
    # Blank the missing cells *after* the string conversion, using the
    # original frame as the mask. Converting first and calling fillna()
    # afterwards leaves the literal text "nan"/"None" in the output, because
    # by then the missing values have already been turned into strings.
    cells = df.astype(str).where(df.notna(), "")
    return " ".join(cells.values.flatten())

def extract_from_text(file_path):
    """Return the contents of a plain-text file.

    The file is decoded as UTF-8. A leading byte-order mark is dropped and
    bytes that are not valid UTF-8 become U+FFFD, so a file saved in another
    encoding still yields readable text instead of raising
    UnicodeDecodeError.
    """
    with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
        return f.read()

# ---------- Summarizer ----------
def summarize_file(file_path, sentence_count=50):
    """Extract a file's text and produce an LSA summary of it.

    The extractor is chosen from the (case-insensitive) file extension:
    .pdf, .docx, .xls/.xlsx or .txt.

    Returns:
        (summary, raw_text): the selected sentences joined by spaces with
        whitespace runs collapsed, and the unmodified extracted text.

    Raises:
        ValueError: if the extension is not one of the supported ones.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    # Reject unknown extensions first; the chain below then only has to
    # tell the supported ones apart.
    if suffix not in (".pdf", ".docx", ".xls", ".xlsx", ".txt"):
        raise ValueError("Unsupported file format!")

    if suffix == ".pdf":
        raw_text = extract_from_pdf(file_path)
    elif suffix == ".docx":
        raw_text = extract_from_docx(file_path)
    elif suffix == ".txt":
        raw_text = extract_from_text(file_path)
    else:
        raw_text = extract_from_excel(file_path)

    document = PlaintextParser.from_string(raw_text, Tokenizer("english")).document
    picked_sentences = LsaSummarizer()(document, sentence_count)

    joined = " ".join(map(str, picked_sentences))
    return re.sub(r'\\s+', ' ', joined), raw_text

# ---------- Keyword Extraction ----------
def extract_keywords(text, top_n=10):
    """Run RAKE over text and return the first top_n ranked phrases."""
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    ranked_phrases = extractor.get_ranked_phrases()
    return ranked_phrases[:top_n]

# ---------- Related Works (Semantic Scholar API) ----------
def get_related_papers(keywords, limit=5):
    """Search Semantic Scholar for papers matching the first three keywords.

    Args:
        keywords: Sequence of key phrases; only the first three are used.
        limit: Maximum number of papers to request.

    Returns:
        A list of Markdown-formatted strings, one per paper. On any failure
        (no keywords, non-200 status, network or parsing error) the list
        holds a single human-readable message instead; this function never
        raises.
    """
    if not keywords:
        return ["No keywords available to search."]
    # Let requests build the query string so that spaces, '&', '#', '+' etc.
    # inside the key phrases are percent-encoded instead of corrupting the URL.
    params = {
        "query": " ".join(keywords[:3]),
        "limit": limit,
        "fields": "title,authors,url",
    }
    try:
        response = requests.get(
            "https://api.semanticscholar.org/graph/v1/paper/search",
            params=params,
            timeout=10,
        )
        if response.status_code != 200:
            return [f"Error fetching related works (status {response.status_code})"]
        results = []
        # "or" fallbacks cover fields that are present but null in the JSON.
        for paper in response.json().get("data") or []:
            title = paper.get("title") or "No title"
            names = [a.get("name") or "Unknown" for a in paper.get("authors") or []]
            authors = ", ".join(names) if names else "Unknown"
            link = paper.get("url") or "No link"
            results.append(f"**{title}** — {authors} [Read here]({link})")
        return results if results else ["No related works found."]
    except Exception as e:
        # Deliberately best-effort: related works are optional in the UI.
        return [f"Error: {str(e)}"]

# ---------- Q&A ----------
def _split_words(value):
    """Lower-case value and return its alphanumeric words; anything else separates words."""
    return "".join(ch if ch.isalnum() else " " for ch in value.lower()).split()


def answer_question(question, text, keywords):
    """Answer a free-form question about the paper.

    Routing is by substring of the lower-cased question:
      * "conclusion" -> the conclusion excerpt of text;
      * "reference" / "related work" / "other paper" -> related papers
        looked up from keywords (a list of strings);
      * "graph" / "methodolog" -> the sentinel string "graph_request",
        which the caller turns into a plot;
      * anything else -> the sentence of text (split on ". ") that shares
        the most content words with the question, or an apology string when
        no sentence shares any.
    """
    q = question.lower()
    if "conclusion" in q:
        return extract_conclusion(text)
    if "reference" in q or "related work" in q or "other paper" in q:
        return get_related_papers(keywords)
    if "graph" in q or "methodolog" in q:
        return "graph_request"

    # Function words occur in nearly every sentence; matching on them would
    # make the very first sentence of the paper the answer to any question.
    filler = {
        "a", "an", "the", "is", "are", "was", "were", "be", "been", "do",
        "does", "did", "of", "in", "on", "at", "to", "for", "from", "by",
        "with", "about", "and", "or", "it", "this", "that", "these", "those",
        "what", "which", "who", "whom", "when", "where", "why", "how", "can",
        "could", "would", "should", "i", "you", "me", "we", "they", "paper",
    }
    asked = _split_words(q)
    # If the question consists only of filler words, use them rather than nothing.
    terms = {w for w in asked if w not in filler} or set(asked)

    best_sentence, best_score = None, 0
    for sentence in text.split(". "):
        # Whole-word overlap: "a" no longer matches every sentence containing the letter.
        score = len(terms.intersection(_split_words(sentence)))
        if score > best_score:  # strict '>' keeps the earliest sentence on ties
            best_sentence, best_score = sentence, score

    if best_sentence is None:
        return "Sorry, I could not find an exact answer. Please rephrase."
    return best_sentence

# ---------- Helpers ----------
def extract_conclusion(text):
    """Return up to 1500 characters starting at the first "Conclusion(s)" or
    "Discussion" (any letter case) found in text, or a fallback message when
    neither word occurs."""
    found = re.search(r'(Conclusion|Conclusions|Discussion)(.*)', text, re.IGNORECASE | re.DOTALL)
    if found is None:
        return "No explicit conclusion found."
    excerpt = found.group(0)
    return excerpt[:1500]

def extract_references(text):
    """Return up to 2000 characters starting at the first "References" or
    "Bibliography" (any letter case) found in text, or a fallback message
    when neither word occurs."""
    found = re.search(r'(References|Bibliography)(.*)', text, re.IGNORECASE | re.DOTALL)
    if found is None:
        return "No references found."
    excerpt = found.group(0)
    return excerpt[:2000]

def plot_methodologies(text):
    """Bar-chart how often each known method name occurs in text.

    Occurrences are case-sensitive substring counts. Returns the matplotlib
    figure, or None when none of the names occurs.
    """
    counts = {}
    for method in ("CNN", "RNN", "Transformer", "BERT", "SVM", "LSTM"):
        occurrences = text.count(method)
        if occurrences > 0:
            counts[method] = occurrences

    if not counts:
        return None

    fig, ax = plt.subplots()
    ax.bar(counts.keys(), counts.values())
    ax.set_title("Methodologies Frequency")
    return fig

# ---------- Report Generator ----------
def generate_pdf_report(summary, qa_list, keywords, graph_path=None, output_path="report.pdf"):
    """Build the PDF report and return the path it was written to.

    Args:
        summary: Text for the "Summary" section.
        qa_list: Iterable of (question, answer) pairs. An answer that is a
            list is rendered as one "- item" line per element; anything else
            is rendered as a single "A:" line.
        keywords: Iterable of key-phrase strings, rendered comma-separated.
        graph_path: Optional image path; embedded only if the file exists.
        output_path: Destination path of the PDF.

    Returns:
        output_path.
    """
    # Paragraph() parses its text as XML-like markup. Text taken from the
    # paper or typed by the user can contain '<', '>' or '&', which would be
    # misread as tags/entities, so escape it. Only our own <b> tags stay raw.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(output_path, pagesize=A4)
    styles = getSampleStyleSheet()
    story = []

    story.append(Paragraph("📄 Research Paper Report", styles["Title"]))
    story.append(Spacer(1, 12))
    story.append(Paragraph("🔹 Summary", styles["Heading2"]))
    story.append(Paragraph(escape(str(summary)), styles["Normal"]))
    story.append(Spacer(1, 12))

    story.append(Paragraph("🔹 Keywords", styles["Heading2"]))
    story.append(Paragraph(escape(", ".join(keywords)), styles["Normal"]))
    story.append(Spacer(1, 12))

    story.append(Paragraph("🔹 Q&A", styles["Heading2"]))
    for q, a in qa_list:
        story.append(Paragraph(f"<b>Q:</b> {escape(str(q))}", styles["Normal"]))
        if isinstance(a, list):
            for item in a:
                story.append(Paragraph(f"- {escape(str(item))}", styles["Normal"]))
        else:
            story.append(Paragraph(f"<b>A:</b> {escape(str(a))}", styles["Normal"]))
        story.append(Spacer(1, 6))

    if graph_path and os.path.exists(graph_path):
        story.append(Paragraph("🔹 Methodologies Graph", styles["Heading2"]))
        story.append(Image(graph_path, width=400, height=200))
        story.append(Spacer(1, 12))

    doc.build(story)
    return output_path
'''
# The module source contains non-ASCII characters (emoji, an em dash) and
# Python reads source files as UTF-8, so write it as UTF-8 explicitly rather
# than relying on the platform's default encoding.
with open("summarizer.py", "w", encoding="utf-8") as f:
    f.write(code)


In [8]:
app_code = '''
import os
import tempfile

import matplotlib.pyplot as plt
import streamlit as st

from summarizer import summarize_file, answer_question, plot_methodologies, generate_pdf_report, extract_keywords

st.set_page_config(page_title="AI Research Paper Summarizer + Q&A + Report Generator", layout="wide")

st.title("📄 AI Research Paper Summarizer")
st.write("Upload a research paper, get a summary, keywords, related works, Q&A, and a downloadable report.")

uploaded_file = st.file_uploader("Upload your file", type=["pdf","docx","txt","xls","xlsx"])

if uploaded_file is not None:
    # Streamlit re-runs this whole script on every interaction, so results
    # are kept in session_state. They are keyed on the upload so that
    # choosing a different file replaces the previous paper's results.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("file_key") != file_key:
        # Write the upload to a private temp file instead of a client-chosen
        # name in the working directory; keep only the extension, which is
        # what summarize_file() dispatches on.
        suffix = os.path.splitext(uploaded_file.name)[1].lower()
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(uploaded_file.getbuffer())
        try:
            summary, raw_text = summarize_file(tmp.name)
        finally:
            os.remove(tmp.name)

        st.session_state.summary = summary
        st.session_state.raw_text = raw_text
        st.session_state.keywords = extract_keywords(raw_text)
        st.session_state.qa_list = []
        # Drop anything that belonged to the previous paper.
        st.session_state.pop("graph_path", None)
        st.session_state.pop("last_question", None)
        st.session_state.pop("last_answer", None)
        st.session_state.file_key = file_key

    st.subheader("📌 Summary")
    st.write(st.session_state.summary)

    st.subheader("📌 Keywords")
    st.write(", ".join(st.session_state.keywords))

    question = st.text_input("Ask a question about the paper (e.g., conclusion):")
    if question:
        # The text box keeps its value across reruns (e.g. when the report
        # button is clicked). Only compute and record an answer when the
        # question actually changed; otherwise reuse the stored one.
        is_new_question = st.session_state.get("last_question") != question
        if is_new_question:
            st.session_state.last_answer = answer_question(
                question, st.session_state.raw_text, st.session_state.keywords
            )
            st.session_state.last_question = question
        answer = st.session_state.last_answer
        # What goes into the report; the "graph_request" sentinel is replaced below.
        recorded_answer = answer

        if answer == "graph_request":
            fig = plot_methodologies(st.session_state.raw_text)
            if fig:
                graph_path = "graph.png"
                fig.savefig(graph_path)
                # pyplot keeps every figure alive until it is closed.
                plt.close(fig)
                st.image(graph_path)
                st.session_state.graph_path = graph_path
                recorded_answer = "See the methodologies graph in this report."
            else:
                st.write("No methodologies detected.")
                recorded_answer = "No methodologies detected."
        elif isinstance(answer, list):
            st.subheader("🔗 Related Works")
            for item in answer:
                st.markdown(item)
        else:
            st.subheader("Answer")
            st.write(answer)

        if is_new_question:
            st.session_state.qa_list.append((question, recorded_answer))

    if st.button("📥 Generate PDF Report"):
        report_path = generate_pdf_report(
            st.session_state.summary,
            st.session_state.qa_list,
            st.session_state.keywords,
            graph_path=st.session_state.get("graph_path", None)
        )
        # Read the bytes while the file is open; the button outlives the handle.
        with open(report_path, "rb") as f:
            report_bytes = f.read()
        st.download_button(
            "⬇️ Download Report",
            report_bytes,
            file_name="research_report.pdf",
            mime="application/pdf",
        )
'''
# The app source contains emoji and Python reads source files as UTF-8, so
# write it as UTF-8 explicitly rather than relying on the platform default.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)


In [10]:
# Download the latest cloudflared Linux amd64 release binary into the working
# directory (saved as ./cloudflared) and make it executable.
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!chmod +x cloudflared

# Start Streamlit on port 8501 in the background, wait 5 seconds, then open a
# Cloudflare tunnel to it. The tunnel command runs in the foreground, so this
# cell keeps running (and streaming logs) until it is interrupted.
!streamlit run app.py --server.port 8501 & sleep 5 && ./cloudflared tunnel --url http://localhost:8501 --no-autoupdate



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://104.196.53.137:8501[0m
[0m
[90m2025-09-09T05:15:39Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2025-09-09T05:15:39Z[0m [32mINF[