<a href="https://colab.research.google.com/github/Abdodusoky/Company/blob/main/Nasa_App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# space_bio_knowledge_engine_app.py
"""
Streamlit app: Build a Space Biology Knowledge Engine (FAISS replaced with sklearn cosine similarity)
- Loads dataset: /mnt/data/SB_publication_PMC.csv (assumed to be PubMed Central-style CSV)
- Preprocesses text (title, abstract, body)
- Builds embeddings with sentence-transformers
- Provides: search, chatbot (RAG), extractive summarizer, abstractive summarizer (OpenAI or local HF), interactive graphs (Plotly), topic modeling (TF-IDF + NMF)

How to run:
1. Create a virtualenv and install requirements:
   python -m venv venv
   source venv/bin/activate   # or venv\\Scripts\\activate on Windows
   pip install -r requirements.txt

2. Example requirements.txt:
   streamlit
   pandas
   numpy
   scikit-learn
   sentence-transformers
   plotly
   nltk
   openai
   transformers
   tiktoken

3. Run:
   streamlit run space_bio_knowledge_engine_app.py

Notes:
- If you want to use OpenAI for generation/summarization/chat, set OPENAI_API_KEY in environment variables.
- If OpenAI is not available, the app will attempt to use local HuggingFace transformer models (may require large downloads).

This file is intentionally self-contained for a hackathon MVP. Replace models or tweak parameters for production.
"""

import os
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sentence_transformers import SentenceTransformer
import plotly.express as px
import nltk
import re
from sklearn.metrics.pairwise import cosine_similarity
import textwrap

# Optional imports for generation
try:
    import openai
except Exception:
    openai = None

try:
    from transformers import pipeline
except Exception:
    pipeline = None

# Fetch the NLTK 'punkt' package at import time; this runs on every execution
# of the script.
# NOTE(review): no NLTK tokenizer call is visible in this file -- confirm the
# download is still required.
nltk.download('punkt')

# ---------------------------
# Configuration / Constants
# ---------------------------
DATA_PATH = '/mnt/data/SB_publication_PMC.csv'  # user-provided CSV path
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'  # sentence-transformers
# NOTE(review): EMBEDDING_DIM is not referenced anywhere in the visible source.
EMBEDDING_DIM = 768  # mpnet dim
NUM_TOPICS = 8  # default number of NMF components used by make_topics()
TOP_K = 5  # default number of documents retrieved for the chatbot / extractive answers

# ---------------------------
# Utility functions
# ---------------------------

def load_data(path=DATA_PATH, nrows=None):
    """Read the publications CSV and report its size in the Streamlit page.

    Args:
        path: Location of the CSV file to read.
        nrows: Optional cap on the number of rows read (``None`` reads all).

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path, nrows=nrows)
    row_count = len(frame)
    st.write(f"Loaded dataset: {path} -- {row_count} rows")
    return frame


def preprocess_row(row):
    """Merge the descriptive fields of one record into a single text string.

    The fields ``title``, ``abstract``, ``body``, ``authors``, ``journal`` and
    ``year`` are taken in that order; fields that are absent from ``row`` or
    hold a null value are skipped.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) supporting ``in`` and
            item access by column name.

    Returns:
        The present fields joined together, with every run of whitespace
        collapsed to a single space.
    """
    wanted_fields = ('title', 'abstract', 'body', 'authors', 'journal', 'year')
    present = [
        str(row[name])
        for name in wanted_fields
        if name in row and pd.notnull(row[name])
    ]
    # The join character is itself whitespace, so it is normalised to a space too.
    return re.sub(r"\s+", ' ', '\n'.join(present))


def prepare_corpus(df, text_col='combined_text'):
    """Return one text column of ``df`` as a plain list of strings.

    Args:
        df: DataFrame holding the documents.
        text_col: Name of the column to extract.

    Returns:
        A list with one string per row; missing values become ``''``.
    """
    column = df[text_col]
    without_nulls = column.fillna('')
    as_strings = without_nulls.astype(str)
    return as_strings.tolist()

# ---------------------------
# Embeddings
# ---------------------------
@st.cache_resource
def load_embedding_model(name=EMBEDDING_MODEL_NAME):
    """Instantiate the sentence-transformers model called ``name``.

    Decorated with ``st.cache_resource`` so the model object is cached by
    Streamlit rather than rebuilt on each call with the same name.
    """
    return SentenceTransformer(name)

@st.cache_resource
def _encode_corpus_cached(texts, _model):
    """Encode ``texts`` with ``_model``; cached by Streamlit on ``texts`` only.

    The leading underscore on ``_model`` tells Streamlit not to hash that
    argument when building the cache key.
    """
    return _model.encode(texts, convert_to_numpy=True, show_progress_bar=True)


def build_embeddings(texts, model):
    """Compute embeddings for ``texts`` using ``model``.

    The work is delegated to a cached helper whose model parameter is excluded
    from Streamlit's argument hashing: hashing a whole transformer model to
    build a cache key is fragile and expensive. The cache is therefore keyed
    on ``texts`` alone.

    Args:
        texts: List of document strings to encode.
        model: Object exposing ``encode(texts, convert_to_numpy=...,
            show_progress_bar=...)`` (a SentenceTransformer in this app).

    Returns:
        Whatever ``model.encode`` returns for ``texts`` with
        ``convert_to_numpy=True``.
    """
    return _encode_corpus_cached(texts, model)

# ---------------------------
# Topic modeling (TF-IDF + NMF)
# ---------------------------
@st.cache_data
def make_topics(texts, n_topics=NUM_TOPICS, n_features=5000):
    """Fit a TF-IDF + NMF topic model on ``texts``.

    Args:
        texts: Iterable of document strings.
        n_topics: Number of NMF components (topics) to extract.
        n_features: Maximum vocabulary size for the TF-IDF vectorizer.

    Returns:
        A tuple ``(topics, W)`` where ``topics`` is a list of
        ``(topic_index, top_terms)`` pairs (up to ten highest-weighted terms
        per topic, strongest first) and ``W`` is the output of the NMF
        ``fit_transform`` call on the TF-IDF matrix.
    """
    vectorizer = TfidfVectorizer(max_features=n_features, stop_words='english')
    doc_term = vectorizer.fit_transform(texts)

    factorizer = NMF(n_components=n_topics, random_state=0)
    doc_topic = factorizer.fit_transform(doc_term)

    vocabulary = vectorizer.get_feature_names_out()
    topics = [
        # Reverse the ascending argsort and keep the ten largest weights.
        (number, [vocabulary[j] for j in weights.argsort()[::-1][:10]])
        for number, weights in enumerate(factorizer.components_)
    ]
    return topics, doc_topic

# ---------------------------
# Summarization + Chat helpers
# ---------------------------

def extractive_answer(user_question, docs_texts, embeddings_model, doc_embeddings, top_k=TOP_K):
    """Pick the documents most similar to a question.

    Args:
        user_question: Question text to embed.
        docs_texts: Sequence of document strings, indexable by position.
        embeddings_model: Object exposing ``encode(list, convert_to_numpy=True)``.
        doc_embeddings: Embeddings compared against the question via cosine
            similarity; row ``i`` is used as the embedding of ``docs_texts[i]``.
        top_k: Maximum number of documents to return.

    Returns:
        ``(answers, scores)``: the selected document texts, most similar first,
        and their cosine-similarity scores in the same order.
    """
    question_vec = embeddings_model.encode([user_question], convert_to_numpy=True)
    scores = cosine_similarity(question_vec, doc_embeddings)[0]
    descending = scores.argsort()[::-1]
    best = descending[:top_k]
    picked = [docs_texts[pos] for pos in best]
    return picked, scores[best]


def call_openai_completion(prompt, max_tokens=256, temperature=0.2):
    """Send ``prompt`` to an OpenAI chat model and return the reply text.

    Works with both the ``openai>=1.0`` client interface and the legacy
    module-level API. ``gpt-4o-mini`` is used when it appears among the model
    ids returned by the account's model listing; otherwise ``gpt-4o``.

    Args:
        prompt: User message content.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.

    Returns:
        The assistant message content, or ``None`` when the ``openai`` package
        or the ``OPENAI_API_KEY`` environment variable is missing, or when the
        request fails (a Streamlit warning is shown in that case).
    """
    key = os.environ.get('OPENAI_API_KEY')
    if not key or openai is None:
        return None

    messages = [{'role': 'user', 'content': prompt}]
    try:
        if hasattr(openai, 'OpenAI'):
            # openai>=1.0: client object; responses are typed objects.
            client = openai.OpenAI(api_key=key)
            available = {m.id for m in client.models.list()}
            model_name = 'gpt-4o-mini' if 'gpt-4o-mini' in available else 'gpt-4o'
            resp = client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
            )
            return resp.choices[0].message.content

        # Legacy (<1.0) module-level API; responses are dict-like.
        openai.api_key = key
        # Compare against the model ids, not the top-level keys of the response.
        available = {m['id'] for m in openai.Model.list()['data']}
        model_name = 'gpt-4o-mini' if 'gpt-4o-mini' in available else 'gpt-4o'
        resp = openai.ChatCompletion.create(
            model=model_name,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return resp['choices'][0]['message']['content']
    except Exception as e:
        # Best-effort helper: surface the failure in the UI and let callers fall back.
        st.warning(f"OpenAI call failed: {e}")
        return None


def local_summarize(text, max_length=150):
    """Summarize ``text`` with a local Hugging Face summarization pipeline.

    Args:
        text: Text to summarize.
        max_length: ``max_length`` passed to the summarizer call.

    Returns:
        The summary string, or ``None`` when ``transformers`` is unavailable or
        the pipeline raises (a Streamlit warning is shown on failure).
    """
    if pipeline is None:
        return None

    try:
        bart = pipeline('summarization', model='facebook/bart-large-cnn', truncation=True)
        results = bart(text, max_length=max_length, min_length=30, do_sample=False)
        first = results[0]
        return first['summary_text']
    except Exception as e:
        st.warning(f"Local summarization failed: {e}")
        return None

# ---------------------------
# App UI
# ---------------------------

def main():
    """Render the Streamlit page: data loading, topics, search, chatbot, charts, export.

    Flow, top to bottom:
      1. Load the CSV at ``DATA_PATH`` (first 200 rows when the sidebar
         "sample subset" box is ticked) and build ``combined_text`` if absent.
      2. Sidebar buttons build NMF topics and sentence embeddings; embeddings
         and the model are kept in ``st.session_state``.
      3. Semantic search and the RAG chatbot rank documents by cosine
         similarity against those embeddings.
      4. Plotly charts (publications per year, top authors) and a topic list.
      5. CSV export of rows selected by positional index.

    Returns early with an error message when the dataset file is missing.
    """
    st.set_page_config(layout='wide', page_title='Space Biology Knowledge Engine')
    st.title('🚀 Space Biology Knowledge Engine — Research explorer & chatbot')

    st.sidebar.header('Configuration')
    use_sample = st.sidebar.checkbox('Load sample subset (fast)', value=True)
    nrows = 200 if use_sample else None

    if not os.path.exists(DATA_PATH):
        st.error(f"Dataset not found at {DATA_PATH}. Please upload or place it there.")
        return

    df = load_data(DATA_PATH, nrows=nrows)

    # Embeddings built for a different row count (e.g. the sample checkbox was
    # toggled) no longer line up with df positions; discard them so search/chat
    # ask for a rebuild instead of returning wrong rows or raising IndexError.
    if 'embeddings' in st.session_state and len(st.session_state['embeddings']) != len(df):
        del st.session_state['embeddings']
        if 'sbert_model' in st.session_state:
            del st.session_state['sbert_model']

    # Create combined_text column
    if 'combined_text' not in df.columns:
        st.info('Combining columns into `combined_text` (title, abstract, body, authors)')
        df['combined_text'] = df.apply(preprocess_row, axis=1)

    # show basic dataset preview
    with st.expander('Dataset preview'):
        st.dataframe(df.head(10))

    # Topic modeling
    if st.sidebar.button('Build topics'):
        with st.spinner('Building topics (TF-IDF + NMF)...'):
            texts = prepare_corpus(df)
            topics, W = make_topics(texts, n_topics=NUM_TOPICS)
            st.subheader('Topics (TF-IDF + NMF)')
            for tid, words in topics:
                st.markdown(f"**Topic {tid}**: {', '.join(words)}")

    # Embeddings
    st.sidebar.header('Embeddings')
    if st.sidebar.button('Build embeddings'):
        with st.spinner('Loading embedding model...'):
            model = load_embedding_model()
            texts = prepare_corpus(df)
            st.info('Encoding texts (this may take a while)...')
            embeddings = build_embeddings(texts, model)
            st.session_state['embeddings'] = embeddings
            st.session_state['sbert_model'] = model
            st.success('Embeddings built and cached')

    # Quick search interface
    st.header('🔎 Search & Explore')
    query = st.text_input('Enter a search question or keywords')
    k = st.slider('Results to show', min_value=1, max_value=20, value=5)

    if st.button('Search') and query.strip():
        if 'sbert_model' not in st.session_state:
            st.warning('Please build embeddings first (sidebar).')
        else:
            model = st.session_state['sbert_model']
            embeddings = st.session_state['embeddings']
            q_emb = model.encode([query], convert_to_numpy=True)
            sims = cosine_similarity(q_emb, embeddings)[0]
            hits = sims.argsort()[::-1][:k]
            st.write(f'Found top {len(hits)} documents:')
            for rank, idx in enumerate(hits, start=1):
                score = sims[idx]
                row = df.iloc[idx]
                st.markdown(f"**{rank}. {row.get('title','(no title)')}** — score {score:.3f}")
                if 'abstract' in row and pd.notnull(row['abstract']):
                    # Convert once so non-string cells cannot break the slice.
                    abstract = str(row['abstract'])
                    st.write(abstract[:800] + ('...' if len(abstract) > 800 else ''))
                st.write(f"[View full row] -> index {idx}")

    # Chatbot
    st.header('🤖 Research Chatbot (RAG)')
    user_q = st.text_area('Ask a research question about space biology (use the dataset)')
    bot_mode = st.radio('Answering mode', ['Retrieval + Extractive', 'Retrieval + Generative (OpenAI)', 'Local Generative (HF)'])

    if st.button('Ask') and user_q.strip():
        if 'sbert_model' not in st.session_state:
            st.warning('Please build embeddings first (sidebar).')
        else:
            model = st.session_state['sbert_model']
            embeddings = st.session_state['embeddings']
            texts = prepare_corpus(df)
            # retrieve top docs
            q_emb = model.encode([user_q], convert_to_numpy=True)
            sims = cosine_similarity(q_emb, embeddings)[0]
            hits = sims.argsort()[::-1][:TOP_K]
            retrieved_texts = [texts[i] for i in hits]
            st.subheader('Retrieved documents (short)')
            for i, t in enumerate(retrieved_texts, start=1):
                st.write(f"{i}. " + textwrap.shorten(t, width=400, placeholder='...'))

            if bot_mode == 'Retrieval + Extractive':
                answers, sims_top = extractive_answer(user_q, texts, model, embeddings, top_k=TOP_K)
                st.subheader('Extractive answers (top snippets)')
                for i, (a, s) in enumerate(zip(answers, sims_top), start=1):
                    st.markdown(f"**Snippet {i} (score {s:.3f})**")
                    st.write(textwrap.shorten(a, width=600, placeholder='...'))

            elif bot_mode == 'Retrieval + Generative (OpenAI)':
                prompt = 'You are an expert assistant for space biology research. Use the following retrieved documents as evidence, then answer the user question succinctly.\n\n'
                for j, doc in enumerate(retrieved_texts, start=1):
                    prompt += f"[DOC {j}] " + doc[:1000] + '\n\n'
                prompt += f"User question: {user_q}\n\nAnswer concisely, cite which DOC number you used."
                out = call_openai_completion(prompt)
                if out:
                    st.subheader('Generative answer (OpenAI)')
                    st.write(out)
                else:
                    st.warning('OpenAI not available or call failed. Consider local mode or extractive mode.')

            elif bot_mode == 'Local Generative (HF)':
                if pipeline is None:
                    st.warning('transformers not installed/available. Install `transformers` and required models.')
                else:
                    try:
                        gen_pipe = pipeline('text-generation', model='gpt2', device=-1)
                        # Cap each document so the prompt stays within the
                        # model's limited context window.
                        context = '\n\n'.join(doc[:600] for doc in retrieved_texts[:3])
                        prompt = f"Context: {context}\n\nQuestion: {user_q}\nAnswer:"
                        # max_new_tokens bounds only the continuation; max_length
                        # would also count the (long) prompt tokens.
                        res = gen_pipe(prompt, max_new_tokens=128, do_sample=False)
                    except Exception as e:
                        st.warning(f"Local generation failed: {e}")
                    else:
                        st.subheader('Local generative answer')
                        st.write(res[0]['generated_text'])

    # Visualization
    st.header('📊 Interactive Research Visualizations')
    viz_choice = st.selectbox('Choose visualization', ['Publications per year', 'Top authors', 'Topic wordcloud-like list'])

    if viz_choice == 'Publications per year':
        if 'year' in df.columns:
            # Coerce unparsable years to NaN and drop them rather than crashing,
            # and work on a fresh Series instead of assigning into a dropna() result.
            years = pd.to_numeric(df['year'], errors='coerce').dropna().astype(int)
            counts = (
                years.value_counts()
                .sort_index()
                .rename_axis('year')
                .reset_index(name='count')
            )
            fig = px.bar(counts, x='year', y='count', title='Publications per year')
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.info('No `year` column found in dataset')

    if viz_choice == 'Top authors':
        if 'authors' in df.columns:
            # naive split authors by semicolon or comma
            all_auth = df['authors'].dropna().astype(str).str.split('[;,]').explode().str.strip()
            top = all_auth.value_counts().head(20).reset_index()
            top.columns = ['author','count']
            fig = px.bar(top, x='author', y='count', title='Top authors', labels={'author':'Author', 'count':'Papers'})
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.info('No `authors` column found in dataset')

    if viz_choice == 'Topic wordcloud-like list':
        texts = prepare_corpus(df)
        topics, W = make_topics(texts, n_topics=NUM_TOPICS)
        cols = st.columns(2)
        for i, (tid, words) in enumerate(topics):
            with cols[i % 2]:
                st.markdown(f"### Topic {tid}")
                st.write(', '.join(words))

    # Export selected docs
    st.header('📁 Export')
    st.write('You can select rows to export as CSV (by index list)')
    indices_text = st.text_input('Enter comma-separated indices to export (e.g. 0,5,10)')
    if st.button('Export selected'):
        try:
            idxs = [int(x.strip()) for x in indices_text.split(',') if x.strip()!='']
            sub = df.iloc[idxs]
            csv = sub.to_csv(index=False).encode('utf-8')
            st.download_button('Download CSV', data=csv, file_name='selected_papers.csv', mime='text/csv')
        except Exception as e:
            # Covers non-integer input and out-of-range positions.
            st.error(f'Failed to export: {e}')

    st.sidebar.markdown('---')
    st.sidebar.write('Tips: Build embeddings once, then use search/chat. For high-quality generation, set OPENAI_API_KEY.')

# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()

# ---------------------------
# End of file
# ---------------------------


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2025-09-17 12:47:43.826 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
