<a href="https://colab.research.google.com/github/AshmitKumar1110/Embbeding/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
!pip install Streamlit



In [29]:
!pip install -q transformers faiss-cpu torch pandas


In [31]:
import streamlit as st
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import faiss
import numpy as np
import json

st.set_page_config(page_title="Code Search & Explanation", layout="centered")

# Title
st.title("🔍 Code Search & Explanation with CodeBERT + CodeT5")
st.markdown("Search your codebase using natural language or code snippets, and get explanations with CodeT5.")

# Load CodeBERT model and tokenizer
@st.cache_resource
def load_codebert():
    tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
    model = RobertaModel.from_pretrained("microsoft/codebert-base")
    model.eval()
    return tokenizer, model

cb_tokenizer, cb_model = load_codebert()

# Load CodeT5 summarizer
@st.cache_resource
def load_codet5():
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-base")
    model.eval()
    return tokenizer, model

t5_tokenizer, t5_model = load_codet5()

# Load FAISS index and code snippets
index = faiss.read_index("codebert_embeddings.index")
with open("code_snippets.json") as f:
    code_snippets = json.load(f)

# Embedding function (CodeBERT)
def get_embedding(text):
    inputs = cb_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = cb_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

# Search code
def search(query, top_k=3):
    query_emb = get_embedding(query).reshape(1, -1)
    distances, indices = index.search(query_emb, top_k)
    results = []
    for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), 1):
        results.append({
            "rank": rank,
            "code": code_snippets[idx],
            "distance": float(dist)
        })
    return results

# Summarize code
def summarize_code(code):
    input_ids = t5_tokenizer("summarize: " + code, return_tensors="pt", truncation=True).input_ids
    with torch.no_grad():
        output_ids = t5_model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
    return t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# UI: Query input
query_type = st.selectbox("Choose query type:", ["Natural Language", "Code Snippet"])
query = st.text_area("Enter your query:", height=120)

# Perform search
if query:
    st.info(f"Searching using {query_type.lower()}...")
    results = search(query, top_k=3)
    st.subheader("🔎 Top Matching Code Snippets:")
    for result in results:
        st.code(result["code"], language="python")
        st.caption(f"Rank #{result['rank']} • Distance: {result['distance']:.4f}")

    # Optional explanation
    if query_type == "Code Snippet":
        if st.button("🧠 Explain This Code with CodeT5"):
            explanation = summarize_code(query)
            st.subheader("🧠 Code Explanation:")
            st.success(explanation)




In [32]:
# Create dummy FAISS index and code snippets files for demonstration purposes
import faiss
import numpy as np
import json

# Create a dummy index
d = 768  # Dimension of the embeddings (CodeBERT base)
nb = 10  # Number of vectors
index = faiss.IndexFlatL2(d)
xb = np.random.random((nb, d)).astype('float32')
index.add(xb)
faiss.write_index(index, "codebert_embeddings.index")

# Create dummy code snippets
code_snippets = [
    "def greet(name):\\n  print(f'Hello, {name}!')",
    "for i in range(5):\\n  print(i)",
    "data = {'a': 1, 'b': 2}\\nprint(data['a'])",
    "class MyClass:\\n  def __init__(self, value):\\n    self.value = value",
    "result = 10 + 20\\nprint(result)",
    "if x > 0:\\n  print('Positive')",
    "def calculate_sum(a, b):\\n  return a + b",
    "my_list = [1, 2, 3]\\nprint(len(my_list))",
    "import os\\nprint(os.getcwd())",
    "try:\\n  x = 1 / 0\\nexcept ZeroDivisionError:\\n  print('Cannot divide by zero')"
]
with open("code_snippets.json", "w") as f:
    json.dump(code_snippets, f)

print("Dummy 'codebert_embeddings.index' and 'code_snippets.json' created.")

Dummy 'codebert_embeddings.index' and 'code_snippets.json' created.


In [33]:
# Embedding function (CodeBERT)
def get_embedding(text):
    inputs = cb_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = cb_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

# Search code
def search(query, top_k=3):
    query_emb = get_embedding(query).reshape(1, -1)
    distances, indices = index.search(query_emb, top_k)
    results = []
    for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), 1):
        results.append({
            "rank": rank,
            "code": code_snippets[idx],
            "distance": float(dist)
        })
    return results

# Summarize code
def summarize_code(code):
    input_ids = t5_tokenizer("summarize: " + code, return_tensors="pt", truncation=True).input_ids
    with torch.no_grad():
        output_ids = t5_model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
    return t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# UI: Query input
query_type = st.selectbox("Choose query type:", ["Natural Language", "Code Snippet"])
query = st.text_area("Enter your query:", height=120)

# Perform search
if query:
    st.info(f"Searching using {query_type.lower()}...")
    results = search(query, top_k=3)
    st.subheader("🔎 Top Matching Code Snippets:")
    for result in results:
        st.code(result["code"], language="python")
        st.caption(f"Rank #{result['rank']} • Distance: {result['distance']:.4f}")

    # Optional explanation
    if query_type == "Code Snippet":
        if st.button("🧠 Explain This Code with CodeT5"):
            explanation = summarize_code(query)
            st.subheader("🧠 Code Explanation:")
            st.success(explanation)




In [34]:
!pip install -q pyngrok

In [35]:
from pyngrok import ngrok
import subprocess
import threading
import time
from google.colab import userdata
import os

# Get ngrok authtoken from Colab secrets
ngrok_auth_token = userdata.get('NGROK_AUTH_TOKEN')
if ngrok_auth_token:
    ngrok.set_auth_token(ngrok_auth_token)
else:
    print("NGROK_AUTH_TOKEN not found in Colab secrets. Please add it.")
    # You might want to stop execution here if the token is essential
    # exit()

# Save the Streamlit code to a file
streamlit_code = """
import streamlit as st
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import faiss
import numpy as np
import json

st.set_page_config(page_title="Code Search & Explanation", layout="centered")

# Title
st.title("🔍 Code Search & Explanation with CodeBERT + CodeT5")
st.markdown("Search your codebase using natural language or code snippets, and get explanations with CodeT5.")

# Load CodeBERT model and tokenizer
@st.cache_resource
def load_codebert():
    tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
    model = RobertaModel.from_pretrained("microsoft/codebert-base")
    model.eval()
    return tokenizer, model

# Load CodeT5 summarizer
@st.cache_resource
def load_codet5():
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-base")
    model.eval()
    return tokenizer, model

# Load FAISS index and code snippets
# Assuming 'codebert_embeddings.index' and 'code_snippets.json' are already created
try:
    index = faiss.read_index("codebert_embeddings.index")
    with open("code_snippets.json") as f:
        code_snippets = json.load(f)
except FileNotFoundError:
    st.error("Required data files (codebert_embeddings.index, code_snippets.json) not found. Please run the cell to create dummy files.")
    st.stop()


# Embedding function (CodeBERT)
def get_embedding(text):
    inputs = cb_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = cb_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

# Search code
def search(query, top_k=3):
    # Ensure models and index are loaded before use
    global cb_tokenizer, cb_model, index, code_snippets
    try:
        cb_tokenizer, cb_model = load_codebert()
    except Exception as e:
        st.error(f"Error loading CodeBERT model: {e}")
        st.stop()

    try:
        query_emb = get_embedding(query).reshape(1, -1)
        distances, indices = index.search(query_emb, top_k)
        results = []
        for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), 1):
            results.append({
                "rank": rank,
                "code": code_snippets[idx],
                "distance": float(dist)
            })
        return results
    except Exception as e:
        st.error(f"Error during search: {e}")
        st.stop()


# Summarize code
def summarize_code(code):
    # Ensure CodeT5 model is loaded before use
    global t5_tokenizer, t5_model
    try:
        t5_tokenizer, t5_model = load_codet5()
    except Exception as e:
        st.error(f"Error loading CodeT5 model: {e}")
        st.stop()

    try:
        input_ids = t5_tokenizer("summarize: " + code, return_tensors="pt", truncation=True).input_ids
        with torch.no_grad():
            output_ids = t5_model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
        return t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    except Exception as e:
        st.error(f"Error during summarization: {e}")
        st.stop()


# UI: Query input
query_type = st.selectbox("Choose query type:", ["Natural Language", "Code Snippet"])
query = st.text_area("Enter your query:", height=120)

# Perform search
if query:
    st.info(f"Searching using {query_type.lower()}...")
    results = search(query, top_k=3)
    st.subheader("🔎 Top Matching Code Snippets:")
    for result in results:
        st.code(result["code"], language="python")
        st.caption(f"Rank #{result['rank']} • Distance: {result['distance']:.4f}")

    # Optional explanation
    if query_type == "Code Snippet":
        if st.button("🧠 Explain This Code with CodeT5"):
            explanation = summarize_code(query)
            st.subheader("🧠 Code Explanation:")
            st.success(explanation)

"""

with open("app.py", "w") as f:
    f.write(streamlit_code)

# Start ngrok tunnel
public_url = ngrok.connect("8501")
print(f"⚡️ ngrok tunnel is live at {public_url}")

# Function to run streamlit in a separate thread
def run_streamlit():
    try:
        # Use subprocess.Popen to keep the process running in the background
        streamlit_process = subprocess.Popen(["streamlit", "run", "app.py", "--server.port", "8501"])
        print(f"Streamlit process launched with PID: {streamlit_process.pid}")
        streamlit_process.wait() # Wait for the process to finish if it exits
    except FileNotFoundError:
        print("Error: streamlit command not found. Make sure Streamlit is installed.")
    except Exception as e:
        print(f"An error occurred while running Streamlit: {e}")

# Start streamlit in a new thread
streamlit_thread = threading.Thread(target=run_streamlit)
streamlit_thread.start()

# Add a small delay to allow Streamlit to start
time.sleep(5)

# Keep the main thread alive to prevent Colab from stopping the background process
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("Stopping Streamlit and ngrok...")
    ngrok.disconnect(public_url)
    streamlit_thread.join()

⚡️ ngrok tunnel is live at NgrokTunnel: "https://a74cfede3feb.ngrok-free.app" -> "http://localhost:8501"
Streamlit process launched with PID: 20434




Stopping Streamlit and ngrok...


PyngrokNgrokURLError: ngrok client exception, URLError: [Errno 111] Connection refused

# Task
Explain the error in the provided Python code, fix it, and provide a step-by-step process for running the corrected code, including how to stop the server.

## Ensure ngrok token is in colab secrets

### Subtask:
Verify that your ngrok authentication token is added to Colab's "Secrets" tab with the name `NGROK_AUTH_TOKEN`.


## Stop existing ngrok sessions

### Subtask:
Close any other active ngrok tunnels you might have running.


## Run setup cells

### Subtask:
Execute the cells that install necessary libraries (`Streamlit`, `transformers`, `faiss-cpu`, `torch`, `pandas`, `pyngrok`) and create the dummy data files (`codebert_embeddings.index`, `code_snippets.json`).


**Reasoning**:
The first step is to ensure all necessary libraries are installed. The provided notebook history shows cells that perform these installations. I will execute the cell that installs Streamlit, the cell that installs transformers, faiss-cpu, torch, and pandas, and the cell that installs pyngrok.



In [None]:
!pip install Streamlit
!pip install -q transformers faiss-cpu torch pandas
!pip install -q pyngrok

**Reasoning**:
The next step is to create the dummy data files required by the Streamlit application. The provided notebook history includes a cell that generates a dummy FAISS index and code snippets JSON file. I will execute this cell.



In [37]:
# Create dummy FAISS index and code snippets files for demonstration purposes
import faiss
import numpy as np
import json

# Create a dummy index
d = 768  # Dimension of the embeddings (CodeBERT base)
nb = 10  # Number of vectors
index = faiss.IndexFlatL2(d)
xb = np.random.random((nb, d)).astype('float32')
index.add(xb)
faiss.write_index(index, "codebert_embeddings.index")

# Create dummy code snippets
code_snippets = [
    "def greet(name):\\n  print(f'Hello, {name}!')",
    "for i in range(5):\\n  print(i)",
    "data = {'a': 1, 'b': 2}\\nprint(data['a'])",
    "class MyClass:\\n  def __init__(self, value):\\n    self.value = value",
    "result = 10 + 20\\nprint(result)",
    "if x > 0:\\n  print('Positive')",
    "def calculate_sum(a, b):\\n  return a + b",
    "my_list = [1, 2, 3]\\nprint(len(my_list))",
    "import os\\nprint(os.getcwd())",
    "try:\\n  x = 1 / 0\\nexcept ZeroDivisionError:\\n  print('Cannot divide by zero')"
]
with open("code_snippets.json", "w") as f:
    json.dump(code_snippets, f)

print("Dummy 'codebert_embeddings.index' and 'code_snippets.json' created.")

Dummy 'codebert_embeddings.index' and 'code_snippets.json' created.


## Run the streamlit and ngrok cell

### Subtask:
Execute the cell that saves the Streamlit code to `app.py`, sets up the ngrok tunnel, and starts the Streamlit application in a background thread.


**Reasoning**:
The subtask requires executing the cell that saves the Streamlit code to `app.py`, sets up the ngrok tunnel, and starts the Streamlit application. I will execute the provided code block to achieve this.



In [40]:
from pyngrok import ngrok
import subprocess
import threading
import time
from google.colab import userdata
import os

# Get ngrok authtoken from Colab secrets
ngrok_auth_token = userdata.get('NGROK_AUTH_TOKEN')
if ngrok_auth_token:
    ngrok.set_auth_token(ngrok_auth_token)
else:
    print("NGROK_AUTH_TOKEN not found in Colab secrets. Please add it.")
    # You might want to stop execution here if the token is essential
    # exit()

# Save the Streamlit code to a file
streamlit_code = """
import streamlit as st
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import faiss
import numpy as np
import json

st.set_page_config(page_title="Code Search & Explanation", layout="centered")

# Title
st.title("🔍 Code Search & Explanation with CodeBERT + CodeT5")
st.markdown("Search your codebase using natural language or code snippets, and get explanations with CodeT5.")

# Load CodeBERT model and tokenizer
@st.cache_resource
def load_codebert():
    tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
    model = RobertaModel.from_pretrained("microsoft/codebert-base")
    model.eval()
    return tokenizer, model

# Load CodeT5 summarizer
@st.cache_resource
def load_codet5():
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-base")
    model.eval()
    return tokenizer, model

# Load FAISS index and code snippets
# Assuming 'codebert_embeddings.index' and 'code_snippets.json' are already created
try:
    index = faiss.read_index("codebert_embeddings.index")
    with open("code_snippets.json") as f:
        code_snippets = json.load(f)
except FileNotFoundError:
    st.error("Required data files (codebert_embeddings.index, code_snippets.json) not found. Please run the cell to create dummy files.")
    st.stop()


# Embedding function (CodeBERT)
def get_embedding(text):
    inputs = cb_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = cb_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

# Search code
def search(query, top_k=3):
    # Ensure models and index are loaded before use
    global cb_tokenizer, cb_model, index, code_snippets
    try:
        cb_tokenizer, cb_model = load_codebert()
    except Exception as e:
        st.error(f"Error loading CodeBERT model: {e}")
        st.stop()

    try:
        query_emb = get_embedding(query).reshape(1, -1)
        distances, indices = index.search(query_emb, top_k)
        results = []
        for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), 1):
            results.append({
                "rank": rank,
                "code": code_snippets[idx],
                "distance": float(dist)
            })
        return results
    except Exception as e:
        st.error(f"Error during search: {e}")
        st.stop()


# Summarize code
def summarize_code(code):
    # Ensure CodeT5 model is loaded before use
    global t5_tokenizer, t5_model
    try:
        t5_tokenizer, t5_model = load_codet5()
    except Exception as e:
        st.error(f"Error loading CodeT5 model: {e}")
        st.stop()

    try:
        input_ids = t5_tokenizer("summarize: " + code, return_tensors="pt", truncation=True).input_ids
        with torch.no_grad():
            output_ids = t5_model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
        return t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    except Exception as e:
        st.error(f"Error during summarization: {e}")
        st.stop()


# UI: Query input
query_type = st.selectbox("Choose query type:", ["Natural Language", "Code Snippet"])
query = st.text_area("Enter your query:", height=120)

# Perform search
if query:
    st.info(f"Searching using {query_type.lower()}...")
    results = search(query, top_k=3)
    st.subheader("🔎 Top Matching Code Snippets:")
    for result in results:
        st.code(result["code"], language="python")
        st.caption(f"Rank #{result['rank']} • Distance: {result['distance']:.4f}")

    # Optional explanation
    if query_type == "Code Snippet":
        if st.button("🧠 Explain This Code with CodeT5"):
            explanation = summarize_code(query)
            st.subheader("🧠 Code Explanation:")
            st.success(explanation)

"""

with open("app.py", "w") as f:
    f.write(streamlit_code)

# Start ngrok tunnel
public_url = ngrok.connect("8501")
print(f"⚡️ ngrok tunnel is live at {public_url}")

# Function to run streamlit in a separate thread
def run_streamlit():
    try:
        # Use subprocess.Popen to keep the process running in the background
        streamlit_process = subprocess.Popen(["streamlit", "run", "app.py", "--server.port", "8501"])
        print(f"Streamlit process launched with PID: {streamlit_process.pid}")
        streamlit_process.wait() # Wait for the process to finish if it exits
    except FileNotFoundError:
        print("Error: streamlit command not found. Make sure Streamlit is installed.")
    except Exception as e:
        print(f"An error occurred while running Streamlit: {e}")

# Start streamlit in a new thread
streamlit_thread = threading.Thread(target=run_streamlit)
streamlit_thread.start()

# Add a small delay to allow Streamlit to start
time.sleep(5)

# Keep the main thread alive to prevent Colab from stopping the background process
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("Stopping Streamlit and ngrok...")
    ngrok.disconnect(public_url)
    streamlit_thread.join()

⚡️ ngrok tunnel is live at NgrokTunnel: "https://84dba41ddba8.ngrok-free.app" -> "http://localhost:8501"
Streamlit process launched with PID: 23431




Stopping Streamlit and ngrok...


ConnectionResetError: [Errno 104] Connection reset by peer

**Reasoning**:
The previous execution of the cell failed with a KeyboardInterrupt and a ConnectionResetError during the cleanup phase (`ngrok.disconnect`). The core task of starting the Streamlit application and ngrok tunnel was likely attempted. Since the goal is to execute the cell to start the app and tunnel, I will re-execute the same code block. The errors encountered during the previous execution are related to interrupting the running process, not the initial setup and launch.

