In [1]:
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Cut-off used by retrieve_answer: a nearest-neighbour hit is accepted only when
# the distance reported by the FAISS index is strictly below this value.
# NOTE(review): FAISS documents IndexFlatL2 as reporting *squared* L2 distances;
# confirm that 0.5 was tuned with that in mind.
SIMILARITY_THRESHOLD = 0.5

# Read the question/solution pairs and keep each column as a plain Python list;
# retrieve_answer uses the row id returned by FAISS as a position in both lists.
data = pd.read_csv("python_code.csv")
questions, solutions = (data[column].tolist() for column in ("question", "solution"))

# Step 1: embed every stored question once, up front.
embedder_model = SentenceTransformer("all-mpnet-base-v2")
question_embeddings = embedder_model.encode(questions, convert_to_tensor=True)

# Step 2: build a flat L2 FAISS index over those embeddings.
d = question_embeddings.shape[1]  # embedding width = index dimensionality
index = faiss.IndexFlatL2(d)
# The index is fed a NumPy array made from the CPU copy of the tensor.
index.add(np.array(question_embeddings.cpu()))

# Function to retrieve the nearest question based on input
def retrieve_answer(input_question):
    """Find the stored solution whose question is nearest to ``input_question``.

    The query is embedded with ``embedder_model`` and the single closest entry
    (``k=1``) is looked up in the module-level FAISS ``index``.

    Args:
        input_question: Free-text question to search for.

    Returns:
        ``(solution, matched_question)`` taken from ``solutions`` and
        ``questions`` when the reported distance is strictly below
        ``SIMILARITY_THRESHOLD``; ``(None, None)`` otherwise.
    """
    # Embed the query and convert the CPU copy to a NumPy array for FAISS.
    query_tensor = embedder_model.encode([input_question], convert_to_tensor=True)
    query_matrix = np.array(query_tensor.cpu())

    # Ask for the single best hit; both results are indexed [query][rank].
    distances, row_ids = index.search(query_matrix, k=1)
    best_distance = distances[0][0]

    # Guard clause: anything not strictly closer than the threshold is a miss.
    # NOTE(review): FAISS documents IndexFlatL2 distances as squared L2;
    # confirm the threshold is calibrated for that.
    if not (best_distance < SIMILARITY_THRESHOLD):
        return None, None

    best_row = row_ids[0][0]
    return solutions[best_row], questions[best_row]

# Step 3: Load CodeT5 for code generation.
# The tokenizer and the seq2seq model come from the same checkpoint, so the
# name is spelled once and shared by both loaders.
CODET5_CHECKPOINT = "Salesforce/codet5-large-ntp-py"
tokenizer = AutoTokenizer.from_pretrained(CODET5_CHECKPOINT)
codet5_model = AutoModelForSeq2SeqLM.from_pretrained(CODET5_CHECKPOINT)

# Function to generate code using CodeT5
def generate_code(prompt):
    """Run ``prompt`` through CodeT5 and return the decoded text.

    Args:
        prompt: Text handed to the tokenizer as the model input.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        stripped. Generation is capped with ``max_length=200``.
    """
    # Tokenize to PyTorch tensors ("pt") so the result can be splatted into generate().
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    output_sequences = codet5_model.generate(max_length=200, **encoded_prompt)

    # Only the first returned sequence is used.
    first_sequence = output_sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Step 4: Combine Retrieval and Generation (RAG)
def rag_generate(input_question):
    """Retrieve the closest stored solution and pass it to the generator.

    Args:
        input_question: Free-text question from the user.

    Returns:
        On a retrieval hit, a dict with the keys ``input_question``,
        ``matched_question``, ``retrieved_solution`` and ``generated_code``.
        When ``retrieve_answer`` yields no solution, a dict with
        ``input_question`` and an ``error`` message instead.
    """
    # Step 4.1: nearest stored question and its solution (or None, None).
    retrieved_solution, nearest_question = retrieve_answer(input_question)

    # Early exit when retrieval found nothing close enough.
    if retrieved_solution is None:
        return {
            'input_question': input_question,
            'error': "No relevant questions found."
        }

    # Step 4.2: the prompt is the question plus the retrieved solution, each
    # introduced by a comment-style header line.
    prompt = f"# Input Question: {input_question}\n# Retrieved Solution: {retrieved_solution}\n"
    generated_code = generate_code(prompt)

    return {
        'input_question': input_question,
        'matched_question': nearest_question,
        'retrieved_solution': retrieved_solution,
        'generated_code': generated_code
    }

# Smoke test: push one sample query through the retrieve-then-generate pipeline.
new_question = "Program to find LCM?"
result = rag_generate(new_question)

# rag_generate marks a retrieval miss with an 'error' key; report that case
# first, otherwise print every field of the successful result.
if 'error' in result:
    print(result['error'])
else:
    print(f"Input Question: {result['input_question']}")
    print(f"Matched Question: {result['matched_question']}")
    print(f"Retrieved Solution: {result['retrieved_solution']}")
    print(f"Generated Code:\n{result['generated_code']}")


  from tqdm.autonotebook import tqdm, trange







Input Question: Program to find LCM?
Matched Question: 55 write a  program to find LCM

Retrieved Solution: 
def lcm(x, y):  
   if x > y:  
       greater = x  
   else:  
       greater = y  
   while(True):  
       if((greater % x == 0) and (greater % y == 0)):  
           lcm = greater  
           break  
       greater += 1  
   return lcm  
  
  
num1 = int(input("Enter first number: "))  
num2 = int(input("Enter second number: "))  
print("The L.C.M. of", num1,"and", num2,"is", lcm(num1, num2)) 


Generated Code:
# Output Question: Program to find LCM?
# Retrieved Solution: 
def lcm(x, y):  
   if x > y:  
       greater = x  
   else:  
       greater = y  
   while(True):  
       if((greater % x == 0) and (greater % y == 0)):  
           lcm = greater  
           break  
       greater += 1  
   return lcm  
  
num1 = int(input("Enter first number: "))  
num2 = int(input("Enter second number: "))  
print("The L.C.M. of", num1,"and", num2,"is", lcm(num1, num2))  



In [1]:
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Cut-off used by retrieve_answer: a nearest-neighbour hit is accepted only when
# the distance reported by the FAISS index is strictly below this value.
# NOTE(review): FAISS documents IndexFlatL2 as reporting *squared* L2 distances;
# confirm that 0.5 was tuned with that in mind.
SIMILARITY_THRESHOLD = 0.5

# Read the question/solution pairs and keep each column as a plain Python list;
# retrieve_answer uses the row id returned by FAISS as a position in both lists.
data = pd.read_csv("python_code.csv")
questions, solutions = (data[column].tolist() for column in ("question", "solution"))

# Step 1: embed every stored question once, up front.
embedder_model = SentenceTransformer("all-mpnet-base-v2")
question_embeddings = embedder_model.encode(questions, convert_to_tensor=True)

# Step 2: build a flat L2 FAISS index over those embeddings.
d = question_embeddings.shape[1]  # embedding width = index dimensionality
index = faiss.IndexFlatL2(d)
# The index is fed a NumPy array made from the CPU copy of the tensor.
index.add(np.array(question_embeddings.cpu()))

# Function to retrieve the nearest question based on input
def retrieve_answer(input_question):
    """Find the stored solution whose question is nearest to ``input_question``.

    The query is embedded with ``embedder_model`` and the single closest entry
    (``k=1``) is looked up in the module-level FAISS ``index``.

    Args:
        input_question: Free-text question to search for.

    Returns:
        ``(solution, matched_question)`` taken from ``solutions`` and
        ``questions`` when the reported distance is strictly below
        ``SIMILARITY_THRESHOLD``; ``(None, None)`` otherwise.
    """
    # Embed the query and convert the CPU copy to a NumPy array for FAISS.
    query_tensor = embedder_model.encode([input_question], convert_to_tensor=True)
    query_matrix = np.array(query_tensor.cpu())

    # Ask for the single best hit; both results are indexed [query][rank].
    distances, row_ids = index.search(query_matrix, k=1)
    best_distance = distances[0][0]

    # Guard clause: anything not strictly closer than the threshold is a miss.
    # NOTE(review): FAISS documents IndexFlatL2 distances as squared L2;
    # confirm the threshold is calibrated for that.
    if not (best_distance < SIMILARITY_THRESHOLD):
        return None, None

    best_row = row_ids[0][0]
    return solutions[best_row], questions[best_row]

# Step 3: Load the CodeT5 tokenizer and model, then shrink the model where possible.
# The checkpoint itself is a regular (non-quantized) model; quantization is
# applied afterwards, in this cell.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-large-ntp-py")
codet5_model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-large-ntp-py")

# PyTorch's dynamic int8 quantization of nn.Linear is documented as running on
# CPU backends. Moving a dynamically quantized model to CUDA and feeding it CUDA
# tensors (as generate_code does via `.to(device)`) is expected to fail at
# inference time, so quantize only when the selected device is the CPU.
if device.type == "cpu":
    # Reduce memory usage by storing Linear weights as int8.
    quantized_model = torch.quantization.quantize_dynamic(
        codet5_model, {torch.nn.Linear}, dtype=torch.qint8
    )
else:
    # On a GPU the original floating-point model is used unchanged. The name
    # `quantized_model` is kept so that generate_code works on either device.
    quantized_model = codet5_model.to(device)

# Function to generate code using the quantized CodeT5 model
def generate_code(prompt):
    """Run ``prompt`` through ``quantized_model`` and return the decoded text.

    Args:
        prompt: Text handed to the tokenizer as the model input.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        stripped. Generation is capped with ``max_length=200``.
    """
    # Tokenize to PyTorch tensors and place them on the module-level `device`.
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(device)

    output_sequences = quantized_model.generate(max_length=200, **encoded_prompt)

    # Only the first returned sequence is used.
    first_sequence = output_sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Step 4: Combine Retrieval and Generation (RAG)
def rag_generate(input_question):
    """Retrieve the closest stored solution and pass it to the generator.

    Args:
        input_question: Free-text question from the user.

    Returns:
        On a retrieval hit, a dict with the keys ``input_question``,
        ``matched_question``, ``retrieved_solution`` and ``generated_code``.
        When ``retrieve_answer`` yields no solution, a dict with
        ``input_question`` and an ``error`` message instead.
    """
    # Step 4.1: nearest stored question and its solution (or None, None).
    retrieved_solution, nearest_question = retrieve_answer(input_question)

    # Early exit when retrieval found nothing close enough.
    if retrieved_solution is None:
        return {
            'input_question': input_question,
            'error': "No relevant questions found."
        }

    # Step 4.2: the prompt is the question plus the retrieved solution, each
    # introduced by a comment-style header line.
    prompt = f"# Input Question: {input_question}\n# Retrieved Solution: {retrieved_solution}\n"
    generated_code = generate_code(prompt)

    return {
        'input_question': input_question,
        'matched_question': nearest_question,
        'retrieved_solution': retrieved_solution,
        'generated_code': generated_code
    }

# Smoke test: push one sample query through the retrieve-then-generate pipeline.
new_question = "Program to find LCM?"
result = rag_generate(new_question)

# rag_generate marks a retrieval miss with an 'error' key; report that case
# first, otherwise print every field of the successful result.
if 'error' in result:
    print(result['error'])
else:
    print(f"Input Question: {result['input_question']}")
    print(f"Matched Question: {result['matched_question']}")
    print(f"Retrieved Solution: {result['retrieved_solution']}")
    print(f"Generated Code:\n{result['generated_code']}")


  from tqdm.autonotebook import tqdm, trange



Using device: cpu




: 