In [None]:
!pip install openai==0.28

In [None]:
pip install transformers sentence-transformers scikit-learn

In [None]:
from sentence_transformers import SentenceTransformer
import json
import re
import numpy as np
import os

# Sentence-embedding model used by the helpers in this notebook that read the
# module-level `model`; the identifier is named so it is easy to spot and change.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def preprocess_text(text):
    """Split raw text into one cleaned chunk per record.

    A new chunk begins wherever a run of at least five digits starts. Each
    chunk is stripped, runs of newlines inside it are collapsed to a single
    space, and chunks that are empty after stripping are dropped.

    The split point is anchored to the start of a digit run. Without that
    anchor every position inside a longer number (for example a 10-digit
    phone number) is also followed by five digits, so the number would be
    shredded into single-digit chunks.

    Args:
        text: Raw text to split.

    Returns:
        List of non-empty chunk strings, in document order.
    """
    # (?<!\d) requires "not preceded by a digit", so we only split where a
    # number begins, never in the middle of one.
    record_boundary = r"(?<!\d)(?=\d{5})"
    chunks = re.split(record_boundary, text.strip())
    clean_chunks = [re.sub(r'\n+', ' ', chunk.strip()) for chunk in chunks if chunk.strip()]
    return clean_chunks

def load_and_preprocess(filepath, encoding='utf-8'):
    """Read a text file and split it into cleaned chunks.

    Args:
        filepath: Path of the text file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's locale default.

    Returns:
        The list of chunks produced by `preprocess_text` for the file's
        contents.
    """
    with open(filepath, 'r', encoding=encoding) as file:
        text = file.read()
    return preprocess_text(text)

def convert_to_vectors(chunks):
    """Embed `chunks` with the module-level `model`.

    Returns whatever `model.encode` produces for the given chunks.
    """
    return model.encode(chunks)

def save_vectors_to_json(chunks, vectors, output_json):
    """Write chunk/vector pairs to `output_json` as a JSON list.

    Each entry is an object with the chunk text under 'chunk' and the
    vector (converted with `.tolist()`) under 'vector'. Chunks and vectors
    are paired positionally with `zip`.
    """
    records = []
    for text, embedding in zip(chunks, vectors):
        records.append({'chunk': text, 'vector': embedding.tolist()})

    with open(output_json, 'w') as out_file:
        json.dump(records, out_file)

# Build the index: read the source text, embed every chunk, and persist the
# chunk/vector pairs to JSON.
SOURCE_TEXT_PATH = '/content/Engineering.txt'
VECTOR_STORE_PATH = 'college_data_vectors.json'

chunks = load_and_preprocess(SOURCE_TEXT_PATH)
vectors = convert_to_vectors(chunks)
save_vectors_to_json(chunks, vectors, VECTOR_STORE_PATH)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def load_vectors_from_json(input_json):
    """Load chunk texts and their vectors from a JSON file.

    The file is expected to hold a list of objects with 'chunk' and 'vector'
    keys.

    Returns:
        A `(chunks, vectors)` tuple: the list of chunk values and a NumPy
        array built from the per-entry vectors.
    """
    with open(input_json, 'r') as in_file:
        records = json.load(in_file)

    texts = [record['chunk'] for record in records]

    embeddings = []
    for record in records:
        embeddings.append(np.array(record['vector']))

    return texts, np.array(embeddings)

# Find the most similar chunks
def find_similar_chunk(query, chunks, vectors, top_n=5):
    """Return the `top_n` chunks most similar to `query`, best match first.

    The query is embedded with the module-level `model` and compared to
    every row of `vectors` by cosine similarity.

    Args:
        query: Query text to embed.
        chunks: Sequence of chunk texts, index-aligned with `vectors`.
        vectors: Embeddings, one row per chunk.
        top_n: Maximum number of results to return.

    Returns:
        List of `(chunk, similarity)` tuples in descending similarity order.
        Empty when `top_n` is not positive or there is nothing to search.
    """
    # Guard before doing any work: the slice `[-0:]` selects the whole array,
    # so top_n=0 would otherwise return every chunk, and a negative top_n
    # would return an arbitrary slice. An empty index has nothing to rank.
    if top_n <= 0 or len(chunks) == 0 or len(vectors) == 0:
        return []

    # Convert the query to a vector
    query_vector = model.encode([query])[0]

    # Calculate cosine similarity between the query vector and all text vectors
    similarities = cosine_similarity([query_vector], vectors)[0]

    # argsort is ascending: take the last top_n indices, then reverse them
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Pair each selected chunk with its similarity score
    similar_chunks = [(chunks[i], similarities[i]) for i in top_indices]
    return similar_chunks

# Reload the chunk texts and their vectors from the JSON file, rebinding the
# module-level `chunks` and `vectors` names to the loaded values.
chunks, vectors = load_vectors_from_json('college_data_vectors.json')

In [None]:
# Never store an API key in the notebook: it ends up in version control and in
# every shared copy. The key that used to be hardcoded in this cell must be
# treated as compromised and revoked in the OpenAI dashboard.
from getpass import getpass

# Use a key already present in the environment; otherwise prompt for one
# without echoing it to the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
import openai

def generate_text(prompt):
    """Send `prompt` as a single user message to gpt-3.5-turbo.

    Returns the message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=[user_message],
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

# Interactive question-answering loop: read a query, retrieve the most similar
# chunks, and ask the chat model to answer using them. Type "exit" to stop.
while True:  # the original `while :` was a syntax error
    try:
        query = input().strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: stop cleanly.
        break

    # Leave the loop on "exit"; previously the loop never terminated.
    if query.lower() == 'exit':
        break
    # Skip blank lines instead of sending an empty question to the API.
    if not query:
        continue

    similar_chunks = find_similar_chunk(query, chunks, vectors)
    print(f"Query: {query}\n")

    # Separate the context, the instruction and the query so they do not run
    # together into a single token stream.
    prompt = str(similar_chunks) + "\nusing this data answer this query: " + query
    generated_text = generate_text(prompt)
    print(generated_text)