In [1]:
#libraries
# The install commands are kept inside a string literal, so running this cell installs
# nothing; the cell only echoes the string.
# NOTE(review): the accelerate, einops, langchain and xformers entries omit the `install`
# subcommand, and their trailing backslashes join them onto a single line (the string is
# not raw). Fix both before copying these commands into a shell or a %pip cell.
'''
!pip install transformers==4.31.0 
!pip install sentence-transformers==2.2.2 
!pip accelerate==0.21.0 \
!pip einops==0.6.1 \
!pip langchain==0.0.240 \
!pip xformers==0.0.20 \
'''

'\n!pip install transformers==4.31.0 \n!pip install sentence-transformers==2.2.2 \n!pip accelerate==0.21.0 !pip einops==0.6.1 !pip langchain==0.0.240 !pip xformers==0.0.20 '

In [2]:
#specific version to avoid error with vector store
#!pip install chromadb==0.5.3
#!pip install datasets

In [3]:
#loading training data for providing context to the model
import pandas as pd
train_data_df = pd.read_csv(
    "C:/Users/USER/Documents/Projects/Instruction Dataset/LLM Training Data/train_data_split.csv"
)
# Fold each row's instruction, input text and expected output into one example string.
# The combined text overwrites the 'Instruction' column and is what gets embedded later.
train_data_df['Instruction'] = (
    train_data_df['Instruction']
    + ". Input Text: "
    + train_data_df['input_text']
    + "\n\nExtracted Properties:\n\n"
    + train_data_df['Output'].astype(str)
)
# Plain dict holding the list of combined example strings under the key 'instruction'.
train_dataset_dict = dict(instruction=train_data_df['Instruction'].tolist())

In [4]:
#verify
#train_dataset_dict

In [5]:
#creating context document from the list
docs=train_dataset_dict['instruction']

In [6]:
#initializing the embedding model
# Loads the sentence-transformers 'all-mpnet-base-v2' model, using '/data/base_models'
# as its cache folder.
# NOTE(review): the transformers (AutoTokenizer, AutoConfig, AutoModelForCausalLM) and
# torch imports are not used by any of the cells shown here.
from sentence_transformers import SentenceTransformer
import chromadb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
import torch
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', cache_folder = '/data/base_models')

In [7]:
#do the embedding
embeddings = embedding_model.encode(docs)

In [8]:
#verify the embeddings
embeddings.shape

(1040, 768)

In [9]:
#indexing - safe to re-run: the collection is reused if it already exists in this client,
#whereas create_collection raises when the name is already taken.
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="q-rag_llama")

In [10]:
#adding the train data: one record per document, keyed by its position in `docs`
collection.add(
    ids=list(map(str, range(len(docs)))),
    documents=docs,
    embeddings=embeddings,
)

In [11]:
#retrieving
def retrieve_vector_db(query, n_results=3):
    """Look up `query` in the module-level Chroma `collection`.

    The query is embedded with the module-level `embedding_model` and passed to
    `collection.query`.

    Args:
        query: Text to embed and search with.
        n_results: Number of results requested from the collection.

    Returns:
        The ``'documents'`` entry of the query result, returned unchanged.
    """
    query_vector = embedding_model.encode(query).tolist()
    hits = collection.query(query_embeddings=query_vector, n_results=n_results)
    return hits['documents']

In [12]:
#loading test data for model evaluation
import pandas as pd
test_data_path="C:/Users/USER/Documents/Projects/Instruction Dataset/LLM Training Data/test_data_split.csv"

In [13]:
#Retrieval queries, one per extractable property.
#check_text picks one of these for each test row from the wording of the row's
#instruction; the chosen query is later embedded to fetch example documents.
query_temperature="working temperature of the quantum cascade laser"
query_power= "optical power of the quantum cascade laser"
query_frequency="lasing frequency of the quantum cascade laser"
query_hs="heterostructure of the quantum cascade laser"
query_design_type="design type of the quantum cascade laser"

In [14]:
#checking for the query keywords
def check_text(row, column_name):
    """Choose the retrieval query matching the property named in a row's instruction.

    Args:
        row: A DataFrame row (or any mapping) whose `column_name` entry is a string.
        column_name: Name of the column holding the instruction text.

    Returns:
        One of the module-level query strings. Keywords are matched
        case-insensitively in a fixed order; rows matching none of them get
        `query_design_type`.
    """
    text = row[column_name].lower()
    if "working temperature" in text:
        return query_temperature
    if "optical power" in text:
        return query_power
    if "lasing frequency" in text:
        return query_frequency
    # Match the property keyword itself: the variable name "query_hs" never occurs in
    # instruction text, so testing for it would send every heterostructure row to the
    # design-type fallback.
    if "heterostructure" in text:
        return query_hs
    return query_design_type

In [15]:
#processing the data
def process_csv(input_filepath, output_filepath, column_name):
    """Add a 'queries' column to a CSV file and save the result as a new CSV.

    Args:
        input_filepath: Path of the CSV file to read.
        output_filepath: Path the augmented CSV is written to (without the index).
        column_name: Column whose text `check_text` inspects for each row.

    Returns:
        None. If `column_name` is missing, a message is printed and nothing is written.
    """
    frame = pd.read_csv(input_filepath)

    # Guard clause: without the source column there is nothing to derive queries from.
    if column_name not in frame.columns:
        print(f"Column '{column_name}' does not exist in the CSV file.")
        return

    # Row-wise: each row gets the query chosen by check_text.
    frame['queries'] = frame.apply(lambda row: check_text(row, column_name), axis=1)

    frame.to_csv(output_filepath, index=False)
    print(f"Processed file saved as '{output_filepath}'.")

In [16]:
#processing the test data
# Tags every test row with a retrieval query derived from its 'Instruction' column and
# writes the result to 'processed_test_data.csv' (a relative path, so it lands in the
# current working directory).
input_csv = test_data_path
output_csv = 'processed_test_data.csv'
column_to_check = 'Instruction'
process_csv(input_csv, output_csv, column_to_check)

Processed file saved as 'processed_test_data.csv'.


In [17]:
#loading the processed test data
test_data_df=pd.read_csv('processed_test_data.csv')

In [18]:
#keep only the columns needed for prompting, in a fixed order, under shorter names
test_data_df = (
    test_data_df[['queries', 'Instruction', 'input_text']]
    .rename(columns={'Instruction': 'instructions', 'input_text': 'texts'})
)

In [19]:
#verifying the ordered data
#test_data_df

In [20]:
#Build the list of prompts that will be passed to the model, one per test row.
#Each prompt combines the retrieved context (example documents), the row's instruction
#and the row's input text.
#The query column only selects which context to fetch, so the retrieved context is
#cached per distinct query instead of hitting the vector store again for every row.
#NOTE(review): the context is interpolated as a Python list, so its list repr (brackets,
#quotes, escaped newlines) ends up in the prompt text.
prompts=[]
context_by_query={}
for query,instruction,input_text in zip(test_data_df['queries'],test_data_df['instructions'],test_data_df['texts']):
    if query not in context_by_query:
        # retrieve_vector_db returns one list of documents per query; take the first.
        context_by_query[query]=retrieve_vector_db(query)[0]
    context=context_by_query[query]
    prompt = f'''
[INST]
Problem Definition: Extraction of quantum cascade laser properties from text entails extracting the properties from a given text description. This should be done without providing any other additional information or explanations.

Example Sentences containing instructions for extracting properties, the input text and the corresponding extracted properties: {context}

Instruction: {instruction}

Input Text: {input_text}

[/INST]
'''
    prompts.append(prompt)

In [21]:
#verifying the list of prompts generated for each row in the dataset
#display the prompt
#prompts

In [22]:
#verify the length
print(len(prompts))

130


In [23]:
#GPT
import openai
import os
from IPython.display import Markdown
# Read the key from the OPENAI_API_KEY environment variable so it never has to be pasted
# into the notebook. If the variable is unset, the original placeholder is kept.
api_key = os.environ.get("OPENAI_API_KEY", "OPENAI_API_KEY")

In [24]:
import openai

# Set your API key securely
openai.api_key = api_key

def chatWithGPT4(user_text, print_output=False, model="gpt-4-turbo", max_tokens=3000,
                 system_prompt="You are a helpful assistant."):
    """Send one user message to an OpenAI chat model and return the reply text.

    Args:
        user_text: Content of the user message (here: a full extraction prompt).
        print_output: When True, also print the reply.
        model: Chat model name passed to the API.
        max_tokens: Upper bound on generated tokens passed to the API.
        system_prompt: Content of the system message.

    Returns:
        The content of the first choice, "No response generated." if the response
        has no choices, or None if the call raised an exception.
    """
    try:
        # NOTE(review): `openai.ChatCompletion` is the pre-1.0 interface of the openai
        # package - confirm the installed version still provides it.
        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "system", "content": system_prompt},
                      {"role": "user", "content": user_text}],
            max_tokens=max_tokens
        )
        choices = response['choices']
        text_output = choices[0]['message']['content'] if choices else "No response generated."
        if print_output:
            print(text_output)
        return text_output
    except Exception as e:
        # Best-effort on purpose: a failed call is reported and yields None, so a long
        # batch of prompts keeps going instead of aborting.
        print(f"An error occurred: {str(e)}")
        return None

In [25]:
#checking a sample prompt at a particular index in the list of prompts
prompts[0]

"\n[INST]\nProblem Definition: Extraction of quantum cascade laser properties from text entails extracting the properties from a given text description. This should be done without providing any other additional information or explanations.\n\nExample Sentences containing instructions for extracting properties, the input text and the corresponding extracted properties: ['From the following sentence, please extract the lasing frequency of the Quantum Cascade Semiconductor laser device. Print none if the value does not exist in the input text. Input Text: We noticed a shift towards higher frequencies from the middle to the outskirts of the wafer in our observation of three GaAs/AlAs heterostructure-based lasers with target frequencies of 3.36 and 3.92 THz.\\t\\t\\n\\nExtracted Properties:\\n\\n3.36 and 3.92 THz.', 'From the following sentence, please extract the lasing frequency of the Quantum Cascade Semiconductor laser device. Print none if the value does not exist in the input text. I

In [26]:
#getting the model responses in a list
# One sequential call per prompt, appended in prompt order, so responses[i] is the answer
# to prompts[i]. chatWithGPT4 returns None when a call fails, so the list may contain
# None entries. Appending inside the loop keeps the responses collected so far if the
# run is interrupted.
responses=[]
for prompt in prompts:
    result=chatWithGPT4(prompt)
    responses.append(result)

In [27]:
#displaying the responses
responses

['Extracted Properties:\n\n3.3 THz',
 'Extracted Properties:\n\n10 mW',
 'Extracted Properties:\n\n2.07 THz, 2.02 THz',
 'Extracted Properties:\n\n190 K, 125 K',
 'Extracted Properties:\n\n107 K',
 'Extracted Properties:\n\nGaAs/Al0.23Ga0.77As',
 'Extracted Properties:\n\nIn0.6Ga0.4As/InAs',
 'Extracted Properties:\n\n3.0 THz',
 'Extracted Properties:\n\n185 K',
 'Extracted Properties:\n\n325 K',
 'Extracted Properties:\n\nresonant-phonon',
 'Extracted Properties:\n\n195 K',
 'Extracted Properties:\n\n24 mW',
 'Extracted Properties:\n\nnone',
 'Extracted Properties:\n\n150 K, 110 K',
 'Extracted Properties:\n\nnone',
 'Extracted Properties:\n\n113 K in pulsed mode, 89 K in continuous wave (cw) mode',
 'Extracted Properties:\n\n70K',
 'Extracted Properties:\n\nLO phonon',
 'resonant-phonon',
 'Extracted Properties:\n\n182 K',
 'none',
 'GaAs/Al0.15Ga0.85As',
 'LO phonon',
 'Extracted Properties:\n\nLO-phonon-based depopulation',
 'Extracted Properties:\n\n3.5 THz',
 'Extracted Propertie

In [28]:
#writing the model responses to a csv file
import csv

# Name of the csv file to save the list
file_name = "qcl-RAG-GPT4-Turbo-responses.csv"

# Writing the list to a csv file with one column, one response per row.
# The encoding is set explicitly: without it open() uses the platform's locale encoding,
# which can fail on non-ASCII characters in a model response.
with open(file_name, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows([item] for item in responses)

print("List saved to", file_name)

List saved to qcl-RAG-GPT4-Turbo-responses.csv
