In [1]:
#libraries
'''
!pip install transformers==4.31.0 
!pip install sentence-transformers==2.2.2 
!pip accelerate==0.21.0 \
!pip xformers==0.0.20 \
'''

'\n!pip install transformers==4.31.0 \n!pip install sentence-transformers==2.2.2 \n!pip accelerate==0.21.0 !pip xformers==0.0.20 '

In [4]:
#specific version to avoid error with vector store
#!pip install chromadb==0.5.3
#!pip install datasets

In [5]:
#update transformers
#!pip install -U transformers

In [6]:
#loading training data for providing context to the model
import pandas as pd
train_data_df=pd.read_csv("C:/Users/USER/Documents/Projects/Instruction Dataset/LLM Training Data/train_data_split.csv")
## Convert the DataFrame to a format compatible with Hugging Face Datasets
train_data_df['Instruction'] = train_data_df['Instruction']+ ". Input Text: " + train_data_df['input_text'] + "\n\nExtracted Properties:\n\n"+ train_data_df['Output'].astype(str)
train_dataset_dict = {
    'instruction': train_data_df['Instruction'].tolist()
}

In [7]:
#creating context document from the list
docs=train_dataset_dict['instruction']

In [8]:
#initializing the embedding model
from sentence_transformers import SentenceTransformer
import chromadb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
import torch
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', cache_folder = '/data/base_models')



In [9]:
#do the embedding
embeddings = embedding_model.encode(docs)
#verify the embeddings
embeddings.shape

(1040, 768)

In [10]:
#indexing-run this only once
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="q-rag_llama")

In [11]:
#addding the train data
collection.add(
    embeddings = embeddings,
    documents=docs,
    ids= [str(i) for i in range(len(docs))]
)

In [12]:
#a function for retrieval  
#retrieving
def retrieve_vector_db(query, n_results=3):
    results = collection.query(
    query_embeddings = embedding_model.encode(query).tolist(),
    n_results=n_results
    )
    return results['documents']

In [13]:
#loading the abstracts
import pandas as pd
abstracts_path="C:/Users/USER/Documents/Projects/KG/Data/abstracts/abstracts.csv"
abstracts_df=pd.read_csv(abstracts_path)

In [14]:
#pass the abstracts to a list
abstracts=abstracts_df['Abstract'].tolist()
#number of abstracts
print("Number of Abstracts:",len(abstracts))

Number of Abstracts: 42


In [15]:
#print list of abstracts
#abstracts

In [16]:
#setting up the query
query="From the following sentence, please extract the heterostructure of the Quantum Cascade Semiconductor laser device. Print none if the value does not exist in the input text,"

In [17]:
#generating a list of prompt for ecah abstract for power extraction
prompts=[]
for abstract in abstracts:
    results=retrieve_vector_db(query)
    context=results[0]
    prompt = f'''
[INST]
Problem Definition: Extraction of quantum cascade laser properties from text entails extracting properties from a given text description. This should be done without providing any other additional information or explanations.

Example Sentences containing instructions for extracting properties, the input text and the corresponding extracted properties: {context}

Instruction: {query}

Input Text: {abstract}

[/INST]
'''
    prompts.append(prompt)

In [18]:
print("number of prompts:", len(prompts))

number of prompts: 42


In [19]:
#verifying the list of prompts generated for each row in the dataset
#display the prompt
#prompts[0]

In [20]:
#GPT
import openai
import os
from IPython.display import Markdown
api_key="OPEN_API_KEY"

In [21]:
import openai

# Set your API key securely
openai.api_key = api_key

def chatWithGPT4(user_text, print_output=False):
    try:
        # Creating text completions using the updated `ChatCompletion` class
        response = openai.ChatCompletion.create(
            model="gpt-4-turbo",  # Easily switch between "gpt-3.5-turbo-0613", "gpt-4", "text-davinci-004", or others
            messages=[{"role": "system", "content": "You are a helpful assistant."},
                      {"role": "user", "content": user_text}],
            max_tokens=3000
        )
        text_output = response['choices'][0]['message']['content'] if response['choices'] else "No response generated."
        if print_output:
            print(text_output)
        return text_output
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

In [22]:
#getting the model responses in a list
heterostructure_properties=[]
for prompt in prompts:
    result=chatWithGPT4(prompt)
    heterostructure_properties.append(result)

In [23]:
#viweing the extracted properties
heterostructure_properties

['none',
 'GaAs/Al0.15Ga0.85As',
 'GaAs/Al0.15Ga0.85As',
 'none',
 'GaAs/Al0.25Ga0.75As',
 'GaAs/Al0.25Ga0.75As',
 'none',
 'none',
 'GaAs/AlAs',
 'GaAs/Al0.15Ga0.85As',
 'GaAs/Al0.17Ga0.83As',
 'GaAs/Al0.14Ga0.86As',
 'GaAs/AlGaAs',
 'In0.53Ga0.47As/GaAs0.51Sb0.49',
 'none',
 'none',
 'none',
 'InAs/AlAs0.16Sb0.84',
 'InGaAs/AlInGaAs',
 'GaAs/Al0.1Ga0.9As',
 'none',
 'GaAs/Al0.1Ga0.9As',
 'none',
 'InGaAs/GaAsSb',
 'Extracted Properties:\n\nnone',
 'none',
 'GaAs/AlAs',
 'GaAs/Al0.15Ga0.85As',
 'AlInGaAs',
 'In0.53Ga0.47As/GaAs0.51Sb0.49',
 'none',
 'none',
 'none',
 'GaAs/Al0.3Ga0.7As',
 'GaAs/Al0.15Ga0.85As',
 'none',
 'none',
 'none',
 'none',
 'Extracted Properties:\n\nnone',
 'GaN/AlGaN',
 'none']

In [24]:
#save the extracted properties in a csv file
import csv

# Name of the csv file to save the list
file_name = "heterostructure_properties.csv"

# Writing the list to a csv file with one column
with open(file_name, "w", newline="") as file:
    writer = csv.writer(file)
    for item in heterostructure_properties:
        writer.writerow([item])

print("List saved to", file_name)

List saved to heterostructure_properties.csv
