## **ChatGPT as LLM model**
### TESTING THE PIPELINE WITH A DATASET OF QUESTIONS. PIPELINE HAS SOME CHANGES w.r.t. the format of result

In [1]:
# Install important libraries
!pip install langchain
!pip install openai
!pip install pylatexenc
!pip install langchain_experimental
!pip install sentence_transformers
!pip install chromadb
!pip install wolframalpha
from google.colab import userdata
OpenAIkey = userdata.get('OpenAIkey')
WolframAlphaKey = userdata.get('WolframAlphaKey')

In [2]:
import langchain
import openai

# from langchain_google_genai import ChatGoogleGenerativeAI
# from sec import GeminiKey
# llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GeminiKey)

key = OpenAIkey

from langchain.chat_models import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
llm = AzureChatOpenAI(azure_endpoint= 'https://amazebotopenai.openai.azure.com/', api_key = key, deployment_name= 'Chat', api_version='2023-05-15')

In [3]:
query_template = '''Solve this question:
{Question}\n and return the correct option number\n
(1) {Option_1}\n
(2) {Option_2}\n
(3) {Option_3}\n
(4) {Option_4}\n
Output the correct option number (1, 2, 3, or 4). I want the output to not exceed length 1.
Don't ask for any user-input.
A quick tip: Use wolfram-alpha tool provided to solve math questions. This tool is good for math.
Use retrieval_tool to extract context from knowledge base.
Let's think step-by-step to reach the final answer.'''

query_template = PromptTemplate(
    input_variables=["Question", "Option_1", "Option_2", "Option_3", "Option_4"],
    template=query_template
)

chain = LLMChain(llm=llm, prompt=query_template, output_key="correct_option")

In [None]:
from langchain.agents import AgentType, initialize_agent, load_tools, Tool
from langchain_experimental.utilities import PythonREPL
from langchain_experimental.pal_chain import PALChain
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

from google.colab import drive
drive.mount('/content/drive')

import os
os.environ["WOLFRAM_ALPHA_APPID"] = WolframAlphaKey

python_repl = PythonREPL()
pal_chain = PALChain.from_math_prompt(llm=llm)
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

persist_directory = '/content/drive/My Drive/Colab Notebooks/Langchain/docs/chroma/'
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
)

python_repl = Tool(
    name="python_repl",
    description="A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.",
    func=python_repl.run, handle_parsing_errors=True
)

pal_math = Tool(
    name="pal_math",
    description="A LLM Chain which helps to solve word problems.",
    func=pal_chain.run, handle_parsing_errors=True
)

retrieval_tool = Tool(
    name='RetrievalQA',
    description = 'This tool can be used to get context from the knowledge base of textbooks.',
    func=qa_chain.run, handle_parsing_errors=True
)

tools = load_tools(['llm-math', 'wolfram-alpha'], llm=llm)
tools.append(python_repl)
tools.append(pal_math)
tools.append(retrieval_tool)

agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, handle_parsing_errors=True)
# agent.run("Given equations for variable x, y, and z: x = y + 5, y = z - 3,  z = x * y. Solve for x, y, z.")
# agent({question:"Use 'retrieval_tool' to answer the following - The measured temperature on the Fahrenheit scale is 200 °F. What would the reading be on the Celsius scale?"})
# agent.run("Use 'retrieval_tool' to answer the following - The measured temperature on the Fahrenheit scale is 200 °F. What would the reading be on the Celsius scale?")

In [5]:
import pandas as pd
from pylatexenc.latex2text import LatexNodes2Text
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Langchain/2013_Paper1 - MCQS.csv')

def latex_to_text(i):
    data.loc[i, 'Question'] = LatexNodes2Text().latex_to_text(data.loc[i, 'Question'])
    data.loc[i, 'Option1'] = LatexNodes2Text().latex_to_text(data.loc[i, 'Option1'])
    data.loc[i, 'Option2'] = LatexNodes2Text().latex_to_text(data.loc[i, 'Option2'])
    data.loc[i, 'Option3'] = LatexNodes2Text().latex_to_text(data.loc[i, 'Option3'])
    data.loc[i, 'Option4'] = LatexNodes2Text().latex_to_text(data.loc[i, 'Option4'])

def predict(i):
    try:
        query_template.format(Option_1=data.loc[i, 'Option1'], Option_2=data.loc[i, 'Option2'], Option_3=data.loc[i, 'Option3'], Option_4=data.loc[i, 'Option4'], Question=data.loc[i, 'Question'])
        result = agent.invoke(query_template)
    except:
        return 'Error'
    else:
        return result['output']


In [6]:
# Implementing self-consistency with CoT
from statistics import mode

def evaluate(pred, i):
    if pred == 1:
        if data['Answer1'][i] == 1:
            return 1
    elif pred == 2:
        if data['Answer2'][i] == 1:
            return 1
    elif pred == 3:
        if data['Answer3'][i] == 1:
            return 1
    elif pred == 4:
        if data['Answer4'][i] == 1:
            return 1
    return 0

def solve(i):
    pred = predict(i)
    while(pred == 'Error' or len(pred) > 1):
        pred = predict(i)
        # Type conversion error may occur
        try:
          val = int(pred)
        except:
          pred = 'Error'
    pred = int(pred)
    return pred

def solve_section(start_row, last_row):
    # start_row denotes the row number where the questions for that particular section starts.
    # num_of_questions denote the number of questions in the section.
    score = 0
    i = start_row
    while(i <= last_row):
        latex_to_text(i)
        responses = []
        for iter in range(4):
            responses.append(solve(i))
        final_pred = mode(responses)
        score += evaluate(final_pred, i)
        print(score)
        i += 1
    return score

# In the output below we can see that there are many time we see some Errors. Such errors are ignored and the llm is called again to get the actual answer.
# The output as '3' or '2' is the option number selected. Output as 'Option 2' or 'Option_3' or '(3) Option_3' is ignored.
# We see the responses we get are [3,3,3,2]. Mode=3 which is the final answer.
# Self-Consistency helps to select the correct option.

In [None]:
def find_end_section(section, start_row, last_row_year):
    # len(data)-1 is the total number of questions
    while(start_row < last_row_year and data['Subject'][start_row] == section):
        start_row+=1
    return start_row-1

def find_end_year(year, start_row):
    while(start_row < len(data) and data['Year'][start_row] == year):
        start_row+=1
    return start_row-1

result= []

# Solving an year
start_row_year = 0
while(start_row_year < len(data)):
    year = data['Year'][start_row_year]
    last_row_year = find_end_year(year, start_row_year)
    # print(start_row_year, last_row_year)
    # Solving a section
    start_row_section = 0
    while(start_row_section < last_row_year):
        section=data['Subject'][start_row_section]
        last_row_section = find_end_section(section, start_row_section, last_row_year)
        print(start_row_section, last_row_section)
        score = solve_section(start_row_section, last_row_section)

        if(section=='Physics'):
            physics_score = score
        elif(section == 'Maths'):
            math_score = score
        elif(section == 'Chemistry'):
            chemistry_score = score
        print(f'Completed {section} section of year {year}')
        start_row_section = last_row_section+1 # Moving to next section

    total_num_of_questions = last_row_year-start_row_year+1
    # Adding result for an year
    result.append (
        {
        'Year': year,
        'Total Questions': total_num_of_questions,
        'Correct Questions': physics_score + chemistry_score + math_score,
        'Incorrect Questions': total_num_of_questions-score,
        'Physics': physics_score,
        'Chemistry': chemistry_score,
        'Math': math_score,
        'Marks Obtained': score*4
        }
    )

    result = pd.DataFrame(result)
    print(result)
    result.to_csv('Result.csv', index=False)
    start_row_year = last_row_year+1 # Moving to next Year questions

0 14






Enter the correct option number: 
Enter the correct option number: 
Enter the correct option number: 
Enter the correct option number: 
Enter the correct option number: 
Enter the correct option number: 
Enter the correct option number: 
Enter the correct option number: 
Enter the correct option number: 
Enter the correct option number: 
Enter the correct option number: 
Enter the correct option number: 
0
0

0
0
0
0
0
0
0
1
2
2
3
3
3
Completed Maths section of year 2013
15 29
0
0
What specific math problem would you like help with?
Please provide a specific math problem to be solved: 
0
0
0
0
0
0
0
1
2
3
3
4
4
Completed Physics section of year 2013
30 43
0
0
1
1
1
1
2
2
2
2
3
4
4
5
Completed Chemistry section of year 2013
   Year  Total Questions  Correct Questions  Incorrect Questions  Physics  \
0  2013               45                  5                   40        4   

   Chemistry  Math  Marks Obtained  
0          5     3              20  


### Correct Questions should be 12
### Incorrect Questions should be 33
### Code had a bug, resolved that now.
### Accuracy = 12/45 = 26.67%

## **Results for SC@8**

In [None]:
def solve_section(start_row, last_row):
    # start_row denotes the row number where the questions for that particular section starts.
    # num_of_questions denote the number of questions in the section.
    score = 0
    i = start_row
    while(i <= last_row):
        latex_to_text(i)
        responses = []
        for iter in range(8):
            responses.append(solve(i))
        final_pred = mode(responses)
        score += evaluate(final_pred, i)
        print(score)
        i += 1
    return score

def find_end_section(section, start_row, last_row_year):
    # len(data)-1 is the total number of questions
    while(start_row < last_row_year and data['Subject'][start_row] == section):
        start_row+=1
    return start_row-1

def find_end_year(year, start_row):
    while(start_row < len(data) and data['Year'][start_row] == year):
        start_row+=1
    return start_row-1


result= []

# Solving an year
start_row_year = 0
while(start_row_year < len(data)):
    year = data['Year'][start_row_year]
    last_row_year = find_end_year(year, start_row_year)
    # print(start_row_year, last_row_year)
    # Solving a section
    start_row_section = 0
    while(start_row_section < last_row_year):
        section=data['Subject'][start_row_section]
        last_row_section = find_end_section(section, start_row_section, last_row_year)
        print(start_row_section, last_row_section)
        score = solve_section(start_row_section, last_row_section)

        if(section=='Physics'):
            physics_score = score
        elif(section == 'Maths'):
            math_score = score
        elif(section == 'Chemistry'):
            chemistry_score = score
        print(f'Completed {section} section of year {year}')
        start_row_section = last_row_section+1 # Moving to next section

    total_num_of_questions = last_row_year-start_row_year+1
    # Adding result for an year
    result.append (
        {
        'Year': year,
        'Total Questions': total_num_of_questions,
        'Correct Questions': physics_score + chemistry_score + math_score,
        'Incorrect Questions': total_num_of_questions-score,
        'Physics': physics_score,
        'Chemistry': chemistry_score,
        'Math': math_score,
        'Marks Obtained': score*4
        }
    )

    result = pd.DataFrame(result)
    print(result)
    result.to_csv('/content/drive/My Drive/Colab Notebooks/Langchain/Result_SC8.csv', index=False)
    start_row_year = last_row_year+1 # Moving to next Year questions

0 14




0
0
0
1




