In [68]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
import json
import pandas as pd
import time
from extracted_questions import Physics, Chemistry, Maths 
import os


In [69]:
# Allowed classification categories, one list per subject. The entries are
# substituted into the prompt and the model is told to use these exact
# spellings as JSON keys, so the strings must not be altered.
physics_topics = [
    "Vectors",
    "Types of Forces",
    "Newton’s Laws of Motion",
    "Friction",
    "Basics of calculus",
    "1 D Kinematics",
    "2 D Kinematics",
    "Circular Motion",
    "Work, Power and Energy",
    "Center of Mass",
    "Conservation of Momentum",
    "Rotation",
    "Fluid Statics and Dynamics",
    "Properties of Solids",
    "Surface Tension and Viscosity",
    "Simple Harmonic Motion",
    "Combination of SHM",
    "Damping and Resonance",
    "Waves on a string",
    "Sound Waves",
    "Light Waves and Interference",
    "Ray Optics",
    "Thermal Expansion and Calorimetry",
    "Kinetic Theory of Gases",
    "1st Law of Thermodynamics & specific heats",
    "Heat transfer",
    "Electrostatics",
    "Dielectrics and Capacitors",
    "Current Electricity",
    "Magnetism",
    "Electromagnetic Induction",
    "Alternating Current",
    "Modern Physics",
    "Gravitation",
]

maths_topics = [
    "Number System",
    "Factors and Indices",
    "Sets",
    "Progressions and Series",
    "Quadratic Equations & Theory of Equations",
    "Logarithms",
    "Trigonometric Identities and Equations",
    "Determinants and Matrices",
    "Straight Lines and Pair of Lines",
    "Circles",
    "Permutations and Combinations",
    "Binomial Theorem",
    "Complex Numbers",
    "Solution of Triangles",
    "Functions",
    "Inverse Trigonometry",
    "Limits and Continuity",
    "Derivatives and Method of Differentiation",
    "Application of Derivatives",
    "Indefinite Integration",
    "Definite Integration",
    "Area Under the Curves",
    "Differential Equations",
    "Vectors and 3D Geometry",
    "Probability",
    "Parabola",
    "Ellipse",
    "Hyperbola",
]

chemistry_topics = [
    "Periodic Table",
    "Chemical Bonding",
    "Atomic Structure",
    "Mole Concept",
    "Gaseous State",
    "Thermodynamics",
    "Thermochemistry",
    "Chemical Equilibrium",
    "Ionic Equilibrium",
    "Redox Reactions",
    "s-block Elements",
    "Hydrogen and Related Compounds",
    "Nomenclature",
    "Solid State",
    "Boron Family",
    "Structural and Geometrical Isomerism",
    "Liquid Solutions and Colligative Properties",
    "Chemical Kinetics",
    "Surface Chemistry",
    "Carbon Family",
    "Metallurgy",
    "d-block elements",
    "f-block elements",
    "General Organic Chemistry",
    "Electrochemistry",
    "Reaction Mechanisms",
    "Tautomerism",
    "Optical Isomerism",
    "Alkanes",
    "Alkenes",
    "Alkynes",
    "Arenes",
    "Alkyl and Aryl Halides",
    "Alcohols, Phenols and Ethers",
    "Aldehydes and Ketones",
    "Nitrogen Family",
    "Amines",
    "Carboxylic Acid and Derivatives",
    "Qualitative Analysis",
    "Coordination compounds",
    "Biomolecules and Polymers",
    "16, 17 and 18 family",
    "Practical Organic Chemistry",
]
# Bind the per-subject question collections imported from `extracted_questions`
# to the lower-case names the rest of the notebook uses.
maths_questions, chemistry_questions, physics_questions = Maths, Chemistry, Physics




In [70]:

# accurate_prompt = strict_prompt = ChatPromptTemplate.from_template("""
# You are an expert classification assistant.

# Task:
# - Read the question and the given answer carefully.
# - Use both the question and the provided answer to identify which topics are involved.
# - Do not re-derive or calculate the answer; only use it to understand the concept being tested.
# - Output a single valid JSON object with keys as category names and values as floats.
# - The sum of all numeric values must be exactly 100.0.
# - If the question clearly belongs to one category, assign 100.0.
# - If it belongs to multiple categories, split 100.0 according to the weightage of the topics.
# - Do NOT output anything other than the JSON object.
# - Only give non-zero results in the JSON object.
# - If you are unsure about the exact topic, assign the closest matching topic from the list.

# Categories:
# {topics}

# QUESTION:
# {questions}

# ANSWER:
# {answer}

# OUTPUT (JSON only):
# """)


# Classification prompt used for every model/subject run. It has two template
# variables:
#   {topics}    - the allowed category names for the subject being classified
#   {questions} - the text of the single question to classify
# Braces in the example JSON are doubled ({{ }}) so they are emitted literally
# instead of being read as template variables.
# `strict_prompt` is kept as a second name for the same template object.
#
# Rule 6 used to say "classify based on the answer ... if you cannot solve skip
# the question". This template has no answer variable, and skipping contradicts
# rule 1 (always output exactly one JSON object), so the rule now asks for the
# closest matching category instead.
accurate_prompt = strict_prompt = ChatPromptTemplate.from_template("""
SYSTEM:
You are a classification assistant. You MUST output ONLY a single valid JSON object and NOTHING else.
Use only the exact category names listed below (exact spelling). Do not invent or alter category names.

Categories:
{topics}

RULES (must follow exactly):
1) Output only ONE valid JSON object. No explanations, no extra text, no backticks.
2) Use ONLY keys from the categories above (exact spellings).
3) Numeric values must be integers and sum exactly to 100 (e.g., 100, 50, 25).
4) If a question clearly belongs to a single category, assign 100 to that category.
5) If a question involves multiple topics, divide the percentages according to the **relevance or weightage** of each topic. Ensure the sum is exactly 100.
6) classify based on the concepts needed to solve the question, not just its wording. No separate answer is provided. Never skip a question: if you are unsure, assign the closest matching category from the list.
7) properly read and understand the question and answer before classifying.
8) only add non zero results to the JSON object.

EXAMPLES OF VALID OUTPUT:

# Single-topic question:
{{"Alkenes": 100}}

# Multi-topic question:
{{"Reaction Mechanisms": 70, "Optical Isomerism": 30}}

# Another multi-topic example:
{{"Coordination compounds": 60, "Chemical Bonding": 40}}

QUESTION: {questions}



OUTPUT (ONLY the valid JSON object):
""")


In [71]:
# Models used for this run. Each is an Ollama chat model created with
# temperature=0; `model6` and `model15` stay bound to their own instances.
models = [
    ChatOllama(model=tag, temperature=0)
    for tag in ("gpt-oss:20b", "phi4:latest")
]  # 10gb
model6, model15 = models

# Other candidates, kept commented out as a record of what was considered.
# The trailing notes are the original author's (presumably model sizes and
# run status).
# model1 = ChatOllama(model="llama3:8b", temperature=0)
# model2 = ChatOllama(model="deepseek-r1:8b", temperature=0,stop=["}"]) not working
# model3 = ChatOllama(model="qwen3:latest",temperature=0)
# model4 = ChatOllama(model="mistral:7b",temperature=0)
# model5 = ChatOllama(model="llama2:13b", temperature=0)
# model7 = ChatOllama(model="gemma:latest",temperature=0)
# model8 = ChatOllama(model="qwen:32b", temperature=0) #18gb /
# model9 = ChatOllama(model="llama-pro:8b-instruct-fp16", temperature=0) #16gb not done
# model10 = ChatOllama(model="llama3.1:70b", temperature=0) #42gb
# model11 = ChatOllama(model="gemma2:27b", temperature=0) #15gb
# model12 = ChatOllama(model="qwen3:14b", temperature=0)#10gb not done
# model13 = ChatOllama(model="qwen2.5-coder:32b", temperature=0) #19gb
# model14 = ChatOllama(model="llama2:70b", temperature=0) #40gb
# model16 = ChatOllama(model="magistral:24b", temperature=0)
# model11,model9,model15

# Bare expression so the cell displays the active model list.
models

[ChatOllama(model='gpt-oss:20b', temperature=0.0),
 ChatOllama(model='phi4:latest', temperature=0.0)]

In [72]:
def _extract_json_object(text):
    """Return the first JSON object embedded in ``text``.

    Every ``{`` in ``text`` is tried in order as the start of a JSON object,
    and the first one that parses is returned. Text before or after the
    object (explanations, code fences, stray braces) is ignored.

    Raises:
        ValueError: if no ``{`` in ``text`` starts a valid JSON object.
    """
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not valid JSON from this brace; try the next one.
            start = text.find('{', start + 1)
            continue
        return parsed
    raise ValueError("no JSON object found in model response")


def get_topic_vector(question,topic_string,model):
    """Get the topic vector for a single question.

    Pipes ``accurate_prompt`` into ``model``, invokes the chain with the
    question and the allowed topics, and parses the JSON object found in the
    reply.

    Args:
        question: Text substituted for ``{questions}`` in the prompt.
        topic_string: Value substituted for ``{topics}`` in the prompt. The
            notebook passes a list of category names, not a string.
        model: Object the prompt is piped into with ``|``; the notebook
            passes ``ChatOllama`` instances.

    Returns:
        dict: The parsed JSON object from the reply. On any failure (chain
        error, no parsable JSON object in the reply) the error is printed and
        the sentinel ``{"Error": 100.0}`` is returned instead of raising, so
        a long batch run is not interrupted by a single bad reply.
    """
    try:
        chain = accurate_prompt | model
        response = chain.invoke({"questions": question ,"topics":topic_string})
        # Slicing from the first '{' to the last '}' breaks as soon as the
        # reply has another brace anywhere around the JSON, so scan for the
        # first brace that starts a parsable object.
        return _extract_json_object(response.content)
    except Exception as e:
        print(f"Error: {e}")
        return {"Error": 100.0}
    

In [73]:

# all_results = []
# for i, q in enumerate(maths_questions):
#     print(f"Processing question {i+1}...")
#     if i%10 == 0 and i != 0:
#         time.sleep(2)  # Pause for 2 seconds every 10 questions to avoid rate limits
#     if i == 10:
#         break
#     topic_data = get_topic_vector(q,)
#     all_results.append({
#         "question_id": i+1,
#         "question": q,
#         "topic_vector": topic_data
#     })

In [74]:
# all_results_df_maths = pd.DataFrame(all_results)
# all_results_df_maths.head(5)
# # all_results_df_maths.to_excel("mistral:7b/maths_ollama.xlsx", index=False)


In [75]:
# all_results

In [76]:
# Bare expression: the cell displays the physics question list.
physics_questions

['Q.1 The frequency of revolution of a charged particle in a uniform B-field does not depend on: (A) Magnetic field (B) Mass (C) Charge (D) Velocity',
 'Q.2 A straight conductor of length 0.2 m moves at 3 m/s perpendicular to B = 0.5 T. The induced emf is: (A) 0.20 V (B) 0.10 V (C) 0.05 V (D) 0.30 V',
 'Q.3 A solenoid has 1000 turns per meter carrying 3 A. The magnetic field inside is: (A) 3.77 x 10⁻³ T (B) 1.26 x 10⁻³ T (C) 2 x 10⁻³ T (D) 6 x 10⁻³ T',
 'Q.4 Two long parallel wires are 0.20 m apart and carry currents I₁ = 3 A and I₂ = 5 A in the same direction. The force per meter on each wire is: (A) 1.5 x 10⁻⁵ N/m (B) 3.0 x 10⁻⁵ N/m (C) 1.5 x 10⁻⁶ N/m (D) 3.0 x 10⁻⁶ N/m',
 'Q.5 A proton (q = 1.6 x 10⁻¹⁹ C, m = 1.67 x 10⁻²⁷ kg) moves at 1 x 10⁶ m/s perpendicular to a magnetic field of 0.2 T. Its acceleration in the field is: (A) 1.92 x 10¹³ m/s² (B) 3.2 x 10¹³ m/s² (C) 9.6 x 10¹² m/s² (D) Zero',
 'Q.6 A loop of radius 0.1 m in a field B = 0.5 T is pulled out in 0.02 s. The induced emf

In [77]:
# Parallel lists: position k of `subject` (questions) goes with position k of
# `topic` (allowed categories). The order is physics, chemistry, maths.
subject = [
    physics_questions,
    chemistry_questions,
    maths_questions,
]
topic = [
    physics_topics,
    chemistry_topics,
    maths_topics,
]
# Bare expression so the cell displays the topic lists.
topic


[['Vectors',
  'Types of Forces',
  'Newton’s Laws of Motion',
  'Friction',
  'Basics of calculus',
  '1 D Kinematics',
  '2 D Kinematics',
  'Circular Motion',
  'Work, Power and Energy',
  'Center of Mass',
  'Conservation of Momentum',
  'Rotation',
  'Fluid Statics and Dynamics',
  'Properties of Solids',
  'Surface Tension and Viscosity',
  'Simple Harmonic Motion',
  'Combination of SHM',
  'Damping and Resonance',
  'Waves on a string',
  'Sound Waves',
  'Light Waves and Interference',
  'Ray Optics',
  'Thermal Expansion and Calorimetry',
  'Kinetic Theory of Gases',
  '1st Law of Thermodynamics & specific heats',
  'Heat transfer',
  'Electrostatics',
  'Dielectrics and Capacitors',
  'Current Electricity',
  'Magnetism',
  'Electromagnetic Induction',
  'Alternating Current',
  'Modern Physics',
  'Gravitation'],
 ['Periodic Table',
  'Chemical Bonding',
  'Atomic Structure',
  'Mole Concept',
  'Gaseous State',
  'Thermodynamics',
  'Thermochemistry',
  'Chemical Equilibri

In [78]:
# Classify every question of every subject with every configured model.
# Results are collected as result[model_name][subject_name] -> list of
# per-question records, and each (model, subject) table is also written to
# "<model_name>/<subject_name>.xlsx".
subject_names = ["physics", "chemistry", "maths"]
result = {}

for model in models:
    full_tag = getattr(model, 'model', str(model))
    model_name = full_tag.split(":")[0]
    if model_name in result:
        # Two tags of the same family (e.g. "llama2:13b" and "llama2:70b")
        # share the text before ':'. Reusing that name would overwrite the
        # earlier model's directory and results, so use the full tag here.
        model_name = full_tag.replace(":", "_")
    os.makedirs(model_name, exist_ok=True)
    print(f"Using model: {model_name}")

    result[model_name] = {}

    for i, subject_set in enumerate(subject):
        # `subject`, `topic` and `subject_names` are indexed in parallel.
        subject_name = subject_names[i]
        subject_topics = topic[i]
        all_results = []
        print(f"Processing subject: {subject_name}...")

        for j, q in enumerate(subject_set):
            print(f"Processing question {j+1}...")
            if j % 10 == 0 and j != 0:
                time.sleep(2)  # Pause 2 seconds every 10 questions to avoid rate limits

            topic_data = get_topic_vector(q, subject_topics, model)
            all_results.append({
                "question_id": j+1 ,
                "question": q,
                "topic_vector": topic_data
            })
            print(all_results[-1])

        all_results_df = pd.DataFrame(all_results)
        all_results_df.to_excel(f"{model_name}/{subject_name}.xlsx", index=False)
        result[model_name][subject_name] = all_results
        print(f"Completed subject: {subject_name}")

        # get_topic_vector reports failures as an "Error" key rather than
        # raising, so show how many questions in this subject failed.
        error_count = sum(1 for record in all_results if "Error" in record["topic_vector"])
        if error_count:
            print(f"Questions that failed to classify: {error_count} of {len(all_results)}")
        # The whole `result` dict is deliberately not printed here: it grows
        # with every subject and each record was already printed above.


Using model: gpt-oss
Processing subject: physics...
Processing question 1...
{'question_id': 1, 'question': 'Q.1 The frequency of revolution of a charged particle in a uniform B-field does not depend on: (A) Magnetic field (B) Mass (C) Charge (D) Velocity', 'topic_vector': {'Magnetism': 100}}
Processing question 2...
{'question_id': 2, 'question': 'Q.2 A straight conductor of length 0.2 m moves at 3 m/s perpendicular to B = 0.5 T. The induced emf is: (A) 0.20 V (B) 0.10 V (C) 0.05 V (D) 0.30 V', 'topic_vector': {'Electromagnetic Induction': 100}}
Processing question 3...
{'question_id': 3, 'question': 'Q.3 A solenoid has 1000 turns per meter carrying 3 A. The magnetic field inside is: (A) 3.77 x 10⁻³ T (B) 1.26 x 10⁻³ T (C) 2 x 10⁻³ T (D) 6 x 10⁻³ T', 'topic_vector': {'Magnetism': 100}}
Processing question 4...
{'question_id': 4, 'question': 'Q.4 Two long parallel wires are 0.20 m apart and carry currents I₁ = 3 A and I₂ = 5 A in the same direction. The force per meter on each wire is:

In [82]:
# Dump the full nested results: result[model_name][subject_name] is the list
# of per-question records built by the classification loop.
print(result)

{'gpt-oss': {'physics': [{'question_id': 1, 'question': 'Q.1 The frequency of revolution of a charged particle in a uniform B-field does not depend on: (A) Magnetic field (B) Mass (C) Charge (D) Velocity', 'topic_vector': {'Magnetism': 100}}, {'question_id': 2, 'question': 'Q.2 A straight conductor of length 0.2 m moves at 3 m/s perpendicular to B = 0.5 T. The induced emf is: (A) 0.20 V (B) 0.10 V (C) 0.05 V (D) 0.30 V', 'topic_vector': {'Electromagnetic Induction': 100}}, {'question_id': 3, 'question': 'Q.3 A solenoid has 1000 turns per meter carrying 3 A. The magnetic field inside is: (A) 3.77 x 10⁻³ T (B) 1.26 x 10⁻³ T (C) 2 x 10⁻³ T (D) 6 x 10⁻³ T', 'topic_vector': {'Magnetism': 100}}, {'question_id': 4, 'question': 'Q.4 Two long parallel wires are 0.20 m apart and carry currents I₁ = 3 A and I₂ = 5 A in the same direction. The force per meter on each wire is: (A) 1.5 x 10⁻⁵ N/m (B) 3.0 x 10⁻⁵ N/m (C) 1.5 x 10⁻⁶ N/m (D) 3.0 x 10⁻⁶ N/m', 'topic_vector': {'Magnetism': 100}}, {'questi

In [80]:
# def get_topic_vector(question,topic_string = maths_topics):
#     """Get the topic vector for a single question"""
#     try:
#         chain = accurate_prompt | model
#         response = chain.invoke({"maths_questions": question ,"maths_topics":topic_string})
#         response_text = response.content.strip()
#         json_start = response_text.find('{')
#         json_end = response_text.rfind('}') + 1
#         json_str = response_text[json_start:json_end]
        
#         return json.loads(json_str)
#     except Exception as e:
#         print(f"Error: {e}")
#         return {"Error": 100.0}
    

In [81]:

# all_results = []
# for i, q in enumerate(maths_questions):
#     print(f"Processing question {i+1}...")
#     if i%10 == 0 and i != 0:
#         time.sleep(2)  # Pause for 2 seconds every 10 questions to avoid rate limits
#     if i == 10:
#         break
#     topic_data = get_topic_vector(q)
#     all_results.append({
#         "question_id": i+1,
#         "question": q,
#         "topic_vector": topic_data
#     })