<a href="https://colab.research.google.com/github/20wiz/Common-Sense-Reasoning-ARC/blob/dev/Common_Sense_Reasoning_with_phi_1_5_cot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

CoT by prompt

In [None]:
# !pip install datasets

In [1]:
import torch

print(torch.__version__)
print(torch.cuda.is_available())

2.5.1+cu121
True


In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import random
import json
from datetime import datetime


model_name = "microsoft/phi-1_5"
device =  torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [49]:

class ARCEvaluator:
    def init_model(self, model_name="microsoft/phi-1_5", device=None):
        self.model_name = model_name
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device)
        self.model.eval()

    def load_model(self, model, tokenizer, device):
        self.tokenizer = tokenizer
        self.device = device
        self.model = model
        self.model.eval()

    def make_prompt(self, question, choices, use_cot=True):
        """
        Creates a structured prompt for ARC questions with optional chain-of-thought reasoning.

        Args:
            question (str): The question text
            choices (dict): Dictionary containing 'text' and 'label' lists
            use_cot (bool): Whether to use chain-of-thought prompting
        """
        if use_cot:
            prompt = (
                "Let's focus on the following question:\n\n"
                f"Question: {question}\n\n"
                "Choices:\n"
            )
            for text, label in zip(choices['text'], choices['label']):
                prompt += f"{label}: {text}\n"

            prompt += ("\nLet's think about this:\n"
                      "1. First, let's understand what the question is asking.\n"
                      "2. Then, let's analyze each choice carefully.\n"
                    #   "3. Finally, we'll choose the most logical answer.\n\n"
                      "Reasoning:\n"
                      f"1. The question is asking about {question}\n"
                      "2. Let's examine each option and provide the reasoning only, :\n")

            # for text, label in zip(choices['text'], choices['label']):
            #     prompt += f"   Option {label}: {text} - \n"

            # prompt += ("\n3. Based on this analysis, the correct answer is ")
        else:
            # Original simple prompt
            prompt = f"Question: {question}\nChoices:\n"
            for text, label in zip(choices['text'], choices['label']):
                prompt += f"{label}: {text}\n"
            prompt += "Answer: "

        return prompt
    
    def get_final_prompt(self):
        prompt = ("\n\n*Please provide only the exact answer and nothing else."
                "\n*Do not include any explanation, reasoning, or use cases."
                "\n*Your response must be limited to the concise answer alone."
                "\n\nThe Answer:\nBased on above reasoning, the correct answer among choices is "
                )
        return prompt

    def predict_answer(self, prompt, use_cot=True):
        """
        Generate model prediction for a given prompt.

        Args:
            prompt (str): Formatted question prompt
            use_cot (bool): Whether using chain-of-thought prompting
        """
        inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=200 if use_cot else 1,  # Longer generation for CoT
                # max_new_tokens=100 if use_cot else 1,  # Longer generation for CoT
                do_sample=True if use_cot else False,  # Allow some randomness for CoT
                temperature=0.8 if use_cot else 1.0,   # Lower temperature for more focused reasoning
                top_p=0.9 if use_cot else 1.0,        # Nucleus sampling for CoT
                pad_token_id=self.tokenizer.eos_token_id
            )

            first_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # print(f"output_text : {output_text}")

        if use_cot:
            inputs = self.tokenizer(first_text + self.get_final_prompt(), return_tensors='pt').to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=200 if use_cot else 1,  # Longer generation for CoT
                    # max_new_tokens=100 if use_cot else 1,  # Longer generation for CoT
                    do_sample=True if use_cot else False,  # Allow some randomness for CoT
                    temperature=0.8 if use_cot else 1.0,   # Lower temperature for more focused reasoning
                    top_p=0.9 if use_cot else 1.0,        # Nucleus sampling for CoT
                    pad_token_id=self.tokenizer.eos_token_id
                )
            output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract the final answer from the reasoning chain
            # Look for patterns like "answer is X" or "therefore X" or just the last character
            output_lines = output_text.split('\n')
            answer = None

            # Try to find the answer in the last few lines
            for line in reversed(output_lines):
                line = line.strip().lower()
                if "answer is" in line:
                    answer = line[-1].upper()
                    break
                elif "therefore" in line:
                    answer = line[-1].upper()
                    break

            # Fallback to last character if no clear answer found
            if not answer:
                answer = output_text.strip()[-1].upper()

            return answer, output_text
        else:
            return first_text.strip()[-1], first_text

    def evaluate_dataset(self, split='validation', num_samples=None, save_results=True, use_cot=True):
          """
          Evaluate model performance on ARC dataset.

          Args:
              split (str): Dataset split to evaluate ('validation' or 'test')
              num_samples (int): Number of samples to evaluate (None for all)
              save_results (bool): Whether to save detailed results to file
              use_cot (bool): Whether to use chain-of-thought prompting
          """
          dataset = load_dataset('ai2_arc', 'ARC-Challenge')[split]

          if num_samples is not None:
            #   indices = random.sample(range(len(dataset)), min(num_samples, len(dataset)))
            #   dataset = dataset.select(indices)
              dataset = dataset.select(range(num_samples))
        

          results = []
          correct = 0
          total = 0

          for sample in tqdm(dataset, desc=f"Evaluating {split} set"):
              print(type(sample))
              print(sample)
              question = sample["question"]
              choices = sample["choices"]
              gold = sample["answerKey"]
              gold_text = [text for text, l in zip(choices['text'], choices['label']) if l == gold]
              print("question & answer:")
              print(question)
            #   print(choices)
              print("gold")
              print(gold, gold_text)

              prompt = self.make_prompt(question, choices, use_cot)
              # print(f"Prompt:\n{prompt}")
              predicted_answer, full_response = self.predict_answer(prompt, use_cot)
              print(f"\nPredicted Answer: {predicted_answer} \n")
              print(f"Full Response: {len(full_response)} -> {full_response}")

              # check if predicted_answer includes gold

              # gold = f"Option {gold}"
              answer = full_response.split("the correct answer among choices is")
            #   print(f"answer1:  {answer}")
              # not use the last one,  can occur multiple times
              answer2 = answer[1].split('\n')
            #   print(f"answer2 : {answer2}")
              answer3 = answer2[1]
            #   print(f"answer3 : {answer3}")

              # is_correct = gold in predicted_answer
              is_correct = False
              if type(gold_text) == list:
                  gold_text = gold_text[0]
              try:
                is_correct = gold in answer3 and gold_text in answer3

              # is_correct = predicted_answer == gold
                if is_correct:
                    correct += 1
              except(Exception) as e:
                  print(e)  
                  print(answer3)
                  
              total += 1

              results.append({
                  'id': sample['id'],
                  'question': question,
                  'choices': choices,
                  'gold_answer': gold,
                  'predicted_answer': predicted_answer,
                  'full_reasoning': full_response if use_cot else None,
                  'is_correct': is_correct
              })

              accuracy = correct / total
              print(f"\nAccuracy until now: {correct}/{total} = {accuracy:.2%}")

          accuracy = correct / total
          print(f"\nFinal Accuracy: {correct}/{total} = {accuracy:.2%}")

          if save_results:
              self._save_results(results, accuracy, split, use_cot)

          return accuracy, results
    

evaluator = ARCEvaluator()
evaluator.load_model(model, tokenizer, device)

accuracy_cot, results_cot = evaluator.evaluate_dataset(
    split='validation',
    num_samples=10,  # Set to None to evaluate full dataset
    save_results=False,
    use_cot=True
)    

Evaluating validation set:   0%|          | 0/10 [00:00<?, ?it/s]

<class 'dict'>
{'id': 'Mercury_SC_407695', 'question': 'Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?', 'choices': {'text': ['Put the objects in groups.', 'Change the height of the ramp.', 'Choose different objects to roll.', 'Record the details of the investigation.'], 'label': ['A', 'B', 'C', 'D']}, 'answerKey': 'D'}
question & answer:
Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?
gold
D ['Record the details of the investigation.']


Evaluating validation set:  10%|█         | 1/10 [00:05<00:49,  5.50s/it]


Predicted Answer: S 

Full Response: 2864 -> Let's focus on the following question:

Question: Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?

Choices:
A: Put the objects in groups.
B: Change the height of the ramp.
C: Choose different objects to roll.
D: Record the details of the investigation.

Let's think about this:
1. First, let's understand what the question is asking.
2. Then, let's analyze each choice carefully.
Reasoning:
1. The question is asking about Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?
2. Let's examine each option and provide the reasoning only, :
   - Choice A: Put the objects in groups.
   - Reasoning: By putting the objects in groups, Juan and LaKeisha can compare the results and see if there is a difference in how far each object rolls.

Evaluating validation set:  20%|██        | 2/10 [00:10<00:43,  5.47s/it]


Predicted Answer: . 

Full Response: 2901 -> Let's focus on the following question:

Question: High-pressure systems stop air from rising into the colder regions of the atmosphere where water can condense. What will most likely result if a high-pressure system remains in an area for a long period of time?

Choices:
A: fog
B: rain
C: drought
D: tornado

Let's think about this:
1. First, let's understand what the question is asking.
2. Then, let's analyze each choice carefully.
Reasoning:
1. The question is asking about High-pressure systems stop air from rising into the colder regions of the atmosphere where water can condense. What will most likely result if a high-pressure system remains in an area for a long period of time?
2. Let's examine each option and provide the reasoning only, :
   A: Fog
   Reasoning: If a high-pressure system remains in an area for a long period of time, it means that the air is not moving freely and cannot rise into the colder regions of the atmosphere. Th

Evaluating validation set:  30%|███       | 3/10 [00:16<00:38,  5.49s/it]


Predicted Answer: N 

Full Response: 3620 -> Let's focus on the following question:

Question: Students visited the Morris W. Offit telescope located at the Maryland Space Grant Observatory in Baltimore. They learned about the stars, planets, and moon. The students recorded the information below. • Star patterns stay the same, but their locations in the sky seem to change. • The sun, planets, and moon appear to move in the sky. • Proxima Centauri is the nearest star to our solar system. • Polaris is a star that is part of a pattern of stars called the Little Dipper. Which statement best explains why the sun appears to move across the sky each day?

Choices:
A: The sun revolves around Earth.
B: Earth rotates around the sun.
C: The sun revolves on its axis.
D: Earth rotates on its axis.

Let's think about this:
1. First, let's understand what the question is asking.
2. Then, let's analyze each choice carefully.
Reasoning:
1. The question is asking about Students visited the Morris W. Of

Evaluating validation set:  40%|████      | 4/10 [00:22<00:33,  5.53s/it]


Predicted Answer: E 

Full Response: 2945 -> Let's focus on the following question:

Question: Which topic area would be the best to research to find ways of reducing environmental problems caused by humans?

Choices:
A: converting sunlight into electricity
B: looking for new coal reserves
C: finding reservoirs that contain oil
D: converting forests into farmland

Let's think about this:
1. First, let's understand what the question is asking.
2. Then, let's analyze each choice carefully.
Reasoning:
1. The question is asking about Which topic area would be the best to research to find ways of reducing environmental problems caused by humans?
2. Let's examine each option and provide the reasoning only, :
   A: converting sunlight into electricity - Solar energy is a clean and renewable source of power. It reduces greenhouse gas emissions and dependence on fossil fuels.
   B: looking for new coal reserves - Coal is a fossil fuel that releases a significant amount of carbon dioxide when b

Evaluating validation set:  50%|█████     | 5/10 [00:27<00:27,  5.57s/it]


Predicted Answer: E 

Full Response: 2919 -> Let's focus on the following question:

Question: One year, the oak trees in a park began producing more acorns than usual. The next year, the population of chipmunks in the park also increased. Which best explains why there were more chipmunks the next year?

Choices:
A: Shady areas increased.
B: Food sources increased.
C: Oxygen levels increased.
D: Available water increased.

Let's think about this:
1. First, let's understand what the question is asking.
2. Then, let's analyze each choice carefully.
Reasoning:
1. The question is asking about One year, the oak trees in a park began producing more acorns than usual. The next year, the population of chipmunks in the park also increased. Which best explains why there were more chipmunks the next year?
2. Let's examine each option and provide the reasoning only, :
   A: Shady areas increased.
   B: Food sources increased.
   C: Oxygen levels increased.
   D: Available water increased.
   Reas

Evaluating validation set:  60%|██████    | 6/10 [00:33<00:22,  5.60s/it]


Predicted Answer: E 

Full Response: 2699 -> Let's focus on the following question:

Question: Which characteristic of a cheetah is more likely to be learned rather than inherited?

Choices:
A: speed
B: a spotted coat
C: hunting strategies
D: claws that do not retract

Let's think about this:
1. First, let's understand what the question is asking.
2. Then, let's analyze each choice carefully.
Reasoning:
1. The question is asking about Which characteristic of a cheetah is more likely to be learned rather than inherited?
2. Let's examine each option and provide the reasoning only, :
   A: speed - Cheetahs are known for their incredible speed, and this characteristic is something they learn through practice and training.
   B: a spotted coat - Cheetahs have a unique spotted coat, and this characteristic is likely inherited from their parents.
   C: hunting strategies - Cheetahs have evolved specific hunting strategies over time, and this characteristic is not something they learn from th

Evaluating validation set:  70%|███████   | 7/10 [00:39<00:16,  5.64s/it]


Predicted Answer: . 

Full Response: 2590 -> Let's focus on the following question:

Question: How are the particles in a block of iron affected when the block is melted?

Choices:
A: The particles gain mass.
B: The particles contain less energy.
C: The particles move more rapidly.
D: The particles increase in volume.

Let's think about this:
1. First, let's understand what the question is asking.
2. Then, let's analyze each choice carefully.
Reasoning:
1. The question is asking about How are the particles in a block of iron affected when the block is melted?
2. Let's examine each option and provide the reasoning only, :
   - Option A: The particles gain mass.
   - Option B: The particles contain less energy.
   - Option C: The particles move more rapidly.
   - Option D: The particles increase in volume.

Reasoning for Option A:
When a block of iron is melted, the particles gain mass. This is because the heat energy causes the particles to vibrate faster, which increases their kinetic

Evaluating validation set:  80%|████████  | 8/10 [00:44<00:11,  5.63s/it]


Predicted Answer: Y 

Full Response: 2821 -> Let's focus on the following question:

Question: Plants and animals need food for growth. What happens to most of the food that plants produce?

Choices:
A: Food is released as gas.
B: Food is converted to water.
C: Food is stored for future use.
D: Food is used to absorb sunlight.

Let's think about this:
1. First, let's understand what the question is asking.
2. Then, let's analyze each choice carefully.
Reasoning:
1. The question is asking about Plants and animals need food for growth. What happens to most of the food that plants produce?
2. Let's examine each option and provide the reasoning only, :
   - Choice A: Food is released as gas. Plants release carbon dioxide through a process called photosynthesis. This gas is a byproduct of the food-making process and helps plants produce oxygen.
   - Choice B: Food is converted to water. Plants absorb water from the soil through their roots and use it for various functions like transporting

Evaluating validation set:  90%|█████████ | 9/10 [00:50<00:05,  5.61s/it]


Predicted Answer: . 

Full Response: 2870 -> Let's focus on the following question:

Question: A student wants to determine the effect of garlic on the growth of a fungus species. Several samples of fungus cultures are grown in the same amount of agar and light. Each sample is given a different amount of garlic. What is the independent variable in this investigation?

Choices:
A: amount of agar
B: amount of light
C: amount of garlic
D: amount of growth

Let's think about this:
1. First, let's understand what the question is asking.
2. Then, let's analyze each choice carefully.
Reasoning:
1. The question is asking about A student wants to determine the effect of garlic on the growth of a fungus species. Several samples of fungus cultures are grown in the same amount of agar and light. Each sample is given a different amount of garlic. What is the independent variable in this investigation?
2. Let's examine each option and provide the reasoning only, :
   A: Amount of agar
   Reasoning:

Evaluating validation set: 100%|██████████| 10/10 [00:55<00:00,  5.58s/it]


Predicted Answer: - 

Full Response: 2814 -> Let's focus on the following question:

Question: Scientific models are very common. For which purpose below would a physical model be least helpful?

Choices:
A: simulating scientific phenomena
B: simplifying a complex idea
C: allowing visualization
D: displaying data

Let's think about this:
1. First, let's understand what the question is asking.
2. Then, let's analyze each choice carefully.
Reasoning:
1. The question is asking about Scientific models are very common. For which purpose below would a physical model be least helpful?
2. Let's examine each option and provide the reasoning only, :
   - A: simulating scientific phenomena
   - B: simplifying a complex idea
   - C: allowing visualization
   - D: displaying data

Reasoning:
A: Simulating scientific phenomena is a useful method of understanding complex ideas and phenomena. It allows scientists to create a model that accurately represents the real-world scenario, making it easier t




In [None]:
# model_name = "microsoft/phi-1_5"
# device =  torch.device("cuda" if torch.cuda.is_available() else "cpu")
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [8]:
evaluator = ARCEvaluator()
evaluator.load_model(model, tokenizer, device)

In [None]:
    # Run evaluation with chain-of-thought
print("Evaluating with Chain-of-Thought:")
accuracy_cot, results_cot = evaluator.evaluate_dataset(
    split='validation',
    num_samples=None,  # Set to None to evaluate full dataset
    save_results=False,
    use_cot=True
)

In [35]:
accuracy_cot

0.28762541806020064

In [36]:
results_cot[0]

{'id': 'Mercury_SC_407695',
 'question': 'Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?',
 'choices': {'text': ['Put the objects in groups.',
   'Change the height of the ramp.',
   'Choose different objects to roll.',
   'Record the details of the investigation.'],
  'label': ['A', 'B', 'C', 'D']},
 'gold_answer': 'D',
 'predicted_answer': 'S',
 'full_reasoning': 'Let\'s solve this step by step:\n\nQuestion: Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?\n\nChoices:\nA: Put the objects in groups.\nB: Change the height of the ramp.\nC: Choose different objects to roll.\nD: Record the details of the investigation.\n\nLet\'s think about this:\n1. First, let\'s understand what the question is asking.\n2. Then, let\'s analyze each choice carefully.\n3. Finally, we\'l

In [37]:
print(results_cot[0]['full_reasoning'])

Let's solve this step by step:

Question: Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?

Choices:
A: Put the objects in groups.
B: Change the height of the ramp.
C: Choose different objects to roll.
D: Record the details of the investigation.

Let's think about this:
1. First, let's understand what the question is asking.
2. Then, let's analyze each choice carefully.
3. Finally, we'll choose the most logical answer.

Reasoning:
1. The question is asking about Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?
2. Let's examine each option:
   Option A: Put the objects in groups. - 
   Option B: Change the height of the ramp. - 
   Option C: Choose different objects to roll. - 
   Option D: Record the details of the investigation. - 

3. Based on this analysis, the cor