In [None]:
!pip install -q -U google-generativeai
!pip install pyboy

!sudo apt install tesseract-ocr
!pip install pytesseract

In [None]:
# transformers supplies the TrOCR classes imported just below; datasets is
# installed alongside it but is not imported anywhere in the visible notebook.
!pip install transformers datasets

from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import requests
import torch

In [None]:
# Build the TrOCR image processor and the encoder-decoder model from the same
# pre-trained checkpoint, named once so the two cannot drift apart.
trocr_checkpoint = 'microsoft/trocr-small-printed'
processor = TrOCRProcessor.from_pretrained(trocr_checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(trocr_checkpoint)

In [None]:
# Load an example image (forced to 3-channel RGB before preprocessing)
image = Image.open('/content/drive/MyDrive/Colab Notebooks/Pokebot/image.png').convert('RGB')

# Preprocess the image into the tensor passed to the model
pixel_values = processor(images=image, return_tensors="pt").pixel_values

# Predict text from the image
model.eval()  # Set model to evaluation mode
with torch.no_grad():  # inference only, no gradients needed
    generated_ids = model.generate(pixel_values)

# Decode the predicted text. Special tokens are skipped so the result holds
# only the recognised characters, not the tokenizer's sentence markers.
predicted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Display the predicted text
print("Predicted text:", predicted_text)

In [None]:
# load OCR model
import string

import keras_ocr
import numpy as np

# `recognizer_alphabet` was not defined anywhere in the notebook, so this cell
# raised NameError on a fresh kernel.
# NOTE(review): digits + lowercase letters is assumed to be the alphabet the
# 'kurapan' weights were trained on -- confirm, and replace it if a custom
# alphabet was intended.
recognizer_alphabet = string.digits + string.ascii_lowercase

# Start from the packaged detector weights, then overwrite them with the
# weights file stored on Drive.
detector = keras_ocr.detection.Detector(weights='clovaai_general')
detector.model.load_weights('/content/drive/MyDrive/Colab Notebooks/Pokebot/KerasOCRmodel/detector_weights.h5')
recognizer = keras_ocr.recognition.Recognizer(alphabet=recognizer_alphabet, weights='kurapan')

pipeline = keras_ocr.pipeline.Pipeline(detector=detector, recognizer=recognizer)

# The pipeline is documented to take numpy arrays or file paths, so the PIL
# image loaded earlier is converted to an array before it is passed in.
predictions = pipeline.recognize(images=[np.array(image)])[0]

In [None]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Wrap `text` in an IPython Markdown object rendered as a block quote.

  Bullet characters are rewritten as indented `*` list markers, then every
  line (blank ones included) is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

In [None]:
# Read the Gemini API key from Colab's user-data store instead of hard-coding it.
# (`os.getenv('GOOGLE_API_KEY')` is the alternative when using an environment variable.)
from google.colab import userdata

GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

# Hand the key to the google.generativeai client.
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
import numpy as np
from scipy import signal

def check_frame_change(prev_frame, current_frame):
  """Return True when the two frames differ, False when they are identical.

  Both arguments are converted with `np.array`, so anything array-like
  (PIL images, nested lists, arrays) can be compared. Frames of different
  shape count as changed.
  """
  before = np.array(prev_frame)
  after = np.array(current_frame)
  return not np.array_equal(after, before)

def check_not_empty(current_frame):
  """Return True when the frame holds more than one solid colour.

  `current_frame` must offer PIL's `getextrema()`. For a single-band image
  that call returns one `(min, max)` pair; for a multi-band image it returns
  one such pair per band. The frame is "empty" (one flat colour) only when
  min equals max in every band.

  The earlier version compared `extrema[0]` with `extrema[1]`, which for a
  multi-band image compares the first band's range with the second band's
  range, so any frame whose bands share the same range was reported empty.
  """
  extrema = current_frame.getextrema()
  # Normalise to a sequence of (min, max) pairs, whatever the band count.
  if isinstance(extrema[0], tuple):
    band_ranges = extrema
  else:
    band_ranges = (extrema,)
  return any(low != high for low, high in band_ranges)


In [None]:
from pyboy import PyBoy
import pytesseract
from IPython.display import display
import copy

# Where the Pokemon Red ROM lives. The relative path points into a Drive
# folder; no mount call is visible in this notebook, so Drive is presumably
# mounted beforehand.
# rom_path = 'Pokemon\ROM\Pokemon_Red.gb'
rom_path = './drive/MyDrive/Colab Notebooks/Pokebot/Pokemon_Red.gb'

# Emulator instance for that ROM.
pyboy = PyBoy(rom_path)

# Gemini-Pro-Vision handle. This rebinds `model`, which until now referred to
# the TrOCR network loaded earlier in the notebook.
model = genai.GenerativeModel(model_name="gemini-pro-vision")

In [None]:
# The prompts are assembled from adjacent string literals. The earlier
# backslash continuations inside one literal dropped the line break without
# adding a space, gluing sentences together ("character.For", "respond 1If").
# Every fragment below therefore ends with an explicit separator.

# Briefing sent together with the very first frame.
intro_prompt = (
    "You are going to play a game where you are the main character. "
    "For all future prompts, "
    "your aim is to raise creatures called pokemon by battling them with other pokemon. "
    "Each pokemon has some skills that can used to attack other pokemon. "
    "You need to understand what is on the screen and suggest the correct action to take. "
    "Some actions can be like asking you for a name, asking to pick between few options, and "
    "also battling an opponent, by choosing a pokemon and some skills. "
    "Your outputs will be used directly to play the game through a python code. "
    "So keep your outputs short and display them in the form of comma separated instructions "
    "on how to navigate the screen to continue playing the game. "
    "Respond 'Yes' to this."
)

# Text prompt (optional)
# Classifier prompt: the reply is expected to be 0 (no text), 1 (story text)
# or 2 (an action or choice is requested).
text_prompt1 = (
    "Read the text in the image. Decide if it is asking you to take an action or not. "
    "If there is no text respond 0. "
    "If the text is just telling a story, respond 1. "
    "If the text is asking for an action or choice, respond 2."
)

# Follow-up prompt used when a choice is requested: asks for the chosen answer
# plus the cursor moves needed to reach it.
text_prompt2 = (
    "Read the question in the image. "
    "Pick an answer. Mention the chosen answer. "
    "If the question asks for name. Pick an existing name. "
    "Find the current cursor position. Right-pointing triangle shape is the cursor. "
    "Given the current cursor position, give instructions to take the cursor to the chosen answer. "
    "Then Give the instructions as a sequence of these commands :RIGHT, LEFT, UP, DOWN in the form of a list. "
    "Do not respond in comeplete sentences. "
    "Response should include only the chosen answer and the sequence of instructions."
)

In [None]:
from google.colab import output

# Emulator ticks advanced per loop iteration, and the iteration count after
# which the session is stopped.
TICKS_PER_STEP = 120
MAX_STEPS = 100

# Fresh emulator instance for this run.
pyboy = PyBoy(rom_path)

action = 1
prev_frame = None
frame_count = 0
# Both flags are defined before the loop: the first iteration never runs the
# frame comparison, yet the bookkeeping at the bottom of the loop reads them.
# Without this the first pass ended in a NameError.
frame_change_check = False
prev_frame_change_check = False

# OCR text collected from every frame that differed from the previous one.
story = []

# Rendering is enabled so the screen buffer is refreshed at the end of each
# step; with render=False `pyboy.screen.image` does not reflect the new frame.
while pyboy.tick(TICKS_PER_STEP, True):
  frame_count += 1

  # Start the game
  if frame_count == 1:
    # NOTE(review): per the PyBoy docs the second argument of button() is the
    # release delay in ticks, so True acts as 1 -- confirm this is intended.
    pyboy.button('start', True)
    pil_image = pyboy.screen.image
    grayscale_image = pil_image.convert('L')
    # Send the briefing together with the first frame and echo the reply.
    combined_input = [intro_prompt, grayscale_image]
    response = model.generate_content(combined_input)
    print(response.text)

  else:
    # Extract the current frame as an image
    pil_image = pyboy.screen.image
    frame_change_check = check_frame_change(prev_frame, pil_image)
    chk_image = check_not_empty(pil_image)

    if frame_change_check:
      print('Frames are different')
      # Perform OCR on the image
      extracted_text = pytesseract.image_to_string(pil_image)
      print(extracted_text)
      story.append(extracted_text)
      # Press A only when the previous step had changed as well.
      if prev_frame_change_check:
        pyboy.button('a', True)

    elif chk_image:
      # Screen is static and not a flat colour: ask the model what it shows.
      # Combine text and image for input (list format)
      rgb_image = pil_image.convert('RGB')
      combined_input = [text_prompt1, rgb_image]

      # Generate response
      response1 = model.generate_content(combined_input)
      print('Model response ', response1.text)
      # Replies can carry surrounding whitespace or a trailing newline; strip
      # before comparing so '1' / '2' are still recognised.
      verdict = response1.text.strip()
      if verdict == '1':
        # Story text only: echo the reply and advance the dialogue.
        print(response1.text)
        pyboy.button('a', True)
      elif verdict == '2':
        # A choice is requested: show the frame and ask for cursor moves.
        display(pil_image)
        combined_input = [text_prompt2, rgb_image]
        # Generate response
        response2 = model.generate_content(combined_input)
        print(response2.text)

  output.clear()
  display(pil_image)
  # Keep a snapshot, not the live object: PyBoy reuses the same image object
  # for every frame, so a plain reference would always equal the current frame.
  prev_frame = pil_image.copy()
  prev_frame_change_check = frame_change_check
  if frame_count > MAX_STEPS:
    break