```bash
# Create a dedicated environment and activate it, so that every install below
# lands in `ani` rather than in whichever environment is currently active.
conda create --name ani python=3.8
conda activate ani
conda install -c conda-forge jupyterlab
pip install open_clip_torch
pip install stable-baselines3
# Quoted so that shells which glob on square brackets (e.g. zsh) pass it through.
pip install "gym[all]"
pip install pyglet==1.5.27
# NOTE(review): the notebook imports gym.envs.classic_control.rendering and
# unpacks four values from env.step(); presumably this needs a pinned gym
# release that still provides both -- confirm and pin the version here.
```

In [8]:
import gym
from stable_baselines3 import DQN
import open_clip
from PIL import Image
import torch
import numpy as np

In [9]:
def disable_view_window():
    """Patch gym's classic-control ``Viewer`` so its window is hidden on creation.

    ``rendering.Viewer.__init__`` is replaced by a wrapper that runs the
    existing constructor and then calls ``self.window.set_visible(visible=False)``.

    The patch is idempotent: the wrapper carries a marker attribute, and a
    repeated call (for example re-running the notebook cell) returns without
    stacking another wrapper on top of the first.
    """
    # Imported lazily, as in the original cell, so that merely defining this
    # function does not import the rendering module.
    from functools import wraps

    from gym.envs.classic_control import rendering

    current_init = rendering.Viewer.__init__
    if getattr(current_init, "_view_window_disabled", False):
        # Already patched; wrapping again would only add redundant layers.
        return

    @wraps(current_init)
    def constructor(self, *args, **kwargs):
        current_init(self, *args, **kwargs)
        self.window.set_visible(visible=False)

    # Set after @wraps so the marker lives on the wrapper itself.
    constructor._view_window_disabled = True
    rendering.Viewer.__init__ = constructor

In [10]:
# Apply the Viewer patch up front so that viewers constructed later start hidden.
disable_view_window()

In [11]:
# Load the pretrained CLIP model, its image preprocessing transform and the
# matching tokenizer. The architecture name is defined once so the model and
# the tokenizer cannot drift apart.
CLIP_MODEL_NAME = 'ViT-B-32-quickgelu'
model, _, preprocess = open_clip.create_model_and_transforms(CLIP_MODEL_NAME, pretrained='laion400m_e32')
# The model is only used for inference in this notebook, so switch off
# training-mode behaviour explicitly.
model.eval()
tokenizer = open_clip.get_tokenizer(CLIP_MODEL_NAME)

In [15]:
class CLIPEnv():
    """Wrap a gym-style environment and log a CLIP image/text score per step.

    After every ``step`` the wrapped environment is rendered as an RGB array,
    the frame is embedded with the CLIP image encoder, and its cosine
    similarity to the embedded ``prompt`` is appended to ``clip_rewards``.
    The reward returned by ``step`` is still the wrapped environment's own
    reward; the CLIP score is only recorded, never substituted.

    Args:
        env: Environment exposing ``reset``, ``step``, ``render``,
            ``action_space``, ``observation_space`` and ``metadata``.
        clip_model: Model exposing ``encode_text`` and ``encode_image``.
        tokenizer: Callable turning a list of strings into the tensor passed
            to ``clip_model.encode_text``.
        prompt: Text that each rendered frame is compared against.
        image_transform: Optional callable converting a PIL image into the
            tensor passed to ``clip_model.encode_image``. When omitted, the
            module-level ``preprocess`` is used, as in the original code.
    """

    def __init__(self, env, clip_model, tokenizer, prompt, image_transform=None):
        self.env = env
        self.model = clip_model
        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.device = self._model_device(clip_model)
        # The prompt embedding is a fixed reference vector, so build it
        # without recording an autograd graph that would otherwise be kept
        # alive for the lifetime of the wrapper.
        with torch.no_grad():
            tokens = self.tokenizer([prompt]).to(self.device)
            text_features = self.model.encode_text(tokens)
            self.text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.metadata = self.env.metadata
        self.clip_rewards = []

    @staticmethod
    def _model_device(clip_model):
        """Return the device of the model's first parameter, or CPU if it has none."""
        parameters = getattr(clip_model, "parameters", None)
        first = next(iter(parameters()), None) if callable(parameters) else None
        return first.device if first is not None else torch.device("cpu")

    def reset(self):
        """Reset the wrapped environment and return whatever it returns."""
        return self.env.reset()

    def step(self, action):
        """Step the wrapped environment and log the CLIP score of the new frame.

        Returns the wrapped environment's ``(observation, reward, done, info)``
        tuple unchanged.
        """
        next_st, rwd, done, info = self.env.step(action)
        img = self.env.render(mode="rgb_array")
        self.clip_rewards.append(self.get_clip_reward(img))
        return next_st, rwd, done, info

    def render(self, *args, **kwargs):
        """Forward to the wrapped environment's ``render``."""
        return self.env.render(*args, **kwargs)

    def close(self):
        """Forward to the wrapped environment's ``close``."""
        return self.env.close()

    def get_clip_reward(self, state):
        """Return the cosine similarity between a frame and the prompt.

        Args:
            state: Array-like image; it is cast to ``uint8`` and converted
                with ``Image.fromarray`` before the image transform runs.

        Returns:
            A numpy scalar holding the similarity.
        """
        transform = self.image_transform if self.image_transform is not None else preprocess
        image = transform(Image.fromarray(np.uint8(state))).unsqueeze(0).to(self.device)
        with torch.no_grad(), torch.cuda.amp.autocast():
            image_features = self.model.encode_image(image)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            sim = image_features @ self.text_features.T
        # Cast to float32 so the logged dtype does not depend on autocast.
        return sim.float().cpu().detach().numpy()[0][0]

In [19]:
# Maps each prompt string to the list of CLIP similarity scores logged for it.
prompt_rewards = {}

In [20]:
# Train a DQN agent briefly on LunarLander, then roll it out while logging
# how closely each rendered frame matches the prompt.
TRAIN_TIMESTEPS = 100
ROLLOUT_STEPS = 100

prompt = 'Spaceship is on the landing pad'
env = gym.make('LunarLander-v2')
cl_env = CLIPEnv(env, model, tokenizer, prompt)

try:
    agent = DQN('MlpPolicy', cl_env, verbose=1)
    agent.learn(total_timesteps=TRAIN_TIMESTEPS)

    # Reset and step through the same wrapper, and start a fresh episode as
    # soon as one finishes instead of stepping a terminated environment.
    obs = cl_env.reset()
    for _step in range(ROLLOUT_STEPS):
        action, _states = agent.predict(obs)
        obs, rewards, dones, info = cl_env.step(action)
        if dones:
            obs = cl_env.reset()

    # NOTE(review): cl_env.clip_rewards is appended to on every step(), so it
    # also holds the scores logged while agent.learn() was stepping the
    # wrapper -- confirm whether training frames are meant to be included.
    prompt_rewards[prompt] = cl_env.clip_rewards.copy()
finally:
    # Release the environment even if training or the rollout raises.
    env.close()

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [21]:
prompt_rewards

{'Spaceship is on the landing pad': [0.25946733,
  0.24547678,
  0.25780526,
  0.2588827,
  0.2618871,
  0.26279867,
  0.25091952,
  0.26712766,
  0.26320586,
  0.25974947,
  0.26265264,
  0.26522928,
  0.2654784,
  0.2722779,
  0.27108845,
  0.27180612,
  0.27496165,
  0.27054948,
  0.27047753,
  0.28128058,
  0.27066228,
  0.2748502,
  0.27924854,
  0.26559317,
  0.27097774,
  0.27635348,
  0.2721864,
  0.2606356,
  0.27022314,
  0.27579564,
  0.28191072,
  0.27033848,
  0.27247488,
  0.26242554,
  0.2628427,
  0.27209,
  0.26564938,
  0.27402133,
  0.27561238,
  0.2753079,
  0.27709934,
  0.2702515,
  0.2740208,
  0.27855244,
  0.29012167,
  0.27526802,
  0.27416244,
  0.2759431,
  0.27367398,
  0.2786312,
  0.2795788,
  0.28335172,
  0.28164524,
  0.27407914,
  0.26908535,
  0.26792744,
  0.26853913,
  0.27623358,
  0.28295523,
  0.27170193,
  0.26674402,
  0.2638325,
  0.2655369,
  0.25878203,
  0.252956,
  0.25829867,
  0.26486117,
  0.2583611,
  0.26008672,
  0.24896094,
  0.255