
# MOUNT DRIVE & LOAD DOTENV

In [1]:
# Mount Google Drive at /content/drive. Later cells read the project's .env,
# prompt files and model checkpoints from "drive/My Drive/genAI project/".
# NOTE(review): those paths are relative, so this presumably relies on the
# working directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install openai==0.28
!pip install python-dotenv
import openai
from tqdm.auto import tqdm
import os
import json
from dotenv import load_dotenv
# Load environment variables from the project's .env file on Drive. The return
# value is bound to `_` and not used afterwards in this cell.
# NOTE(review): presumably this file supplies OPENAI_API_KEY / OPENAI_API_BASE,
# which are read from os.environ further down -- confirm.
_ = load_dotenv("drive/My Drive/genAI project/.env")
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m41.0/76.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [3]:
# Language of the story set to process. The value is interpolated into the
# input and output file names (full_stories_{lang}.json, answers_{lang}_*.json).
lang = "en"  # "en" or "de"

In [4]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using: " + device)

Using: cpu


In [6]:
# Configure the OpenAI client from environment variables.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    # Surface the problem here instead of discovering it as a failed request
    # for every single prompt later on.
    print("WARNING: OPENAI_API_KEY is not set; requests to the OpenAI API will fail.")

# Only override the endpoint when one is actually configured: assigning the
# None returned for an unset variable would replace the library's built-in
# default base URL.
_api_base = os.environ.get("OPENAI_API_BASE")
if _api_base:
    openai.api_base = _api_base

In [7]:
def get_single_answer_gpt(model_id, prompt, max_retries=10, backoff=1.0, max_backoff=8.0):
    """Request a single chat completion for ``prompt``, retrying on failure.

    The prompt is sent as one ``user`` message with ``temperature=0`` and
    streaming disabled. Failed attempts are retried with a capped exponential
    delay so that rate limits or short outages do not consume every attempt
    within a few milliseconds.

    Args:
        model_id: Model name passed to ``openai.ChatCompletion.create``.
        prompt: Text sent as the content of the single user message.
        max_retries: Maximum number of attempts; values below 1 are treated
            as 1.
        backoff: Delay in seconds before the second attempt; doubled after
            every further failure.
        max_backoff: Upper bound in seconds for a single delay.

    Returns:
        The content of the first choice's message, or the string
        ``"AN ERROR OCCURRED! <exception>"`` for the last failure when every
        attempt failed. No exception is propagated to the caller.
    """
    import time  # local import keeps this cell independent of the import cell

    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            response = openai.ChatCompletion.create(
                model = model_id,
                messages = [{"role": "user", "content": prompt}],
                stream = False,
                temperature = 0
            )
            return response.choices[0].message.content
        except Exception as e:
            # Best effort by design: remember the failure and try again.
            completion = f"AN ERROR OCCURRED! {e}"
            if attempt < attempts - 1:
                # No sleep after the final attempt -- nothing left to wait for.
                time.sleep(min(backoff * 2 ** attempt, max_backoff))
    return completion

def get_single_answer_hf(model, tokenizer, prompt, device=device, repetition_penalty=1.0):
    """Generate a continuation of ``prompt`` with a Hugging Face causal LM.

    Args:
        model: Object exposing ``generate``; the notebook passes the result of
            ``AutoModelForCausalLM.from_pretrained``.
        tokenizer: Object exposing ``encode`` and ``decode``; the notebook
            passes the result of ``AutoTokenizer.from_pretrained``.
        prompt: Text to be continued.
        device: Device the input ids and attention mask are placed on;
            defaults to the notebook-level ``device``.
        repetition_penalty: Forwarded to ``model.generate``. The default of
            1.0 is the neutral value (no penalty).

    Returns:
        The decoded continuation only, i.e. the generated text without the
        prompt and without special tokens.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    gen = model.generate(
        input_ids,
        num_beams = 2,
        # Deterministic beam search. Stated explicitly instead of the former
        # temperature = 0, which has no effect when sampling is off.
        do_sample = False,
        max_length = 1024,
        # Single unpadded sequence, so every position is attended to.
        attention_mask = torch.ones(input_ids.shape, device=device),
        # NOTE(review): 50256 is presumably the end-of-text id of the
        # tokenizer used in this notebook -- confirm if the tokenizer changes.
        eos_token_id = 50256,
        pad_token_id = 50256,
        repetition_penalty = repetition_penalty,
    )[0]
    # The returned sequence starts with the prompt tokens. Drop them by token
    # count instead of slicing the decoded string by len(prompt): decoding is
    # not guaranteed to reproduce the prompt character for character.
    new_tokens = gen[input_ids.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

In [8]:
# gpt-3.5-turbo
# Read the story prompts for the currently selected language.
with open(f"drive/My Drive/genAI project/full_stories_{lang}.json", mode="r", encoding="utf-8") as f:
    full_stories = json.load(f)

# Ask the chat model for one completion per story. Every record is annotated
# in place and collected in its original order.
answers = []
for fs in tqdm(full_stories):
    completion = get_single_answer_gpt("gpt-3.5-turbo", fs["prompt"])
    fs.update({"completion": completion})
    answers += [fs]

# Write the annotated stories only after the whole loop has finished.
with open(f"drive/My Drive/genAI project/answers_{lang}_gpt.json", mode="w", encoding="utf-8") as f:
    json.dump(answers, f, ensure_ascii=False, indent="    ")

  0%|          | 0/100 [00:00<?, ?it/s]

In [9]:
# TS pretrained
# Load the published checkpoint and its tokenizer, then move the model to the
# selected device.
model_id = "roneneldan/TinyStories-1M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model = model.to(device)

# Read the story prompts for the currently selected language.
with open(f"drive/My Drive/genAI project/full_stories_{lang}.json", mode="r", encoding="utf-8") as f:
    full_stories = json.load(f)

# Generate one continuation per story. Every record is annotated in place and
# collected in its original order.
answers = []
for fs in tqdm(full_stories):
    completion = get_single_answer_hf(model, tokenizer, fs["prompt"], device=device)
    fs.update({"completion": completion})
    answers += [fs]

# Write the annotated stories only after the whole loop has finished.
with open(f"drive/My Drive/genAI project/answers_{lang}_ts.json", mode="w", encoding="utf-8") as f:
    json.dump(answers, f, ensure_ascii=False, indent="    ")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/48.6M [00:00<?, ?B/s]

  0%|          | 0/100 [00:00<?, ?it/s]



In [10]:
# TS own
# Locate the self-trained checkpoint on Drive and stop early, with the
# offending path in the error, when it is not there.
model_name = "model-1m-en-2024-03-22T09:06:10.857249"
model_path = f"drive/My Drive/genAI project/{model_name}"
if not os.path.exists(model_path):
    raise FileNotFoundError(f"ERROR: MODEL PATH DOES NOT EXIST: {model_path}")

# The tokenizer is loaded from the published TinyStories repository while the
# weights come from the local checkpoint directory.
model_id = "roneneldan/TinyStories-1M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# Generate one continuation per story and save the annotated records.
answers = []
with open(f"drive/My Drive/genAI project/full_stories_{lang}.json", "r", encoding="utf-8") as f:
    full_stories = json.load(f)
for fs in tqdm(full_stories):
    completion = get_single_answer_hf(model, tokenizer, fs["prompt"], device=device)
    fs["completion"] = completion
    answers.append(fs)
with open(f"drive/My Drive/genAI project/answers_{lang}_own.json", "w", encoding="utf-8") as f:
    json.dump(answers, f, indent="    ", ensure_ascii=False)

  0%|          | 0/100 [00:00<?, ?it/s]

In [11]:
# Switch to the German story set: the cells below read full_stories_de.json
# and write answers_de_*.json.
lang = "de"

In [12]:
# gpt-3.5-turbo
# Read the story prompts for the currently selected language.
with open(f"drive/My Drive/genAI project/full_stories_{lang}.json", mode="r", encoding="utf-8") as f:
    full_stories = json.load(f)

# Ask the chat model for one completion per story. Every record is annotated
# in place and collected in its original order.
answers = []
for fs in tqdm(full_stories):
    completion = get_single_answer_gpt("gpt-3.5-turbo", fs["prompt"])
    fs.update({"completion": completion})
    answers += [fs]

# Write the annotated stories only after the whole loop has finished.
with open(f"drive/My Drive/genAI project/answers_{lang}_gpt.json", mode="w", encoding="utf-8") as f:
    json.dump(answers, f, ensure_ascii=False, indent="    ")

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# TS own
# TODO: replace the placeholder with the directory name of the self-trained
# checkpoint for this language; until then the existence check below stops
# the cell.
model_name = "MISSING"
model_path = f"drive/My Drive/genAI project/{model_name}"
if not os.path.exists(model_path):
    raise FileNotFoundError(f"ERROR: MODEL PATH DOES NOT EXIST: {model_path}")

# The tokenizer is loaded from the published TinyStories repository while the
# weights come from the local checkpoint directory.
model_id = "roneneldan/TinyStories-1M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# Generate one continuation per story and save the annotated records.
answers = []
with open(f"drive/My Drive/genAI project/full_stories_{lang}.json", "r", encoding="utf-8") as f:
    full_stories = json.load(f)
for fs in tqdm(full_stories):
    completion = get_single_answer_hf(model, tokenizer, fs["prompt"], device=device)
    fs["completion"] = completion
    answers.append(fs)
with open(f"drive/My Drive/genAI project/answers_{lang}_own.json", "w", encoding="utf-8") as f:
    json.dump(answers, f, indent="    ", ensure_ascii=False)