In [None]:
!pip install -Uq openai wandb

In [None]:
import openai
import wandb
from pathlib import Path
import pandas as pd
import numpy as np
import json
from tqdm import tqdm

In [None]:
import os
os.environ["WANDB_API_KEY"] = ""
!wandb login

In [None]:
# Start the W&B run (job type "finetune") that later cells use to fetch and log artifacts
run = wandb.init(
    project='GPT-3',
    job_type='finetune',
)

In [None]:
# Fetch the full dataset artifact and download its JSONL file locally
artifact = run.use_artifact(
    'borisd13/GPT-3/wiki-dataset:latest',
    type='dataset',
)
dataset_entry = artifact.get_path('wiki_title_description.jsonl')
dataset_path = dataset_entry.download()

In [None]:
!head $dataset_path

In [None]:
!openai tools fine_tunes.prepare_data -f $dataset_path

In [None]:
# check number of samples
!wc -l $dataset_path

In [None]:
# Number of samples for each split (small values for a quick run).
# Larger alternative: n_train = 50_000, n_valid = 10_000
n_train, n_valid = 500, 50

In [None]:
# Record the split sizes in the W&B run configuration
wandb.config.update(dict(n_train=n_train, n_valid=n_valid))

In [None]:
!head -n $n_train $dataset_path > wiki_train.jsonl
!tail -n $n_valid $dataset_path > wiki_valid.jsonl

In [None]:
# Load both splits into DataFrames and wrap them as W&B tables for visualization
# (the tables are attached to the artifacts created afterwards)
df_train, df_valid = (
    pd.read_json(jsonl_name, orient='records', lines=True)
    for jsonl_name in ('wiki_train.jsonl', 'wiki_valid.jsonl')
)
table_train, table_valid = (
    wandb.Table(dataframe=frame) for frame in (df_train, df_valid)
)

In [None]:
# Training artifact: the raw JSONL file plus its preview table
artifact_train = wandb.Artifact(
    'train-wiki_train.jsonl',
    type='training_files',
    metadata={'samples': n_train},
)
artifact_train.add_file('wiki_train.jsonl')
artifact_train.add(table_train, 'wiki_train')

# Validation artifact: same layout as the training one
artifact_valid = wandb.Artifact(
    'valid-wiki_valid.jsonl',
    type='validation_files',
    metadata={'samples': n_valid},
)
artifact_valid.add_file('wiki_valid.jsonl')
artifact_valid.add(table_valid, 'wiki_valid')

# Log both artifacts through the active run, training first
for split_artifact in (artifact_train, artifact_valid):
    run.log_artifact(split_artifact)

In [None]:
# Keep the entity (typically your wandb username) of the currently active run.
# It is used further down to build fully-qualified artifact names of the form
# '<entity>/GPT-3/<artifact>'.
entity = wandb.run.entity

In [None]:
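# Intentionally left disabled: `run` is still needed by later cells
# (run.use_artifact / run.log_artifact). Call it once the notebook is done.
# wandb.finish()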

In [None]:
# Download the train/valid files back from W&B so the fine-tune is tied to logged artifacts.
# Use the `latest` alias rather than a pinned `v0`: whenever the split changes
# (e.g. different n_train / n_valid) W&B creates a new version, and `v0` would
# keep returning the very first, now stale, files.
# NOTE: `latest` only points at the new version once the preceding log_artifact
# upload has completed.
artifact_train = run.use_artifact(f'{entity}/GPT-3/train-wiki_train.jsonl:latest', type='training_files')
train_file = artifact_train.get_path('wiki_train.jsonl').download()

artifact_valid = run.use_artifact(f'{entity}/GPT-3/valid-wiki_valid.jsonl:latest', type='validation_files')
valid_file = artifact_valid.get_path('wiki_valid.jsonl').download()

In [None]:
# Enter credentials
%env OPENAI_API_KEY= ""

In [None]:
import os
os.environ["OPENAI_API_KEY"] = ""

In [None]:
# Fine-tuning settings passed to `openai api fine_tunes.create`
model = 'ada'  # base model: ada, babbage or curie
n_epochs, batch_size = 1, 16
learning_rate_multiplier = prompt_loss_weight = 0.1

In [None]:
!openai api fine_tunes.create \
    -t $train_file \
    -v $valid_file \
    -m $model \
    --n_epochs $n_epochs \
    --batch_size $batch_size \
    --learning_rate_multiplier $learning_rate_multiplier \
    --prompt_loss_weight $prompt_loss_weight

In [None]:
!openai api fine_tunes.follow -i ft-JIkbRhS6oIFJHlYzZKSyzcaF

In [None]:
!openai api fine_tunes.get -i ft-JIkbRhS6oIFJHlYzZKSyzcaF

In [None]:
!openai api completions.create -m ada:ft-personal-2022-12-05-09-49-23 -p "The Death of Germanicus\n\n###\n\n"

**Train with Google WIT Data**

In [None]:
# Load the prepared train/valid JSONL files and wrap them as W&B tables
df_train, df_valid = (
    pd.read_json(jsonl_name, orient='records', lines=True)
    for jsonl_name in (
        'Wikipedia_Japanese_All_train_prepared.jsonl',
        'Wikipedia_Japanese_All_valid_prepared.jsonl',
    )
)
table_train, table_valid = (
    wandb.Table(dataframe=frame) for frame in (df_train, df_valid)
)

In [None]:
# Training artifact: the prepared JSONL file plus its preview table
artifact_train = wandb.Artifact(
    'train-Wikipedia_Japanese_All_train_prepared.jsonl',
    type='training_files',
)
artifact_train.add_file('Wikipedia_Japanese_All_train_prepared.jsonl')
artifact_train.add(table_train, 'Wikipedia_Japanese_All_train_prepared')

# Validation artifact: same layout as the training one
artifact_valid = wandb.Artifact(
    'valid-Wikipedia_Japanese_All_valid_prepared.jsonl',
    type='validation_files',
)
artifact_valid.add_file('Wikipedia_Japanese_All_valid_prepared.jsonl')
artifact_valid.add(table_valid, 'Wikipedia_Japanese_All_valid_prepared')

# Log both artifacts through the active run, training first
for split_artifact in (artifact_train, artifact_valid):
    run.log_artifact(split_artifact)

In [None]:
# Keep the entity (typically your wandb username) of the currently active run.
# It is used further down to build fully-qualified artifact names of the form
# '<entity>/GPT-3/<artifact>'.
entity = wandb.run.entity

In [None]:
# Download the training file back from W&B.
# Use the `latest` alias rather than a pinned `v0`: once the file content changes,
# W&B creates a new version and `v0` would keep returning the first, stale upload.
# NOTE: `latest` only points at the new version once the log_artifact upload has completed.
artifact_train = run.use_artifact(f'{entity}/GPT-3/train-Wikipedia_Japanese_All_train_prepared.jsonl:latest', type='training_files')
train_file = artifact_train.get_path('Wikipedia_Japanese_All_train_prepared.jsonl').download()

In [None]:
# Download the validation file back from W&B.
# Use the `latest` alias rather than a pinned `v0`: once the file content changes,
# W&B creates a new version and `v0` would keep returning the first, stale upload.
# NOTE: `latest` only points at the new version once the log_artifact upload has completed.
artifact_valid = run.use_artifact(f'{entity}/GPT-3/valid-Wikipedia_Japanese_All_valid_prepared.jsonl:latest', type='validation_files')
valid_file = artifact_valid.get_path('Wikipedia_Japanese_All_valid_prepared.jsonl').download()

In [None]:
# Enter credentials
%env OPENAI_API_KEY= ""

In [None]:
import os
os.environ["OPENAI_API_KEY"] = ""

In [None]:
# Fine-tuning settings passed to `openai api fine_tunes.create`
model = 'ada'  # base model: ada, babbage or curie
n_epochs, batch_size = 1, 16
learning_rate_multiplier = prompt_loss_weight = 0.1

In [None]:
!openai api fine_tunes.create \
    -t $train_file \
    -v $valid_file \
    -m $model \
    --n_epochs $n_epochs \
    --batch_size $batch_size \
    --learning_rate_multiplier $learning_rate_multiplier \
    --prompt_loss_weight $prompt_loss_weight

In [None]:
import os
from getpass import getpass

import openai

# Configure the Python client with the key already exported in the environment
# instead of hard-coding one; assigning an empty string would leave the client
# without a usable key. Prompt (without echo) only when no key is set.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:
# Create a completion with the fine-tuned model.
# A fine-tuned model is selected with `model=`; `engine=` addresses base engines
# only and does not resolve fine-tuned model names.
completion = openai.Completion.create(model="ada:ft-personal-2022-12-05-09-49-23", prompt="Hello world")

In [None]:
# Display the full response object returned by the previous request
completion

In [None]:
# Show only the generated text of the first returned choice
first_choice = completion.choices[0]
print(first_choice.text)

In [None]:
# Request a completion from the base `ada` engine for a prompt ending in the
# "\n\n###\n\n" marker, allowing up to 2036 generated tokens
completion = openai.Completion.create(
    engine="ada",
    prompt="Khawaja Muhammad Zaman of Luari\n\n###\n\n",
    max_tokens=2036,
)

In [None]:
# Print the whole response object (not just the generated text)
print(completion)

In [None]:
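# Disabled alternative: launch the fine-tune directly from local prepared files
# under /content instead of files downloaded from W&B artifacts.
# !openai api fine_tunes.create \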
#     -t /content/wiki_data_train_prepared.jsonl \
#     -v /content/wiki_data_valid_prepared.jsonl \
#     -m $model \
#     --n_epochs $n_epochs \
#     --batch_size $batch_size \
#     --learning_rate_multiplier $learning_rate_multiplier \
#     --prompt_loss_weight $prompt_loss_weight