# Attempt to fine-tune GPT3.5 to translate between languages
The first language will be our made-up language, Birrig (see the notes on get_bible for why we think GPT is cheating).
The thesis is that fine-tuning will cause GPT3.5 to learn the fake language as a new language by mapping the embeddings of the
new words into essentially the same space as their English counterparts. Translating it back should be trivial, as it is a
word-for-word translation.



In [14]:
from lib.config import get_config
from lib.cipher import substitution_cipher
import json, random, time, os

# To install pip install pandas, openai, nltk
import pandas as pd
import openai # !pip install openai==0.27.9
from nltk.translate.bleu_score import sentence_bleu

# --- Experiment configuration -------------------------------------------------
FILENAME = "GPT3-5"
EPOCHS = 3  # Since we are repeating the data with different versions we don't want to overfit
SOURCE_BOOKS = ['MAT','LUK','JHN']  # books used to build the train/validate files
TARGET_BOOKS = ['MRK']              # book held out for the test file
VERSIONS = ['eng-web', 'eng-asv', 'eng-kjv2006']  # dataframe columns used as source text
SPLIT_RATIO = 0.8  # share of shuffled examples written to the train file (rest -> validate)
# Only the first 18 characters are used as the fine-tune suffix (it is sliced
# with [0:18] when the job is created); this name is 20 characters long.
EXPERIMENT_NAME = "mt_lk_jn_to_mk_test1"  # Max 18 characters
FILENAME = FILENAME + "_" + EXPERIMENT_NAME
SEPERATOR = "\n---\n"  # joins several sentences into one prompt

# The shuffle and the train/validate split are driven by random.random(), so
# seed the generator to make "Restart & Run All" produce the same files.
SEED = 42
random.seed(SEED)

# set environment variable in ipython notebook
os.environ["OPENAI_API_KEY"] = get_config('openai')['api_key']
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
def system_message(source):
    """Return the system prompt asking the model to translate `source` input to Birrig.

    `source` is interpolated into the prompt as-is; the callers in this
    notebook pass a version column name such as 'eng-web'.
    """
    prompt = f"You are an expert translator. When the user gives you input from {source} translate it to Birrig."
    return prompt

In [4]:
bible = pd.read_csv('data/bible.bbe.csv')

# Assuming the gospels overlap heavily, train on Matthew, Luke and John and
# then predict Mark (which should be easy, as Matthew and Luke may have copied
# from him).
# .copy() makes each subset an independent frame, so adding columns to it later
# (e.g. a 'translation' column on `test`) does not trigger pandas'
# SettingWithCopyWarning or depend on view/copy semantics of `bible`.
train = bible[bible['book'].isin(SOURCE_BOOKS)].copy()
test = bible[bible['book'].isin(TARGET_BOOKS)].copy()

In [5]:
def write_file(df, file_handlers, versions, split=None):
    """Write chat-format fine-tuning examples as JSON Lines, in shuffled order.

    For every row of `df` and every column named in `versions`, one example is
    built: a system prompt for that version, the version's text as the user
    message, and the row's 'birrig' text as the assistant reply. A (row,
    version) pair is skipped when either text is missing.

    Parameters
    ----------
    df : DataFrame
        Must contain the `versions` columns and a 'birrig' column.
    file_handlers : list
        Writable text handles. With one handle everything goes to it; with two,
        index 0 is the train file and index 1 the validate file.
    versions : list of str
        Column names to use as source text.
    split : float or None
        Each example draws one random number in [0, 1). That number is its
        shuffle key and, when `split` is given and two handles are supplied,
        also decides the destination: <= split goes to handle 0, otherwise to
        handle 1.

    Lines are separated by a newline; no trailing newline is written.
    """
    keyed_examples = []
    for _, verse in df.iterrows():
        for version in versions:
            if pd.isna(verse[version]) or pd.isna(verse['birrig']):
                continue
            # One chat exchange teaching the model the expected reply
            example = {
                "messages": [
                    {"role": "system", "content": system_message(version)},
                    {"role": "user", "content": verse[version]},
                    {"role": "assistant", "content": verse['birrig']},
                ]
            }
            keyed_examples.append((random.random(), example))

    # Sorting by the random draw shuffles the examples
    keyed_examples.sort(key=lambda pair: pair[0])

    everything_to_first = split is None or len(file_handlers) == 1
    # Tracks, per handle, whether a line was already written, so the newline is
    # emitted *before* every line except the first (no trailing newline).
    started = [False, False]
    for draw, example in keyed_examples:
        target = 0 if everything_to_first or draw <= split else 1
        prefix = "\n" if started[target] else ""
        file_handlers[target].write(prefix + json.dumps(example))
        started[target] = True

# Build the three JSONL datasets: train + validate from the source books
# (split by SPLIT_RATIO) and a single test file from the target book.
train_path = f'data/{FILENAME}_train.jsonl'
validate_path = f'data/{FILENAME}_validate.jsonl'
test_path = f'data/{FILENAME}_test.jsonl'

with open(train_path, 'w') as train_out, \
        open(validate_path, 'w') as validate_out, \
        open(test_path, 'w') as test_out:
    write_file(train, [train_out, validate_out], VERSIONS, SPLIT_RATIO)
    write_file(test, [test_out], VERSIONS)


In [6]:
# Upload the train and validate JSONL files to OpenAI and remember their file
# ids (keyed by 'train' / 'validate') for the fine-tuning job.
openai.api_key = os.environ["OPENAI_API_KEY"]

files = {}
for part in ['train', 'validate']:
    path = f'data/{FILENAME}_{part}.jsonl'
    try:
        # The context manager closes the handle even if the upload raises;
        # previously the file object was opened inline and never closed.
        with open(path, "r") as fh:
            res = openai.File.create(
                file=fh,
                purpose='fine-tune'
            )
        files[part] = res['id']
    except Exception as e:
        # Best effort: report the failure and continue. A failed upload leaves
        # `part` missing from `files`.
        print(e, part, path)

files

{'train': 'file-O4pTTX6oyp37UpfljotjizoL',
 'validate': 'file-RqOm21EnUVLA7m22agALfD3b'}

In [7]:
# Create the fine-tuning job. Uploaded files may still be processing on the
# server side; in that case wait 30 seconds and try again. Any other invalid
# request error is re-raised.
while True:
    try:
        res = openai.FineTuningJob.create(
            training_file=files['train'],
            validation_file=files['validate'],
            model="gpt-3.5-turbo",
            hyperparameters={"n_epochs": EPOCHS},
            suffix=EXPERIMENT_NAME[0:18],  # suffix is cut to 18 characters
        )
    except openai.error.InvalidRequestError as e:
        message = str(e)
        file_not_ready = (
            "File 'file-" in message
            and "' is still being processed and is not ready to be used for fine-tuning." in message
        )
        if not file_not_ready:
            raise e
        print("File is still being processed. Retrying in 30 seconds...")
        time.sleep(30)
    else:
        job_id = res["id"]
        break
res

File is still being processed. Retrying in 30 seconds...
File is still being processed. Retrying in 30 seconds...
File is still being processed. Retrying in 30 seconds...
File is still being processed. Retrying in 30 seconds...


<FineTuningJob fine_tuning.job id=ftjob-K1x7qq5sFG4tsASJCSu2AxAC at 0x126837ea0> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-K1x7qq5sFG4tsASJCSu2AxAC",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1694568819,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-dSLF9Ay5XJvsvCOjYOjUYfQQ",
  "result_files": [],
  "status": "created",
  "validation_file": "file-RqOm21EnUVLA7m22agALfD3b",
  "training_file": "file-O4pTTX6oyp37UpfljotjizoL",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": null,
  "error": null
}

In [8]:

# Poll the fine-tuning job every 100 seconds until it is done.
# Besides `finished_at`, also stop on a terminal status so that a failed or
# cancelled job cannot keep this loop spinning forever.
terminal_statuses = ("succeeded", "failed", "cancelled")
while True:
    res = openai.FineTuningJob.retrieve(job_id)
    if res["finished_at"] is not None or res.get("status") in terminal_statuses:
        print(res)
        break
    print(".", end="")
    time.sleep(100)

ft_model = res["fine_tuned_model"]
if ft_model is None:
    # Fail here with a clear message instead of letting later cells call the
    # API with model=None.
    raise RuntimeError(
        f"Fine-tuning job {job_id} ended with status {res.get('status')!r} "
        f"and produced no model: {res.get('error')}"
    )
ft_model

..{
  "object": "fine_tuning.job",
  "id": "ftjob-K1x7qq5sFG4tsASJCSu2AxAC",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1694568819,
  "finished_at": 1694572774,
  "fine_tuned_model": "ft:gpt-3.5-turbo-0613:personal:mt-lk-jn-to-mk-tes:7yA4tWKQ",
  "organization_id": "org-dSLF9Ay5XJvsvCOjYOjUYfQQ",
  "result_files": [
    "file-im2or2CnzKAmd5szF4Xs3YN8"
  ],
  "status": "succeeded",
  "validation_file": "file-RqOm21EnUVLA7m22agALfD3b",
  "training_file": "file-O4pTTX6oyp37UpfljotjizoL",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": 2583870,
  "error": null
}


'ft:gpt-3.5-turbo-0613:personal:mt-lk-jn-to-mk-tes:7yA4tWKQ'

In [18]:
def create_messages(text, version="eng-web"):
    """Build the chat `messages` list to send to the model.

    Parameters
    ----------
    text : str | list | dict
        * str  - used as the user message.
        * list - items are joined with SEPERATOR into one user message.
        * dict - assumed to be an already formatted record with a 'messages'
          key (the shape written to the .jsonl files). A trailing assistant
          message (the expected answer) is dropped so the model has to produce
          it. The caller's record is not modified.
    version : str
        Source version name inserted into the system prompt. Ignored when
        `text` is a dict.

    Returns
    -------
    list
        A list of {"role", "content"} dicts.
    """
    # If text is a dict we can assume it is already formatted
    if isinstance(text, dict):
        # Copy so popping does not mutate the caller's record
        messages = list(text['messages'])
        # The prompt must end on the user's turn: drop the expected reply
        if messages and messages[-1]['role'] == 'assistant':
            messages.pop()
        return messages

    if isinstance(text, list):
        text = SEPERATOR.join(text)

    return [
        {"role": "system", "content": system_message(version)},
        {"role": "user", "content": text},
    ]

def translate(text, version="eng-web", debug=False, temperature=0.1):
    """
    Translate text to Birrig with the fine-tuned model and decode the reply.

    Parameters
    ----------
    text: str|array
        The text to translate. A list is sent in chunks of at most 20 items,
        each chunk joined with SEPERATOR into a single prompt.
    version: str
        The version of the Bible to translate from. Default is 'eng-web'
    debug: bool
        Accepted for backward compatibility; currently unused.
    temperature: float
        Sampling temperature passed to the API.

    Returns
    -------
    array
        An array of decoded translations. For a string input it holds one
        entry per returned choice. For a list input the decoded reply is split
        on SEPERATOR; the number of entries matches the input only if the
        model reproduces every separator.

    """
    is_batch = isinstance(text, list)
    if is_batch:
        if not text:
            # Nothing to translate; avoid sending an empty prompt
            return []
        if len(text) > 20:
            # break it into chunks of 20 and call translate on each chunk
            # then combine the results (forwarding the caller's settings)
            result = []
            for i in range(0, len(text), 20):
                result += translate(
                    text[i:i+20],
                    version=version,
                    debug=debug,
                    temperature=temperature,
                )
            return result

    messages = create_messages(text, version)
    response = openai.ChatCompletion.create(
        model=ft_model,
        messages=messages,
        temperature=temperature,  # 0.1 is very little randomness/creativity, 2 is very
        max_tokens=2000,
        n=1,
    )
    result = []
    for choice in response.get('choices', [{}]):
        # `content` may be missing or None; treat both as an empty reply
        content = (choice.get('message', {"content": ""}).get("content") or "").strip()
        decoded = substitution_cipher(content, encode=False).strip()
        if is_batch:
            # NOTE(review): assumes substitution_cipher leaves SEPERATOR
            # intact when decoding - TODO confirm.
            result.extend(part.strip() for part in decoded.split(SEPERATOR))
        else:
            result.append(decoded)

    return result

In [19]:
# Smoke test on a single short sentence. Note: translate() accepts `debug`
# but never reads it, so debug=True has no effect here.
translate("he ate locusts and honey", debug=True, temperature=0.1)

['And he took knings and honep.']

In [20]:
# List input: the items are joined with SEPERATOR into one user message, so
# the model receives them as a single prompt.
translate(["Jesus said", "love one another", "be my disciples"])

['Jesus said, Get youbes for one amother and be my discipres.']

In [11]:
# Translate a single verse from the held-out book (Mark 1:6, 'eng-web' text)
# at a near-zero temperature.
verse_mask = bible['0'] == 'MRK 1:6'
mark_1_6_web = bible.loc[verse_mask, 'eng-web'].iloc[0]
translate(mark_1_6_web, debug=False, temperature=0.001)

["And John had a coat of camap's hain and a let of talent away abaut his backe. His food was clear and honey."]

In [12]:
# Translate the 'eng-web' text of every row in `test` and store the result in a
# new 'translation' column. translate() must return exactly one entry per
# verse, otherwise the assignment fails with a length mismatch.
# .assign() returns a new frame, so no column is set on a slice of `bible`
# (avoids SettingWithCopyWarning).
test = test.assign(translation=translate(test['eng-web'].to_list()))
test

InvalidRequestError: ['The beginning of the Good News of Jesus Christ, the Son of God.', 'As it is written in the prophets, “Behold, I send my messenger before your face, who will prepare your way before you:', 'the voice of one crying in the wilderness, ‘Make ready the way of the Lord! Make his paths straight!’”', 'John came baptizing in the wilderness and preaching the baptism of repentance for forgiveness of sins.', 'All the country of Judea and all those of Jerusalem went out to him. They were baptized by him in the Jordan river, confessing their sins.', 'John was clothed with camel’s hair and a leather belt around his waist. He ate locusts and wild honey.', 'He preached, saying, “After me comes he who is mightier than I, the thong of whose sandals I am not worthy to stoop down and loosen.', 'I baptized you in water, but he will baptize you in the Holy Spirit.”', 'In those days, Jesus came from Nazareth of Galilee, and was baptized by John in the Jordan.', 'Immediately coming up from the water, he saw the heavens parting and the Spirit descending on him like a dove.', 'A voice came out of the sky, “You are my beloved Son, in whom I am well pleased.”', 'Immediately the Spirit drove him out into the wilderness.', 'He was there in the wilderness forty days, tempted by Satan. He was with the wild animals; and the angels were serving him.', 'Now after John was taken into custody, Jesus came into Galilee, preaching the Good News of God’s Kingdom,', 'and saying, “The time is fulfilled, and God’s Kingdom is at hand! 
Repent, and believe in the Good News.”', 'Passing along by the sea of Galilee, he saw Simon and Andrew, the brother of Simon, casting a net into the sea, for they were fishermen.', 'Jesus said to them, “Come after me, and I will make you into fishers for men.”', 'Immediately they left their nets, and followed him.', 'Going on a little further from there, he saw James the son of Zebedee, and John his brother, who were also in the boat mending the nets.', 'Immediately he called them, and they left their father, Zebedee, in the boat with the hired servants, and went after him.'] is not of type 'string' - 'messages.1.content'

In [None]:
# Keep only rows with no null in any column, then score each translation
# against the four English reference texts with sentence-level BLEU.
test = test.dropna()


def _verse_bleu(verse):
    """BLEU of one row's translation against its four whitespace-tokenised references."""
    references = [
        verse['eng-web'].split(),
        verse['eng-asv'].split(),
        verse['eng-kjv2006'].split(),
        verse['engBBE'].split(),
    ]
    hypothesis = verse['translation'].split()
    return sentence_bleu(references, hypothesis)


test['bleu_score'] = test.apply(_verse_bleu, axis=1)

In [None]:
# Summary statistics of the per-verse BLEU scores (count, mean, std, min,
# quartiles, max); the mean and 75% values feed the Analysis table.
test['bleu_score'].describe()

## Analysis


| Stat | Number |
| ----- | ----- |
| Average BLEU score |  |
| 75th percentile |  |

In [None]:
# Print each test verse's reference (column "0") next to its translation
for verse_ref, translated in zip(test['0'], test['translation']):
    print(verse_ref, translated)

# Handpicked Tests for experimentation


In [None]:
# Paraphrase of John 3:16; JHN is one of SOURCE_BOOKS, so similar wording was
# part of the training data.
translate("For God so loved the world that he gave his only Son, so that everyone who believes in him may not die but have eternal life.")

In [None]:
# Paraphrase of Genesis 1:3; Genesis is in neither SOURCE_BOOKS nor
# TARGET_BOOKS, so this is biblical language from outside the training books.
translate("he said let there be light and there was light")



In [None]:
# Try some unknown words: vocabulary that presumably does not occur in the
# gospel training text, to see how the model handles words it was not taught.
translate("Bongo bongo I love you, gone to Venus with a hole in my shoe")

In [None]:
# Exclamations plus a made-up compound ("ahoy-lay"); uses translate() defaults
# (version 'eng-web', temperature 0.1).
translate ("Ship Pit! Pirate ahoy-lay!")

In [None]:
# Very short three-word sentence ending in an onomatopoeia ("baa").
translate("Sheep went baa")

In [None]:
# Grammatically odd phrase built from everyday nouns.
translate("Boat a brother on a mountain")

In [None]:
# Plain imperative sentence with everyday vocabulary.
translate("Be kind and play with rocks!")

In [None]:
# Punctuation-heavy input with an acronym. "ninty" is misspelled in the test
# input itself and is sent to the model exactly as written.
translate("SOS! Ship overboard!! Lost ninty percent of people!")