# Attempt to fine-tune GPT3.5 to translate between languages
The first language will be our made up language (see notes on get_bible as to why we think GPT is cheating)
The thesis is that fine-tuning will cause GPT3 to learn the fake language as a new language by assigning the embeddings of the
new words essentially to the same dimensional space as the english.  Should be trivial to translate it back as it is a word 
for word translation.



In [39]:
from lib.config import get_config
from lib.cipher import substitution_cipher
import json, random, time, os

# To install pip install pandas, openai, nltk
import pandas as pd
import openai # !pip install openai==0.27.9
from nltk.translate.bleu_score import sentence_bleu

FILENAME = "GPT3-5"
EPOCHS = 3  # Since we are repeating the data with different versions we don't want to overfit
SOURCE_BOOKS = ['MRK']
TARGET_BOOKS = ['MAT']
VERSIONS = ['eng-web', 'eng-asv', 'eng-kjv2006', 'hin2017','arbnav','latVUC','amo']
VERSION_LANGUAGES = ["English (WEB)", "English (ASV)", "English (KJV)", "Hindi", "Arabic", "Latin", "Amo"]
SPLIT_RATIO = 0.8
EXPERIMENT_NAME = "01_mk_to_mt"  # Max 18 characters
FILENAME = FILENAME + "_" + EXPERIMENT_NAME
SEPERATOR = "\n---\n"
TOTAL_SKIP_WORDS = 50  # Skip the x least common words to teach it how to behave on missing words

# set environment variable in ipython notebook
os.environ["OPENAI_API_KEY"] = get_config('openai')['api_key']
openai.api_key = os.getenv("OPENAI_API_KEY")

In [36]:
def system_message(source):
    return f"""
You are an expert translator, the user will ask you to translate from one language to another.
"""

def system_message_all(source):
    return f"""
You are an expert translator, the user will give you a verse to translate and various parallel translations. You need to translate it into Berrig.
"""



In [23]:
bible = pd.read_csv('data/bible.csv')

# Assuming the gospels have a lot of overlap include the synoptic gospels plus John to train
# then predict Mark (which should be easy as Matthew and Luke may have copied from him)
train = bible[bible['book'].isin(SOURCE_BOOKS)]
test = bible[bible['book'].isin(TARGET_BOOKS)]

In [34]:
# Determine the most common words in the train data
from collections import defaultdict
def get_words(df, min_frequency=0, min_length=0):
    word_count = defaultdict(int)
    for index, row in df.iterrows():
        for word in row['birrig'].split():
            # lowercase and remove punctation
            word = word.lower().strip('.,;!?')
            if len(word) >= min_length:
                word_count[word] += 1

    # Sort the words by frequency
    sorted_words = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    all_words = [(word[0], substitution_cipher(word[0], encode=False).strip(), word[1]) for word in sorted_words]
    
    # reduce words to those that appear more than min_frequency
    return [word for word in all_words if word[2] > min_frequency]

words = get_words(train, min_frequency=10, min_length=4)
print(len(words))
words

163


[('lxij', 'they', 257),
 ('zeow', 'said', 243),
 ('lxin', 'them', 194),
 ('holx', 'with', 155),
 ('hopp', 'will', 144),
 ('xemi', 'have', 121),
 ('lxel', 'that', 119),
 ('hivi', 'were', 116),
 ('keni', 'came', 112),
 ('svun', 'from', 108),
 ('hxir', 'when', 98),
 ('hirl', 'went', 97),
 ('kuni', 'come', 93),
 ('orlu', 'into', 88),
 ('yizaz', 'jesus', 86),
 ('gemi', 'gave', 78),
 ('ehej', 'away', 74),
 ('lxivi', 'there', 68),
 ('juav', 'your', 62),
 ('luuc', 'took', 62),
 ('lxiov', 'their', 62),
 ('newi', 'made', 62),
 ('leci', 'take', 62),
 ('lxuzi', 'those', 60),
 ('gomi', 'give', 59),
 ('hxel', 'what', 59),
 ('lxoz', 'this', 55),
 ('fikeazi', 'because', 48),
 ('hxokx', 'which', 48),
 ('qiuqpi', 'people', 47),
 ('efual', 'about', 46),
 ('lxorgz', 'things', 46),
 ('wozkoqpiz', 'disciples', 45),
 ('gviel', 'great', 43),
 ('zejorg', 'saying', 41),
 ('esliv', 'after', 40),
 ('neci', 'make', 35),
 ('ranfiv', 'number', 35),
 ('gomir', 'given', 33),
 ('fiir', 'been', 33),
 ('imop', 'evil', 33

In [43]:
def write_file(df, file_handlers, versions, split=None):
    
        

        results = []
        # Start off by teaching it some common words
        words = get_words(df, min_frequency=10, min_length=4)
        for word in words:
            results.append((random.random(), {
                "messages": [
                    {"role": "system", "content": system_message('English')},
                    {"role": "user", "content": f"Translate from English to Birrig\n{word[0]}"},
                    {"role": "assistant", "content": word[1]},
                ]
            }))

        skip_words = get_words(df)[TOTAL_SKIP_WORDS*-1:]

        # loop through the train and validate dataframes and add each row to a dataset
        for _, row in df.iterrows():
            # Loop through all the Bible Versions
            source_text = f"{row['book']} {row['chapter']}:{row['verse']}"
            berrig_text = row['birrig']
            for word in skip_words:
                berrig_text = berrig_text.replace(word[0], f"[{word[1]}]")

            for item in versions:
                language = VERSION_LANGUAGES[versions.index(item)]
                if not pd.isna(row[item]) and not pd.isna(row['birrig']):
                    # Create a GPT chat message we will teach GPT how to reply to
                    # Thus learning the new language
                    results.append((random.random(),{
                        "messages": [
                            {"role": "system", "content": system_message(item)},
                            {"role": "user", "content": f"Translate from {language} to Birrig\n{row[item]}"},
                            {"role": "assistant", "content": row['birrig']},
                        ]
                    }))

                    # Create a GPT chat message we will teach GPT how to reply to
                    # Thus learning the new language
                    results.append((random.random(),{
                        "messages": [
                            {"role": "system", "content": system_message(item)},
                            {"role": "user", "content": f"Translate from Birrig to {language}\n{row['birrig']}"},
                            {"role": "assistant", "content": row[item]},
                        ]
                    }))

                    results.append((random.random(),{
                        "messages": [
                            {"role": "system", "content": "You are an expert in the Bible and can quote scripture. Given a Bible verse quote it in Birrig"},
                            {"role": "user", "content": source_text},
                            {"role": "assistant", "content": row['birrig']},
                        ]
                    }))

                    source_texts = f"{item}\t{row[item]}\n"
            
            results.append((random.random(),{
                "messages": [
                    {"role": "system", "content": system_message_all(item)},
                    {"role": "user", "content": source_texts},
                    {"role": "assistant", "content": row['birrig']},
                ]
            }))
           
        
        results.sort(key=lambda x: x[0])
        
        # Remove trailing newline in file by this little cheat
        optional_newline = ["",""]
        for some_random_num, line in results:
            # Write to the file, if split to train (index 0) or validate (index 1)
            if split is None or some_random_num <= split or len(file_handlers) == 1:
                index = 0
            else:
                index = 1
            file_handlers[index].write(optional_newline[index] + json.dumps(line))
            optional_newline[index] = "\n"

with open(f'data/{FILENAME}_train.jsonl','w') as f1, open(f'data/{FILENAME}_validate.jsonl','w') as f2, open(f'data/{FILENAME}_test.jsonl','w') as f3:
    write_file(train, [f1,f2], VERSIONS, SPLIT_RATIO)
    write_file(test, [f3], VERSIONS)


In [44]:
openai.api_key = os.environ["OPENAI_API_KEY"]

files = {}
for part in ['train', 'validate']:
    try:
        res = openai.File.create(
            file=open(f'data/{FILENAME}_{part}.jsonl', "r"),
            purpose='fine-tune'
        )
        files[part] = res['id']
    except Exception as e:
        print(e, part, f'data/{FILENAME}_{part}.jsonl')

files

{'train': 'file-mbedXsPm4fEXEJkxvKGgS4Rq',
 'validate': 'file-VJF8UeJQmBYcqG4ZO0LxLUfi'}

In [45]:
while True:
    try:
        res = openai.FineTuningJob.create(
            training_file=files['train'],
            validation_file=files['validate'],
            model="gpt-3.5-turbo",
            hyperparameters={
                "n_epochs":EPOCHS,
            },
            suffix=EXPERIMENT_NAME[0:18],
        )
        job_id = res["id"]

        break
    except openai.error.InvalidRequestError as e:
        if "File 'file-" in str(e) and "' is still being processed and is not ready to be used for fine-tuning." in str(e):
            print("File is still being processed. Retrying in 30 seconds...")
            time.sleep(30)
        else:
            raise e
res

File is still being processed. Retrying in 30 seconds...
File is still being processed. Retrying in 30 seconds...
File is still being processed. Retrying in 30 seconds...
File is still being processed. Retrying in 30 seconds...
File is still being processed. Retrying in 30 seconds...
File is still being processed. Retrying in 30 seconds...
File is still being processed. Retrying in 30 seconds...
File is still being processed. Retrying in 30 seconds...
File is still being processed. Retrying in 30 seconds...


<FineTuningJob fine_tuning.job id=ftjob-nHgVBPiiQUxSxzk1r1tIiTZZ at 0x1232469a0> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-nHgVBPiiQUxSxzk1r1tIiTZZ",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1694652725,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-dSLF9Ay5XJvsvCOjYOjUYfQQ",
  "result_files": [],
  "status": "created",
  "validation_file": "file-VJF8UeJQmBYcqG4ZO0LxLUfi",
  "training_file": "file-mbedXsPm4fEXEJkxvKGgS4Rq",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": null,
  "error": null
}

In [47]:

while True:
    res = openai.FineTuningJob.retrieve(job_id)
    if res["finished_at"] != None:
        print(res)
        break
    else:
        print(".", end="")
        time.sleep(100)

ft_model = res["fine_tuned_model"]
ft_model

{
  "object": "fine_tuning.job",
  "id": "ftjob-nHgVBPiiQUxSxzk1r1tIiTZZ",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1694652725,
  "finished_at": 1694656988,
  "fine_tuned_model": "ft:gpt-3.5-turbo-0613:personal:01-mk-to-mt:7yVzBeO4",
  "organization_id": "org-dSLF9Ay5XJvsvCOjYOjUYfQQ",
  "result_files": [
    "file-6PUB4drqe2Km8e9lnOp6Tp4w"
  ],
  "status": "succeeded",
  "validation_file": "file-VJF8UeJQmBYcqG4ZO0LxLUfi",
  "training_file": "file-mbedXsPm4fEXEJkxvKGgS4Rq",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": 4051686,
  "error": null
}


'ft:gpt-3.5-turbo-0613:personal:01-mk-to-mt:7yVzBeO4'

In [48]:
def create_messages(text, version="eng-web"):
    # if text is a dict we can assume they already formatted it
    if isinstance(text, dict):
        # make sure the last item in messages is not a user message
        if text['messages'][-1]['role'] == 'user':
            text['messages'].pop()
        return text

    if isinstance(text, list):
        text = SEPERATOR.join(text)
    
    line = {
        "messages": [
            {"role": "system", "content": system_message(version)},
            {"role": "user", "content": text},
        ]
    }
    return line['messages']

def translate(text, version="eng-web", debug=False, temperature=0.1):
    """
    Translate text to Birrig
    
    Parameters
    ----------
    text: str|array
        The text to translate. If an array is passed, each element will be translated.
    version: str
        The version of the Bible to translate from. Default is 'eng-web'

    Returns
    ------- 
    array
        An array of translations
        
    """
    if isinstance(text, list):
        if len(text) > 20:
            # break it into chunks of 20 and call translate on each chunk
            # then combine the results
            result = []
            for i in range(0, len(text), 20):
                result += translate(text[i:i+20])
            return result

    messages = create_messages(text, version)
    response = openai.ChatCompletion.create(
        model=ft_model,
        messages=messages,
        temperature=temperature,  # 0.1 is very little randomness/creativity, 2 is very
        max_tokens=2000,
        n=1,
        #logprobs=debug and 5 or 0,
    )
    result = []
    for choice in response.get('choices',[{}]):
        text = choice.get('message', {"content":""}).get("content").strip()
        decoded = substitution_cipher(text, encode=False).strip()
        if isinstance(text, list):
            decoded = decoded.split(SEPERATOR)
        result.append(decoded)
        
    
    return result

In [50]:
translate("he ate locusts and honey", debug=True, temperature=0.1)

['And he said, I have prayer of word and some, and will take no food because it has been orderle of the time when it come.']

In [51]:
translate(["Jesus said", "love one another", "be my disciples"])

['And he said to them, Give spir of men, because I have given you spir of men.']

In [11]:
translate(bible[bible['0']=='MRK 1:6'][['eng-web']].values[0][0], debug=False, temperature=0.001)

["And John had a coat of camap's hain and a let of talent away abaut his backe. His food was clear and honey."]

In [12]:
# Get the column eng-web from the first 3 rows of validate then send it to translate
test['translation'] = translate(test['eng-web'].to_list())
test

InvalidRequestError: ['The beginning of the Good News of Jesus Christ, the Son of God.', 'As it is written in the prophets, “Behold, I send my messenger before your face, who will prepare your way before you:', 'the voice of one crying in the wilderness, ‘Make ready the way of the Lord! Make his paths straight!’”', 'John came baptizing in the wilderness and preaching the baptism of repentance for forgiveness of sins.', 'All the country of Judea and all those of Jerusalem went out to him. They were baptized by him in the Jordan river, confessing their sins.', 'John was clothed with camel’s hair and a leather belt around his waist. He ate locusts and wild honey.', 'He preached, saying, “After me comes he who is mightier than I, the thong of whose sandals I am not worthy to stoop down and loosen.', 'I baptized you in water, but he will baptize you in the Holy Spirit.”', 'In those days, Jesus came from Nazareth of Galilee, and was baptized by John in the Jordan.', 'Immediately coming up from the water, he saw the heavens parting and the Spirit descending on him like a dove.', 'A voice came out of the sky, “You are my beloved Son, in whom I am well pleased.”', 'Immediately the Spirit drove him out into the wilderness.', 'He was there in the wilderness forty days, tempted by Satan. He was with the wild animals; and the angels were serving him.', 'Now after John was taken into custody, Jesus came into Galilee, preaching the Good News of God’s Kingdom,', 'and saying, “The time is fulfilled, and God’s Kingdom is at hand! Repent, and believe in the Good News.”', 'Passing along by the sea of Galilee, he saw Simon and Andrew, the brother of Simon, casting a net into the sea, for they were fishermen.', 'Jesus said to them, “Come after me, and I will make you into fishers for men.”', 'Immediately they left their nets, and followed him.', 'Going on a little further from there, he saw James the son of Zebedee, and John his brother, who were also in the boat mending the nets.', 'Immediately he called them, and they left their father, Zebedee, in the boat with the hired servants, and went after him.'] is not of type 'string' - 'messages.1.content'

In [None]:
# drop rows that have a null value in any column
test = test.dropna()
test['bleu_score'] = test.apply(lambda row: sentence_bleu([row['eng-web'].split(), row['eng-asv'].split(), row['eng-kjv2006'].split(),row['engBBE'].split()], row['translation'].split()), axis=1)

In [None]:
# describe the bleu score column
test['bleu_score'].describe()

## Analysis


| Stat | Number
| ----- | ----- |
| Average bleu score |  |
| 75 percentile |  |

In [None]:
# foreach row in validate, print the column "0" and translation
for index, row in test.iterrows():
    print(row['0'], row['translation'])

# Handpicked Tests for experimentation


In [52]:
translate("For God so loved the world that he gave his only Son, so that everyone who believes in him may not die but have eternal life.")

['For God had sent isportor the worly for the fire that he will get evil in now before without evilent:']

In [53]:
translate("he said let there be light and there was light")



['And He said, Take thene be flice: and thene welfing had for ofdorder.']

In [54]:
# Try some unknown words
translate("Bongo bongo I love you, gone to Venus with a hole in my shoe")

['Tavat ma quest, I saat, Go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go only, go

In [55]:
translate ("Ship Pit! Pirate ahoy-lay!")

['¡Je heurs! he has been given the king because of the God!']

In [56]:
translate("Sheep went baa")

['An of the seate, the spouns said, Prispas! prispas!']

In [57]:
translate("Boat a brother on a mountain")

In [None]:
translate("Be kind and play with rocks!")

In [None]:
translate("SOS! Ship overboard!! Lost ninty percent of people!")