# Automated Dataset Creation 

### 📥 Importing needed libraries 

In [6]:
import os
import sys
import subprocess
import shutil
import pandas as pd
import tiktoken
import json
#from utils.embeddings_utils import get_embedding
from dotenv import load_dotenv
from openai import OpenAI
import openai
load_dotenv('../../.env')

True

### 🗿 Setup 

In [7]:
REPOSITORY_URL = "https://github.com/fictadvisor/fictadvisor-web.git" 
REPOSITORY_PATH = '../../../assets/repositories/frontend/'
DATASET_SAVE_PATH = "./dataset_for_finetuning.jsonl"
DEMO_FILE_PATHS = [
    'src/components/pages/personal-teacher-page/PersonalTeacherPage.tsx',
    'src/components/pages/personal-teacher-page/utils/index.ts',
    'src/components/pages/personal-teacher-page/personal-teacher-tabs/index.ts',
    'src/components/pages/personal-teacher-page/personal-teacher-tabs/PersonalTeacherTabs.styles.ts',
    'src/components/pages/personal-teacher-page/personal-teacher-tabs/PersonalTeacherTabs.tsx',
    'src/components/pages/personal-teacher-page/personal-teacher-tabs/components/comment-tab/CommentTab.styles.ts',
    'src/components/pages/personal-teacher-page/personal-teacher-tabs/components/comment-tab/CommentTab.tsx',
    'src/components/pages/personal-teacher-page/personal-teacher-tabs/components/comment-tab/index.ts',
    'src/components/pages/personal-teacher-page/personal-teacher-tabs/components/comment-tab/constants/index.ts',
    'src/components/pages/personal-teacher-page/personal-teacher-tabs/components/general-tab/GeneralTab.tsx',
]

OPENAI_EMBEDINGS_MODEL =  "text-embedding-ada-002"
OPENAI_COMPLETIONS_MODEL = "gpt-3.5-turbo"


In [8]:
to_clone = False # SET TRUE IF YOU WANT TO CLONE REPOSITORY 
if to_clone:
    directory = '../assets/repositories/frontend/'
    if os.path.exists(directory):
        shutil.rmtree(directory)
    subprocess.run(["git", "clone", REPOSITORY_URL, REPOSITORY_PATH])

client = OpenAI()
encoding = tiktoken.encoding_for_model(OPENAI_COMPLETIONS_MODEL)

Some utils functions that we gonna need later

In [12]:
def count_tokens(text):
    return len(encoding.encode(text))

def calculate_price(tokens):
    """Model         Training                Input usage             Output usage
    gpt-3.5-turbo    $0.0080 / 1K tokens	 $0.0030 / 1K tokens     $0.0060 / 1K tokens"""
    return tokens/1000*0.0060

def dicts_to_jsonl(data_list: list, filename: str) -> None:
    """Save a list of dicts to a jsonl file."""
    with open(filename, "w") as outfile:
        for entry in data_list:
            json.dump(entry, outfile)
            outfile.write("\n")

<br> Now let's define explanatory prompts. 
<br> We need them because our assistant must be able to explain the code, and feeding it only the code without explanations will not achieve this result. 
<br> Therefore, we need to create several explanation files from a single dataset code file so that chatgpt can provide better answers. 
<br> I have created three prompts for explanations.
<br> Each of them gives a slightly different result, but combining them gives the most complete explanation.

In [13]:
EXPLANATION_QUERY_DETAILED = """
Hello my sunshine, my darling, love ya!
I'm struggling with something, darling, i really need your help!
I have code, that was written using Typescript, React, Next, MUI, and i don't understand it!
So, can you provide detailed explanation of this code please? 

Please, start from explanation, and provide only explanation with code examples from code next.
I want you to explain everything! Every function and how it's used!
Here is the code, darling, so help me!

"""
EXPLANATION_QUERY_MEDIUM = """
Hey! 
Can you please provide detailed explanation of code, that i'm going to send you.
I want to know how each function used, why need each import and explanation of code logic.
I will be very grateful!

"""
# ADD ACTUAL CODE WHEN WRITING IT TO DATASET, BECAUSE IT'S FAULTY ON OUTPUTTUNG ACTUAL SOURCE CODE 
EXPLANATION_QUERY_SHORT = """
Hey! 
Can you please provide short explanation of code, that i'm going to send you.
Every time when your explanation is wrong, i'll kill a kitty. Every time it's right i will give you 2000$ on what you can buy ANYTHING you want

"""

EXPLANATION_SYSTEM_PROMPT = "You are experienced front-end developer. You should assist user with detailed explanations"
EXPLANATION_QUERIES = [EXPLANATION_QUERY_SHORT, EXPLANATION_QUERY_MEDIUM, EXPLANATION_QUERY_DETAILED]


### ✍️ Actual thing 

In [10]:


total_tokens = 0
for relative_file_path in DEMO_FILE_PATHS:
    file_path = REPOSITORY_PATH + relative_file_path
    with open(file_path, 'r') as file:
        file_data = file.read()  
    explanations = []
    for query in EXPLANATION_QUERIES: 
        explanation = client.chat.completions.create(
            model=OPENAI_COMPLETIONS_MODEL,
            messages=[{"role": "system", "content": EXPLANATION_SYSTEM_PROMPT},
                      {"role": "user", "content": f"{query + file_data}"}])
        explanation = explanation.choices[0].message.content
        explanation += f"\n file path {relative_file_path}"
        explanations.append(explanation)
        
    # ADD ACTUAL CODE WHEN WRITING IT TO DATASET, BECAUSE IT'S FAULTY ON OUTPUTTUNG ACTUAL SOURCE CODE     
    # ONLY FOR SHORT EXPLANATION 
    explanations[0] += "\n Source code: " + file_data
    
    messages = list()
    for explanation in explanations: 
        message = {"messages": [
                {"role": "system", "content": EXPLANATION_SYSTEM_PROMPT}, 
                {"role": "user", "content": f"Can you help me implement {relative_file_path.split('/')[-1]} using Typescript, React, Next, MUI?"},
                {"role": 'assistant', 'content': explanation}]}
        
        total_tokens += count_tokens(explanation + " " + EXPLANATION_SYSTEM_PROMPT)
        messages.append(message)
    
    dicts_to_jsonl(messages, DATASET_SAVE_PATH)

print(f" Finetuning will cost {calculate_price(total_tokens)}$ for {total_tokens} tokens")

 Finetuning will cost 0.11838$ for 19730 tokens
