In [None]:
#————————————————————

# Name: OpenAI Assistant (V1)

# Purpose:
# This notebook will create a GPT Assistant using OpenAI's API and provide it with the training dataframe returned by the data engineer Assistant and a set of instructions for creating an "Extra Trees" Random Forest. Basic outline of the instructions for the modeler:

# 1. Load the provided dataframe into a pandas df.
# 2. Split the data set into training and testing using a 75:25 split.
# 3. Train an Extra Trees random forest with 2000 trees.
# 4. Use the testing data to measure the model's accuracy, precision, recall, and generate a confusion matrix.
# 5. Return the results in a single csv table. 

# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch)
# Created for: SDSC 2024
# Date Created: 22.01.2024
# Last Updated: 22.01.2024
# Python Version: 3.10.4

#General Sources:
#https://platform.openai.com/docs/api-reference?lang=python
#https://medium.com/ai-advances/complete-process-for-fine-tuning-gpt-3-5-turbo-using-openai-api-db4a50b3de1a

#Openai Usage:
#https://platform.openai.com/usage

#Additionals:
# - https://platform.openai.com/docs/models
# - https://openai.com/pricing

# Download Python packages (run the below command in terminal if packages have not yet been installed)
#pip install -r C:\Python\openai-lab\support\requirements\requirements.txt

#————————————————————

In [None]:
# Import required libraries
import os
from dotenv import load_dotenv, find_dotenv
from pathlib import Path
import openai
from openai import OpenAI

import openai
import time
import ipywidgets as widgets
from IPython.display import display
import pandas as pd
from io import StringIO
import io
import json

In [2]:
## Define Functions

def read_and_save_file(first_file_id, file_name):
    """Download an OpenAI file, parse it as CSV and write a local copy.

    Args:
        first_file_id: ID of the file to fetch through the module-level
            ``client``.
        file_name: Path the parsed table is written to (index column omitted).

    Returns:
        The pandas DataFrame parsed from the downloaded content.

    Example:
        file = read_and_save_file(first_file_id, "analyst_output.csv")
    """
    # The content comes back as binary, so wrap the raw bytes in an
    # in-memory buffer that pandas can read like a file.
    raw_bytes = client.files.content(first_file_id).read()
    frame = pd.read_csv(io.BytesIO(raw_bytes))
    frame.to_csv(file_name, index=False)
    return frame
    
def files_from_messages(messages, asst_name):
    """Save every file attached to the first message of a thread as a CSV.

    Args:
        messages: Message list object whose ``data`` attribute holds the
            thread messages; only ``data[0]`` is inspected.
        asst_name: Prefix for the generated file names
            (``<asst_name>_output_<n>.csv``, numbered from 1).

    Returns:
        None. Each attachment is downloaded and written through
        ``read_and_save_file``.
    """
    # An empty message list previously surfaced as a bare IndexError;
    # report it explicitly instead.
    if not messages.data:
        print('no messages found, nothing to save')
        return
    first_thread_message = messages.data[0]  # Accessing the first ThreadMessage
    message_ids = first_thread_message.file_ids
    print(message_ids)
    if not message_ids:
        # Make the otherwise silent no-op visible: later cells expect the
        # saved CSV files to exist.
        print(f'no files attached to the first message, nothing saved for {asst_name}')
        return
    # Loop through each file ID and save the file with a sequential name
    for position, file_id in enumerate(message_ids, start=1):
        file_name = f"{asst_name}_output_{position}.csv"  # Generate a sequential file name
        read_and_save_file(file_id, file_name)
        print(f'saved {file_name}')

In [5]:
# Load the API key and build the client.
# The key is read from the environment (optionally populated from a local
# .env file) so that no secret is stored in the notebook itself.
load_dotenv(find_dotenv())
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Fail here with a clear message rather than at the first API call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file before running this notebook."
    )

# Instantiate the OpenAI client
client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# CSV handed to the data-engineer assistant; it is parsed locally here
# before being uploaded in the next cell.
asst_file = "tumor.csv"
df = pd.read_csv(filepath_or_buffer=asst_file)

In [None]:
# create the assistant and give it the CSV file

# Instruction text for the data-engineer assistant. It is passed as the
# `instructions` argument when the assistant is created, so every character
# inside the literal is part of what the model receives.
mls = '''
You are a data engineer who will work with data in a csv file in your files. 
When the user asks you to perform your actions, use the csv file to read the data into a pandas dataframe.
The data set is to be used for a classification model.
Execute each of the steps listed below in your ACTIONS section. The user will identify the target variable. 

ACTIONS:

1. Read the file data into a pandas DataFrame. 
2. Summarize each feature and the target variable in the data set and prepare the results as Table_1.
3. Check for missing values and impute the column mean for any missing values.
4. Create a two new feature interaction columns for each unique pair of variables, using multiplication for one interaction column and dividion for the other.
5. Run a logistic regression to predict the target variable with LASSO to select features. Use a lambda values of 1. 
6. Prepare the Lasso coefficient values as Table_2.
7. Prepare a final data set that only contains features with non-zero LASSO coefficients and the target variable as Table_3
8. Provide a summary paragraph explaining the preparation of the data set.
9. Prepare Table_1, Table_2, and Table_3 as csv files for download by the user. 

DO NOT:
1. Do not return any images. 
'''

# send the csv file to the assistant purpose files
# The context manager closes the local file handle once the upload call
# returns; the bare open() used before left it open.
with open(asst_file, "rb") as csv_handle:
    response = client.files.create(
        file=csv_handle,
        purpose="assistants"
    )
print(response)
file__id = response.id

# Create the data-engineer assistant with the code interpreter tool and
# attach the uploaded file to it.
my_assistant = client.beta.assistants.create(
    instructions=mls,
    name="engine_1",
    tools=[{"type": "code_interpreter"}],
    model="gpt-4-1106-preview", # gpt-4
    file_ids=[file__id]
)

# get the file id as reported back by the created assistant
fileId = my_assistant.file_ids[0]
print(my_assistant)

In [None]:
# make the request to the assistant

# Build the user prompt; the id of the attached file is embedded in the text.
message_string = f"Please execute your ACTIONS on the data stored in the csv file {fileId} . The Target variable is Class"
print(message_string)

# Step 2: open a new conversation thread
thread = client.beta.threads.create()

# Step 3: post the prompt to that thread as the user
message = client.beta.threads.messages.create(
    thread_id=thread.id, role="user", content=message_string
)

# Step 4: start a run of the assistant on the thread
# (an `instructions=` argument could be added here to overwrite the hard-coded instructions)
run = client.beta.threads.runs.create(
    thread_id=thread.id, assistant_id=my_assistant.id
)

print(run.model_dump_json(indent=4))

# Poll the run until it reaches a terminal state.
sec = 60  # polling interval in seconds
# Terminal states other than 'completed'. Without this check a run that
# fails, is cancelled or expires would keep the loop spinning forever.
unsuccessful_states = ('failed', 'cancelled', 'expired', 'incomplete')
while True:
    # Wait before (re)checking the run
    time.sleep(sec)
    # Retrieve the run status
    run_status = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id
    )
    print(f'{sec} seconds later...')
    if run_status.status in unsuccessful_states:
        raise RuntimeError(
            f"Run {run.id} ended with status '{run_status.status}': "
            f"{getattr(run_status, 'last_error', None)}"
        )
    # If run is completed, get messages
    if run_status.status == 'completed':
        messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )
        # Loop through messages and print content based on role
        for msg in messages.data:
            role = msg.role
            try:
                content = msg.content[0].text.value
                print(f"{role.capitalize()}: {content}")
            except (AttributeError, IndexError):
                # .text does not exist on non-text parts, and a message
                # may carry no content parts at all
                print(f"{role.capitalize()}: [Non-text content, possibly an image or other file type]")
        break

In [None]:
# Download the files attached to the first message in `messages`; they are
# written as engineer_output_<n>.csv.
asst_name = 'engineer'
files_from_messages(messages=messages, asst_name=asst_name)

In [None]:
# Read back and show the three CSV files saved by files_from_messages.
# The numeric suffix is the position of the file in the message's
# attachment list.
# NOTE(review): this assumes the assistant attached exactly three files, in
# Table_1, Table_2, Table_3 order — confirm against the printed file ids.
df1 = pd.read_csv('engineer_output_1.csv')
display(df1)

df2 = pd.read_csv('engineer_output_2.csv')
display(df2)

df3 = pd.read_csv('engineer_output_3.csv')
display(df3)

In [None]:
# Clean up the data-engineer assistant before the modeler is created
response = client.beta.assistants.delete(assistant_id=my_assistant.id)
print(response)

In [None]:
# Load and inspect the file that will be uploaded for the modeler.
# NOTE(review): engineer_output_1.csv is the first attachment saved from the
# engineer run, while the engineer instructions name the final modelling
# data set Table_3 — confirm this is the intended training file.
asst_file = "engineer_output_1.csv"
df = pd.read_csv(filepath_or_buffer=asst_file)

display(df)

In [None]:
# create the assistant and give it the CSV file

# Instruction text for the modeler assistant. It is passed as the
# `instructions` argument when the assistant is created, so every character
# inside the literal is part of what the model receives.
# NOTE(review): the text mentions "two csv files", but only one file id is
# attached when the assistant is created below — confirm which is intended.
mls = '''
You are a data scientist who will build a predictive model with data from two csv files uploaded to your files. 
When the user asks you to perform your actions, use the csv file to read the data into a pandas dataframe.
Then continue with each of the steps listed below in you ACTIONS. The user will identify the target variable. 

ACTIONS:

1. Load the engineer_output_1 csv file into a pandas df of the same name.
2. Split the data set into training and testing data sets with a 25% split.
3. Train an Extra Trees random forest with 2000 trees
4. Use the testing data to measure the models accuracy, presicion, recall, and confusion matrix.
5. Format the testing data results as a csv table and prepare it for download by the user. 

DO NOT:
1. Return any images. 
'''

# send the csv file to the assistant purpose files
# The context manager closes the local file handle once the upload call
# returns; the bare open() used before left it open.
with open(asst_file, "rb") as csv_handle:
    response = client.files.create(
        file=csv_handle,
        purpose="assistants"
    )
print(response)
file_1_id = response.id

# Create the modeler assistant with the code interpreter tool and attach
# the uploaded file to it.
my_assistant = client.beta.assistants.create(
    instructions=mls,
    name="modeler_1",
    tools=[{"type": "code_interpreter"}],
    model="gpt-4-1106-preview", # gpt-4
    file_ids=[file_1_id] # multiple files: file_ids=[file_1_id, file_2_id]
)

# get the file id as reported back by the created assistant
fileId = my_assistant.file_ids[0]
print(my_assistant)

In [None]:
# make the request to the assistant

# Build the user prompt; the id of the attached file is embedded in the text.
message_string = f"Please execute your ACTIONS on {fileId} and prepare the resulting table for csv download. The Target variable is Class"
print(message_string)

# Step 2: open a new conversation thread
thread = client.beta.threads.create()

# Step 3: post the prompt to that thread as the user
message = client.beta.threads.messages.create(
    thread_id=thread.id, role="user", content=message_string
)

# Step 4: start a run of the assistant on the thread
# (an `instructions=` argument could be added here to overwrite the hard-coded instructions)
run = client.beta.threads.runs.create(
    thread_id=thread.id, assistant_id=my_assistant.id
)

print(run.model_dump_json(indent=4))

# Poll the run until it reaches a terminal state.
sec = 60  # polling interval in seconds
# Terminal states other than 'completed'. Without this check a run that
# fails, is cancelled or expires would keep the loop spinning forever.
unsuccessful_states = ('failed', 'cancelled', 'expired', 'incomplete')
while True:
    # Wait in between tries
    time.sleep(sec)
    # Retrieve the run status
    run_status = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id
    )
    print('One eternity later...')
    if run_status.status in unsuccessful_states:
        raise RuntimeError(
            f"Run {run.id} ended with status '{run_status.status}': "
            f"{getattr(run_status, 'last_error', None)}"
        )
    # If run is completed, get messages
    if run_status.status == 'completed':
        messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )
        # Loop through messages and print content based on role
        for msg in messages.data:
            role = msg.role
            try:
                content = msg.content[0].text.value
                print(f"{role.capitalize()}: {content}")
            except (AttributeError, IndexError):
                # .text does not exist on non-text parts, and a message
                # may carry no content parts at all
                print(f"{role.capitalize()}: [Non-text content, possibly an image or other file type]")
        break

In [None]:
# Download the files attached to the first message in `messages`; they are
# written as modeler_output_<n>.csv.
asst_name = 'modeler'
files_from_messages(messages=messages, asst_name=asst_name)

In [None]:
# Read back and show the first CSV file saved from the modeler run.
df1 = pd.read_csv(filepath_or_buffer="modeler_output_1.csv")
display(df1)

In [None]:
# Clean up: delete the modeler assistant now that its output has been read

response = client.beta.assistants.delete(assistant_id=my_assistant.id)
print(response)