In [24]:
import pandas as pd
import os
import json
from sklearn.model_selection import train_test_split

In [2]:
# Location of the saved-reports export.
save_reports = './data/reports_saved.csv'

# Read the export, keeping only the report name and its definition column.
df = pd.read_csv(save_reports)[['name', 'definition']]
df

Unnamed: 0,name,definition
0,Abbrev by day,"{""performance_summary"":{""cols"":[{""id"":""fact_im..."
1,Full Performance Summary,"{""performance_summary"":{""cols"":[{""id"":""fact_im..."
2,Abbrev by merchant,"{""performance_summary"":{""cols"":[{""id"":""fact_im..."
3,FPS By Day,"{""performance_summary"":{""cols"":[{""id"":""fact_im..."
4,FPS by Affiliate Default Website,"{""performance_summary"":{""cols"":[{""id"":""fact_im..."
...,...,...
1430,test report for phpunit,"{""performance_summary"":{""cols"":[{""id"":""fact_im..."
1431,test report for phpunit,"{""performance_summary"":{""cols"":[{""id"":""fact_im..."
1432,test report for phpunit,"{""performance_summary"":{""cols"":[{""id"":""fact_im..."
1433,test report for phpunit,"{""performance_summary"":{""cols"":[{""id"":""fact_im..."


In [3]:
def remove_duplicate_definitions(df):
    """Return the rows of ``df`` whose 'definition' value occurs exactly once.

    Every row whose serialized definition appears more than once is dropped:
    all copies are removed, not only the repeats (``duplicated(keep=False)``).

    Args:
        df: DataFrame with a 'definition' column of JSON-serializable values.

    Returns:
        A new DataFrame with the same columns as ``df``. ``df`` itself is not
        modified.
    """
    # Serialize each definition so values can be compared for equality
    # (sort_keys makes dicts with the same items compare equal). Kept as a
    # standalone Series so no helper column is written into the caller's frame.
    definition_str = df['definition'].apply(lambda x: json.dumps(x, sort_keys=True))

    # True for every row whose definition occurs more than once.
    is_repeated = definition_str.duplicated(keep=False)

    # Copy so later in-place edits of the result cannot reach back into ``df``.
    return df[~is_repeated].copy()

# Keep only the reports whose definition is not shared with any other report.
df_unique = remove_duplicate_definitions(df)

In [4]:
# Display the de-duplicated reports.
df_unique

In [6]:
# Rename 'definition' to 'code'. Rebinding to the returned frame (instead of
# inplace=True) avoids mutating an object other cells may still reference.
df_unique = df_unique.rename(columns={'definition': 'code'})

In [29]:
# System prompt describing how to turn a report-definition JSON object into a
# human-readable query.
# NOTE(review): instruction 8 spells "Include" as "Inlude". The literal is left
# untouched here because it is runtime text; fix it deliberately if desired.
system_prompt = """
Title: Human Readable Query From JSON

Objective: Generate a human-readable query from the provided JSON object which represents a data request for a performance summary report by affiliate.

JSON Structure Overview:
- report_name: Specifies the main objective of the query.
- filters: Contains date or time-related information and specific entities being queried.
- cols: Contains metrics and dimensions to be included in the report.
- format, limit, offset: Specifies how the data should be formatted and how many records are to be displayed.

Instructions:

1. Identify the Main Objective:
   - Extract the value of the report_name key to understand the main objective of the query.

2. Identify Date or Time Frame:
   - Look for the filters key and extract date or time-related information. Check for keys like dim_date-mm_dd_yyyy, op, and period to understand the time frame being queried.

3. Identify Specific Entities:
   - Within the filters key, look for specific entities being queried, like dim_merchant-merchant_uuid and extract the value to identify the specific merchant being queried.

4. Identify Metrics and Dimensions:
   - Look for the cols key and extract the metrics and dimensions to be included in the report. Metrics and dimensions usually have keys like name and alias for human-readable references.

5. Identify Format and Limits:
   - Extract the values of keys like format, limit, and offset to understand how the data should be formatted and how many records are to be displayed.

6. Compose the Query:
   - Start with the main objective, mention the date or time frame, specify the entities, list down the metrics and dimensions, and mention the format and limits.

7. Review and Refine:
   - Review the composed query to ensure it captures all the necessary information from the JSON.
   - Refine the wording to ensure clarity and readability.

8. Do Not Inlude Anything besides the Query in your response.

9. Do Not Specify the data must be returned in JSON format, it is implied.
"""

In [16]:
# Order the reports by name (ascending, the sort_values default).
df_unique = df_unique.sort_values(by='name')

In [25]:
# Shuffle and split the reports into two equal halves; the fixed random_state
# makes the split repeatable for the same input order.
df1, df2 = train_test_split(df_unique, test_size=0.5, random_state=42, shuffle=True)

In [68]:
import time
import openai
from tqdm import tqdm
import pandas as pd

class TokenManager:
    """Track token consumption against a per-minute budget.

    The per-window counter is zeroed once at least 60 seconds have elapsed
    since the stored window timestamp; a lifetime total is kept separately.

    Args:
        maximum_tokens_per_minute: Budget available within one window.
    """

    def __init__(self, maximum_tokens_per_minute=10000):
        self.maximum_tokens_per_minute = maximum_tokens_per_minute
        # Lifetime total across all windows; never reset.
        self.total_tokens_used = 0
        # Consumption within the current window.
        self.tokens_utilized_in_the_last_minute = 0
        # Despite the name, this is only refreshed when the window is reset,
        # so it marks the start of the current window.
        self.timestamp_of_the_last_request = time.time()

    def _reset_tokens_if_needed(self):
        """Start a new window when the current one is at least 60 seconds old."""
        now = time.time()
        if now - self.timestamp_of_the_last_request < 60:
            return
        self.tokens_utilized_in_the_last_minute = 0
        self.timestamp_of_the_last_request = now

    def calculate_tokens_available(self):
        """Return the budget remaining in the current window."""
        self._reset_tokens_if_needed()
        return self.maximum_tokens_per_minute - self.tokens_utilized_in_the_last_minute

    def register_tokens_used(self, tokens_used):
        """Add ``tokens_used`` to the window counter and the lifetime total.

        Raises:
            ValueError: If ``tokens_used`` is not a non-negative ``int``.
        """
        is_valid_count = isinstance(tokens_used, int) and tokens_used >= 0
        if not is_valid_count:
            raise ValueError("Invalid token count")

        self._reset_tokens_if_needed()
        self.tokens_utilized_in_the_last_minute += tokens_used
        self.total_tokens_used += tokens_used



In [69]:

class OpenAIInteraction:
    """Send DataFrame rows to an OpenAI chat model and append results to a CSV.

    The output CSV doubles as a checkpoint: rows already present in it are
    skipped, so an interrupted run can be resumed by calling the same
    processing method again.

    Args:
        token_manager: Object providing ``calculate_tokens_available()``,
            ``register_tokens_used(n)`` and ``total_tokens_used``; its
            ``maximum_tokens_per_minute`` attribute is used when present.
        output_csv: Path of the CSV file that results are appended to.
        model: Chat model name passed to the API.
        min_tokens_per_request: Tokens that must be available before a
            request is sent. Defaults to the previously hard-coded 2500.
    """

    def __init__(self, token_manager, output_csv, model="gpt-4", min_tokens_per_request=2500):
        self.token_manager = token_manager
        self.model = model
        self.output_csv = output_csv
        self.min_tokens_per_request = min_tokens_per_request

    def _ensure_output_csv(self, columns):
        """Create the output CSV with a header row if it does not exist yet."""
        if not os.path.exists(self.output_csv):
            # Written through pandas so column names containing commas or
            # quotes are escaped and read back as a single field.
            header = pd.DataFrame(columns=list(columns) + ['output'])
            header.to_csv(self.output_csv, index=False)

    def request_response(self, code_snippet, system_prompt, docstring=None):
        """Send one chat completion request, waiting for token budget first.

        Args:
            code_snippet: Text sent as the user message.
            system_prompt: Text sent as the system message.
            docstring: Unused; kept so existing callers keep working.

        Returns:
            The API response, or ``None`` if the request raised.
        """
        # Never wait for more tokens than the per-minute budget can hold;
        # otherwise a budget below the threshold would block forever.
        budget = getattr(self.token_manager, 'maximum_tokens_per_minute', None)
        required = self.min_tokens_per_request
        if budget is not None:
            required = min(required, budget)

        while self.token_manager.calculate_tokens_available() < required:
            time.sleep(1)
            print('Waiting for tokens to be available...')

        messages = [
            {"role": "system", "content": f"{system_prompt}"},
            {"role": "user", "content": code_snippet}
        ]

        try:
            response = openai.ChatCompletion.create(
                model=self.model,
                messages=messages
            )
            tokens_used = response['usage']['total_tokens']
            self.token_manager.register_tokens_used(tokens_used)
            print(f'Response received, {tokens_used} tokens used, {self.token_manager.total_tokens_used} tokens used in total this run.')
            return response
        except Exception as e:
            # Deliberately best-effort: one failed row must not abort a long
            # batch. The caller sees None and reports the row as failed.
            print(f"An error occurred: {str(e)}")
            return None

    def process_code_snippets(self, dataframe, target_col, system_prompt):
        """Send ``dataframe[target_col]`` of every row to the model.

        Rows whose ``target_col`` value already appears in the output CSV are
        skipped. Note that the whole API response object is written to the
        'output' column here, not just the message text.

        Args:
            dataframe: Rows to process.
            target_col: Column whose value is sent as the user message.
            system_prompt: System message used for every request.
        """
        if target_col not in dataframe.columns:
            print(f"Error: {target_col} does not exist in the dataframe")
            return

        self._ensure_output_csv(dataframe.columns)
        output_columns = dataframe.columns.tolist() + ['output']

        total_snippets = len(dataframe)
        print(f'Processing {total_snippets} code snippets...')

        for idx, row in tqdm(dataframe.iterrows(), total=total_snippets):
            # Re-read the checkpoint every iteration so rows written earlier
            # in this same run are also treated as processed.
            output_df = pd.read_csv(self.output_csv)
            if (output_df[target_col] == row[target_col]).any():
                print(f'Skipping row {idx}, already processed.')
                continue

            response = self.request_response(row[target_col], system_prompt)
            if response:
                row_output = pd.DataFrame([row.tolist() + [response]], columns=output_columns)
                row_output.to_csv(self.output_csv, mode='a', header=False, index=False)
            else:
                print(f'Failed to process row {idx}.')

    def aggregate_row_data(self, row, columns=None):
        """Render ``row`` as one ``"column: value"`` line per column.

        Args:
            row: Row to render (indexable by column name).
            columns: Columns to include, in order; defaults to all of ``row``.

        Returns:
            The lines joined with newlines.
        """
        if columns is None:
            columns = row.index.tolist()
        return '\n'.join([f"{col}: {row[col]}" for col in columns])

    def row_already_processed(self, row, unique_cols):
        """Return True if the output CSV has a row matching ``row``.

        Values are compared as strings because the CSV round trip does not
        preserve the original dtypes.

        Args:
            row: Candidate row.
            unique_cols: Columns that must all match.
        """
        if not os.path.exists(self.output_csv):
            return False
        output_df = pd.read_csv(self.output_csv)
        return any(
            all(str(row[col]) == str(output_row[col]) for col in unique_cols)
            for _, output_row in output_df.iterrows()
        )

    def process_database_rows(self, dataframe, system_prompt,columns=None, unique_cols=None):
        """Describe every row of ``dataframe`` with the model.

        Each row is rendered with ``aggregate_row_data`` and sent as the user
        message; the reply's message text is appended to the output CSV next
        to the original row values.

        Args:
            dataframe: Rows to process.
            system_prompt: System message used for every request.
            columns: Columns included in the user message (default: all).
            unique_cols: Columns identifying an already-processed row
                (default: all columns of ``dataframe``).
        """
        self._ensure_output_csv(dataframe.columns)
        output_columns = dataframe.columns.tolist() + ['output']

        if unique_cols is None:
            unique_cols = dataframe.columns.tolist()

        total_rows = len(dataframe)
        print(f'Processing {total_rows} database rows...')

        for idx, row in tqdm(dataframe.iterrows(), total=total_rows):
            if self.row_already_processed(row, unique_cols):
                print(f'Skipping row {idx}, already processed.')
                continue

            # Aggregate row data into a single string document
            user_prompt = self.aggregate_row_data(row, columns)

            response = self.request_response(user_prompt, system_prompt)
            if response:
                # `or [{}]` also covers an empty 'choices' list, which would
                # otherwise raise IndexError on [0].
                choices = response.get('choices') or [{}]
                output_content = choices[0].get('message', {}).get('content', '')
                row_output = pd.DataFrame([row.tolist() + [output_content]], columns=output_columns)
                row_output.to_csv(self.output_csv, mode='a', header=False, index=False)
            else:
                print(f'Failed to process row {idx}.')

        print('Processing completed.')

In [40]:
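# Usage
# token_manager = TokenManager()
# openai_interaction = OpenAIInteraction(token_manager, output_csv)  # output_csv: path of the results CSV
# openai_interaction.process_code_snippets(df1, 'code', system_prompt)  # 'code' is the renamed 'definition' column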

## Let's grab the prepared columns and create a RAG retrieval system

In [107]:
# Load the prepared-column metadata, then set up a fresh token budget and an
# API wrapper that appends its results to the v200 output CSV.
prepared_columns = pd.read_csv('./data/reports_prepared_columns.csv')
new_token_manager = TokenManager()
new_openai_interaction = OpenAIInteraction(new_token_manager, './data/prepared_columns_processed_v200.csv')
# System prompt asking for a short description per row. The text is sent as-is,
# so its wording (including spelling) is left unchanged here.
prepared_columns_system_prompt = '''
Please provide a very short description (15 words or less) for the following rows based on the provided data.
Dont mention the environment, visible to, and accesible to. 
- If the "display_groups"={metrics} (and NOTHING ElSE), you should mention Avantmetrics in the Description returned
- If "display_groups"={performance} (and NOTHING ELSE), you should mention Performance in the Description returned.
- Otherwise you dont need to mention the display group in the description returned.
'''
# Only these columns are rendered into the user message for each row.
columns_to_aggregate = ['name', 'definition', 'description', 'display_groups', 'table_id', 'classifier']
# Rows already present in the output CSV are skipped, so re-running this cell
# continues where a previous run stopped.
new_openai_interaction.process_database_rows(prepared_columns, prepared_columns_system_prompt, columns=columns_to_aggregate)

Processing 165 database rows...


  1%|          | 1/165 [00:02<06:07,  2.24s/it]

Response received, 199 tokens used, 199 tokens used in total this run.


  1%|          | 2/165 [00:04<05:51,  2.15s/it]

Response received, 179 tokens used, 378 tokens used in total this run.


  2%|▏         | 3/165 [00:07<06:33,  2.43s/it]

Response received, 189 tokens used, 567 tokens used in total this run.


  2%|▏         | 4/165 [00:08<05:44,  2.14s/it]

Response received, 192 tokens used, 759 tokens used in total this run.


  3%|▎         | 5/165 [00:10<05:05,  1.91s/it]

Response received, 196 tokens used, 955 tokens used in total this run.


  4%|▎         | 6/165 [00:11<04:51,  1.83s/it]

Response received, 189 tokens used, 1144 tokens used in total this run.


  4%|▍         | 7/165 [00:13<04:54,  1.87s/it]

Response received, 204 tokens used, 1348 tokens used in total this run.


  5%|▍         | 8/165 [00:15<05:02,  1.93s/it]

Response received, 176 tokens used, 1524 tokens used in total this run.


  5%|▌         | 9/165 [00:17<04:46,  1.83s/it]

Response received, 199 tokens used, 1723 tokens used in total this run.


  6%|▌         | 10/165 [00:20<05:52,  2.27s/it]

Response received, 312 tokens used, 2035 tokens used in total this run.


  7%|▋         | 11/165 [00:22<05:42,  2.22s/it]

Response received, 190 tokens used, 2225 tokens used in total this run.


  7%|▋         | 12/165 [00:23<04:41,  1.84s/it]

Response received, 183 tokens used, 2408 tokens used in total this run.


  8%|▊         | 13/165 [00:26<05:06,  2.02s/it]

Response received, 180 tokens used, 2588 tokens used in total this run.


  8%|▊         | 14/165 [00:27<04:19,  1.72s/it]

Response received, 185 tokens used, 2773 tokens used in total this run.


  9%|▉         | 15/165 [00:28<03:48,  1.52s/it]

Response received, 176 tokens used, 2949 tokens used in total this run.


 10%|▉         | 16/165 [00:30<03:56,  1.59s/it]

Response received, 184 tokens used, 3133 tokens used in total this run.


 10%|█         | 17/165 [00:33<05:07,  2.08s/it]

Response received, 186 tokens used, 3319 tokens used in total this run.


 11%|█         | 18/165 [00:35<05:07,  2.09s/it]

Response received, 193 tokens used, 3512 tokens used in total this run.


 12%|█▏        | 19/165 [00:37<04:59,  2.05s/it]

Response received, 184 tokens used, 3696 tokens used in total this run.


 12%|█▏        | 20/165 [00:39<05:12,  2.15s/it]

Response received, 187 tokens used, 3883 tokens used in total this run.


 13%|█▎        | 21/165 [00:41<05:03,  2.11s/it]

Response received, 202 tokens used, 4085 tokens used in total this run.


 13%|█▎        | 22/165 [00:44<05:18,  2.23s/it]

Response received, 177 tokens used, 4262 tokens used in total this run.


 14%|█▍        | 23/165 [00:46<05:08,  2.17s/it]

Response received, 192 tokens used, 4454 tokens used in total this run.


 15%|█▍        | 24/165 [00:49<05:32,  2.35s/it]

Response received, 190 tokens used, 4644 tokens used in total this run.


 15%|█▌        | 25/165 [00:50<04:53,  2.10s/it]

Response received, 192 tokens used, 4836 tokens used in total this run.


 16%|█▌        | 26/165 [00:52<04:41,  2.02s/it]

Response received, 234 tokens used, 5070 tokens used in total this run.


 16%|█▋        | 27/165 [00:55<05:31,  2.40s/it]

Response received, 193 tokens used, 5263 tokens used in total this run.


 17%|█▋        | 28/165 [00:57<05:12,  2.28s/it]

Response received, 204 tokens used, 5467 tokens used in total this run.


 18%|█▊        | 29/165 [00:59<04:41,  2.07s/it]

Response received, 183 tokens used, 5650 tokens used in total this run.


 18%|█▊        | 30/165 [01:02<05:03,  2.25s/it]

Response received, 208 tokens used, 5858 tokens used in total this run.


 19%|█▉        | 31/165 [01:04<05:04,  2.27s/it]

Response received, 196 tokens used, 6054 tokens used in total this run.


 19%|█▉        | 32/165 [01:05<04:22,  1.97s/it]

Response received, 266 tokens used, 6320 tokens used in total this run.


 20%|██        | 33/165 [01:07<04:31,  2.06s/it]

Response received, 186 tokens used, 6506 tokens used in total this run.


 21%|██        | 34/165 [01:10<04:30,  2.06s/it]

Response received, 208 tokens used, 6714 tokens used in total this run.


 21%|██        | 35/165 [01:10<03:42,  1.71s/it]

Response received, 204 tokens used, 6918 tokens used in total this run.


 22%|██▏       | 36/165 [01:12<03:25,  1.59s/it]

Response received, 192 tokens used, 7110 tokens used in total this run.


 22%|██▏       | 37/165 [01:14<04:04,  1.91s/it]

Response received, 193 tokens used, 7303 tokens used in total this run.


 23%|██▎       | 38/165 [01:16<03:43,  1.76s/it]

Response received, 200 tokens used, 7503 tokens used in total this run.


 24%|██▎       | 39/165 [01:18<03:40,  1.75s/it]

Response received, 203 tokens used, 7706 tokens used in total this run.


 24%|██▍       | 40/165 [01:19<03:30,  1.68s/it]

Response received, 195 tokens used, 7901 tokens used in total this run.


 25%|██▍       | 41/165 [01:21<03:24,  1.65s/it]

Response received, 194 tokens used, 8095 tokens used in total this run.


 25%|██▌       | 42/165 [01:23<04:04,  1.99s/it]

Response received, 208 tokens used, 8303 tokens used in total this run.


 26%|██▌       | 43/165 [01:25<03:31,  1.73s/it]

Response received, 198 tokens used, 8501 tokens used in total this run.


 27%|██▋       | 44/165 [01:26<03:13,  1.60s/it]

Response received, 190 tokens used, 8691 tokens used in total this run.


 27%|██▋       | 45/165 [01:27<03:07,  1.56s/it]

Response received, 175 tokens used, 8866 tokens used in total this run.


 28%|██▊       | 46/165 [01:28<02:49,  1.43s/it]

Response received, 188 tokens used, 9054 tokens used in total this run.


 28%|██▊       | 47/165 [01:30<03:05,  1.57s/it]

Response received, 190 tokens used, 9244 tokens used in total this run.


 29%|██▉       | 48/165 [01:32<03:18,  1.70s/it]

Response received, 187 tokens used, 9431 tokens used in total this run.


 30%|██▉       | 49/165 [01:34<03:02,  1.57s/it]

Response received, 175 tokens used, 9606 tokens used in total this run.


 30%|███       | 50/165 [01:35<02:44,  1.43s/it]

Response received, 177 tokens used, 9783 tokens used in total this run.


 31%|███       | 51/165 [01:36<02:50,  1.50s/it]

Response received, 200 tokens used, 9983 tokens used in total this run.


 32%|███▏      | 52/165 [01:38<02:42,  1.44s/it]

Response received, 195 tokens used, 10178 tokens used in total this run.


 32%|███▏      | 53/165 [01:40<03:05,  1.66s/it]

Response received, 276 tokens used, 10454 tokens used in total this run.


 33%|███▎      | 54/165 [01:42<03:31,  1.91s/it]

Response received, 180 tokens used, 10634 tokens used in total this run.


 33%|███▎      | 55/165 [01:45<03:56,  2.15s/it]

Response received, 186 tokens used, 10820 tokens used in total this run.


 34%|███▍      | 56/165 [01:47<03:38,  2.01s/it]

Response received, 184 tokens used, 11004 tokens used in total this run.


 35%|███▍      | 57/165 [01:49<03:37,  2.01s/it]

Response received, 180 tokens used, 11184 tokens used in total this run.


 35%|███▌      | 58/165 [01:50<03:22,  1.89s/it]

Response received, 184 tokens used, 11368 tokens used in total this run.


 36%|███▌      | 59/165 [01:52<03:14,  1.83s/it]

Response received, 185 tokens used, 11553 tokens used in total this run.


 36%|███▋      | 60/165 [01:54<03:18,  1.89s/it]

Response received, 219 tokens used, 11772 tokens used in total this run.


 37%|███▋      | 61/165 [01:56<03:24,  1.97s/it]

Response received, 182 tokens used, 11954 tokens used in total this run.


 38%|███▊      | 62/165 [01:59<03:44,  2.18s/it]

Response received, 185 tokens used, 12139 tokens used in total this run.


 38%|███▊      | 63/165 [02:00<03:25,  2.01s/it]

Response received, 187 tokens used, 12326 tokens used in total this run.


 39%|███▉      | 64/165 [02:02<02:57,  1.76s/it]

Response received, 175 tokens used, 12501 tokens used in total this run.


 39%|███▉      | 65/165 [02:03<02:38,  1.58s/it]

Response received, 190 tokens used, 12691 tokens used in total this run.


 40%|████      | 66/165 [02:04<02:36,  1.58s/it]

Response received, 183 tokens used, 12874 tokens used in total this run.


 41%|████      | 67/165 [02:06<02:22,  1.45s/it]

Response received, 205 tokens used, 13079 tokens used in total this run.


 41%|████      | 68/165 [02:07<02:32,  1.57s/it]

Response received, 179 tokens used, 13258 tokens used in total this run.


 42%|████▏     | 69/165 [02:09<02:34,  1.61s/it]

Response received, 192 tokens used, 13450 tokens used in total this run.


 42%|████▏     | 70/165 [02:10<02:13,  1.41s/it]

Response received, 185 tokens used, 13635 tokens used in total this run.


 43%|████▎     | 71/165 [02:11<02:08,  1.37s/it]

Response received, 184 tokens used, 13819 tokens used in total this run.


 44%|████▎     | 72/165 [02:13<02:11,  1.41s/it]

Response received, 188 tokens used, 14007 tokens used in total this run.


 44%|████▍     | 73/165 [02:14<02:14,  1.46s/it]

Response received, 181 tokens used, 14188 tokens used in total this run.


 45%|████▍     | 74/165 [02:16<02:18,  1.53s/it]

Response received, 187 tokens used, 14375 tokens used in total this run.


 45%|████▌     | 75/165 [02:18<02:14,  1.50s/it]

Response received, 179 tokens used, 14554 tokens used in total this run.


 46%|████▌     | 76/165 [02:19<02:14,  1.51s/it]

Response received, 190 tokens used, 14744 tokens used in total this run.


 47%|████▋     | 77/165 [02:20<02:04,  1.42s/it]

Response received, 199 tokens used, 14943 tokens used in total this run.


 47%|████▋     | 78/165 [02:22<02:23,  1.65s/it]

Response received, 201 tokens used, 15144 tokens used in total this run.


 48%|████▊     | 79/165 [02:25<02:56,  2.05s/it]

Response received, 202 tokens used, 15346 tokens used in total this run.


 48%|████▊     | 80/165 [02:28<03:02,  2.15s/it]

Response received, 186 tokens used, 15532 tokens used in total this run.


 49%|████▉     | 81/165 [02:30<03:05,  2.20s/it]

Response received, 195 tokens used, 15727 tokens used in total this run.


 50%|████▉     | 82/165 [02:32<02:56,  2.13s/it]

Response received, 206 tokens used, 15933 tokens used in total this run.


 50%|█████     | 83/165 [02:35<03:08,  2.30s/it]

Response received, 268 tokens used, 16201 tokens used in total this run.


 51%|█████     | 84/165 [02:36<02:43,  2.01s/it]

Response received, 199 tokens used, 16400 tokens used in total this run.


 52%|█████▏    | 85/165 [02:38<02:35,  1.94s/it]

Response received, 198 tokens used, 16598 tokens used in total this run.


 52%|█████▏    | 86/165 [02:39<02:18,  1.75s/it]

Response received, 181 tokens used, 16779 tokens used in total this run.


 53%|█████▎    | 87/165 [02:43<03:02,  2.34s/it]

Response received, 330 tokens used, 17109 tokens used in total this run.


 53%|█████▎    | 88/165 [02:44<02:41,  2.10s/it]

Response received, 634 tokens used, 17743 tokens used in total this run.


 54%|█████▍    | 89/165 [02:47<02:54,  2.30s/it]

Response received, 236 tokens used, 17979 tokens used in total this run.


 55%|█████▍    | 90/165 [02:50<02:53,  2.31s/it]

Response received, 193 tokens used, 18172 tokens used in total this run.


 55%|█████▌    | 91/165 [02:51<02:40,  2.16s/it]

Response received, 198 tokens used, 18370 tokens used in total this run.


 56%|█████▌    | 92/165 [02:53<02:34,  2.11s/it]

Response received, 236 tokens used, 18606 tokens used in total this run.


 56%|█████▋    | 93/165 [02:56<02:39,  2.22s/it]

Response received, 229 tokens used, 18835 tokens used in total this run.


 57%|█████▋    | 94/165 [02:58<02:25,  2.05s/it]

Response received, 292 tokens used, 19127 tokens used in total this run.


 58%|█████▊    | 95/165 [02:59<02:09,  1.85s/it]

Response received, 297 tokens used, 19424 tokens used in total this run.


 58%|█████▊    | 96/165 [03:01<02:23,  2.08s/it]

Response received, 222 tokens used, 19646 tokens used in total this run.


 59%|█████▉    | 97/165 [03:05<02:42,  2.39s/it]

Response received, 207 tokens used, 19853 tokens used in total this run.


 59%|█████▉    | 98/165 [03:07<02:47,  2.51s/it]

Response received, 242 tokens used, 20095 tokens used in total this run.


 60%|██████    | 99/165 [03:10<02:37,  2.39s/it]

Response received, 224 tokens used, 20319 tokens used in total this run.


 61%|██████    | 100/165 [03:11<02:20,  2.17s/it]

Response received, 215 tokens used, 20534 tokens used in total this run.


 61%|██████    | 101/165 [03:12<01:58,  1.85s/it]

Response received, 235 tokens used, 20769 tokens used in total this run.


 62%|██████▏   | 102/165 [03:14<01:45,  1.68s/it]

Response received, 233 tokens used, 21002 tokens used in total this run.


 62%|██████▏   | 103/165 [03:15<01:48,  1.75s/it]

Response received, 237 tokens used, 21239 tokens used in total this run.


 63%|██████▎   | 104/165 [03:18<02:08,  2.11s/it]

Response received, 249 tokens used, 21488 tokens used in total this run.


 64%|██████▎   | 105/165 [03:20<01:52,  1.87s/it]

Response received, 239 tokens used, 21727 tokens used in total this run.


 64%|██████▍   | 106/165 [03:22<01:56,  1.97s/it]

Response received, 213 tokens used, 21940 tokens used in total this run.


 65%|██████▍   | 107/165 [03:25<02:12,  2.28s/it]

Response received, 235 tokens used, 22175 tokens used in total this run.


 65%|██████▌   | 108/165 [03:27<02:07,  2.24s/it]

Response received, 260 tokens used, 22435 tokens used in total this run.


 66%|██████▌   | 109/165 [03:29<01:59,  2.14s/it]

Response received, 253 tokens used, 22688 tokens used in total this run.


 67%|██████▋   | 110/165 [03:33<02:22,  2.59s/it]

Response received, 400 tokens used, 23088 tokens used in total this run.


 67%|██████▋   | 111/165 [03:35<02:18,  2.56s/it]

Response received, 430 tokens used, 23518 tokens used in total this run.


 68%|██████▊   | 112/165 [03:37<02:00,  2.28s/it]

Response received, 291 tokens used, 23809 tokens used in total this run.


 68%|██████▊   | 113/165 [03:39<02:04,  2.39s/it]

Response received, 193 tokens used, 24002 tokens used in total this run.


 69%|██████▉   | 114/165 [03:43<02:20,  2.76s/it]

Response received, 284 tokens used, 24286 tokens used in total this run.


 70%|██████▉   | 115/165 [03:46<02:28,  2.97s/it]

Response received, 299 tokens used, 24585 tokens used in total this run.


 70%|███████   | 116/165 [03:49<02:13,  2.72s/it]

Response received, 637 tokens used, 25222 tokens used in total this run.


 71%|███████   | 117/165 [03:50<01:51,  2.32s/it]

Response received, 290 tokens used, 25512 tokens used in total this run.


 72%|███████▏  | 118/165 [03:51<01:36,  2.06s/it]

Response received, 283 tokens used, 25795 tokens used in total this run.


 72%|███████▏  | 119/165 [03:53<01:27,  1.89s/it]

Response received, 423 tokens used, 26218 tokens used in total this run.


 73%|███████▎  | 120/165 [03:54<01:20,  1.78s/it]

Response received, 203 tokens used, 26421 tokens used in total this run.


 73%|███████▎  | 121/165 [03:56<01:12,  1.65s/it]

Response received, 196 tokens used, 26617 tokens used in total this run.


 74%|███████▍  | 122/165 [03:57<01:07,  1.56s/it]

Response received, 310 tokens used, 26927 tokens used in total this run.


 75%|███████▍  | 123/165 [04:00<01:16,  1.81s/it]

Response received, 179 tokens used, 27106 tokens used in total this run.


 75%|███████▌  | 124/165 [04:01<01:07,  1.64s/it]

Response received, 196 tokens used, 27302 tokens used in total this run.
Waiting for tokens to be available...
Waiting for tokens to be available...
Waiting for tokens to be available...
Waiting for tokens to be available...


 76%|███████▌  | 125/165 [04:08<02:06,  3.17s/it]

Response received, 272 tokens used, 27574 tokens used in total this run.


 76%|███████▋  | 126/165 [04:10<01:58,  3.03s/it]

Response received, 273 tokens used, 27847 tokens used in total this run.


 77%|███████▋  | 127/165 [04:12<01:42,  2.70s/it]

Response received, 267 tokens used, 28114 tokens used in total this run.


 78%|███████▊  | 128/165 [04:14<01:34,  2.55s/it]

Response received, 541 tokens used, 28655 tokens used in total this run.


 78%|███████▊  | 129/165 [04:17<01:31,  2.54s/it]

Response received, 286 tokens used, 28941 tokens used in total this run.


 79%|███████▉  | 130/165 [04:21<01:40,  2.88s/it]

Response received, 180 tokens used, 29121 tokens used in total this run.


 79%|███████▉  | 131/165 [04:22<01:27,  2.56s/it]

Response received, 319 tokens used, 29440 tokens used in total this run.


 80%|████████  | 132/165 [04:24<01:16,  2.31s/it]

Response received, 231 tokens used, 29671 tokens used in total this run.


 81%|████████  | 133/165 [04:26<01:08,  2.15s/it]

Response received, 402 tokens used, 30073 tokens used in total this run.


 81%|████████  | 134/165 [04:28<01:05,  2.13s/it]

Response received, 325 tokens used, 30398 tokens used in total this run.


 82%|████████▏ | 135/165 [04:30<01:03,  2.12s/it]

Response received, 572 tokens used, 30970 tokens used in total this run.


 82%|████████▏ | 136/165 [04:34<01:20,  2.77s/it]

Response received, 263 tokens used, 31233 tokens used in total this run.


 83%|████████▎ | 137/165 [04:37<01:14,  2.66s/it]

Response received, 247 tokens used, 31480 tokens used in total this run.


 84%|████████▎ | 138/165 [04:39<01:08,  2.52s/it]

Response received, 316 tokens used, 31796 tokens used in total this run.


 84%|████████▍ | 139/165 [04:40<00:57,  2.22s/it]

Response received, 240 tokens used, 32036 tokens used in total this run.


 85%|████████▍ | 140/165 [04:44<01:08,  2.73s/it]

Response received, 660 tokens used, 32696 tokens used in total this run.


 85%|████████▌ | 141/165 [04:46<00:57,  2.40s/it]

Response received, 280 tokens used, 32976 tokens used in total this run.


 86%|████████▌ | 142/165 [04:48<00:50,  2.19s/it]

Response received, 193 tokens used, 33169 tokens used in total this run.


 87%|████████▋ | 143/165 [04:49<00:42,  1.95s/it]

Response received, 195 tokens used, 33364 tokens used in total this run.


 87%|████████▋ | 144/165 [04:51<00:41,  1.96s/it]

Response received, 202 tokens used, 33566 tokens used in total this run.


 88%|████████▊ | 145/165 [04:53<00:38,  1.94s/it]

Response received, 204 tokens used, 33770 tokens used in total this run.


 88%|████████▊ | 146/165 [04:56<00:42,  2.23s/it]

Response received, 675 tokens used, 34445 tokens used in total this run.


 89%|████████▉ | 147/165 [04:58<00:38,  2.15s/it]

Response received, 515 tokens used, 34960 tokens used in total this run.
Waiting for tokens to be available...
Waiting for tokens to be available...
Waiting for tokens to be available...
Waiting for tokens to be available...
Waiting for tokens to be available...
Waiting for tokens to be available...
Waiting for tokens to be available...


 90%|████████▉ | 148/165 [05:07<01:13,  4.33s/it]

Response received, 518 tokens used, 35478 tokens used in total this run.


 90%|█████████ | 149/165 [05:09<00:57,  3.56s/it]

Response received, 540 tokens used, 36018 tokens used in total this run.


 91%|█████████ | 150/165 [05:11<00:45,  3.04s/it]

Response received, 929 tokens used, 36947 tokens used in total this run.


 92%|█████████▏| 151/165 [05:12<00:35,  2.54s/it]

Response received, 201 tokens used, 37148 tokens used in total this run.


 92%|█████████▏| 152/165 [05:14<00:30,  2.33s/it]

Response received, 202 tokens used, 37350 tokens used in total this run.


 93%|█████████▎| 153/165 [05:16<00:25,  2.09s/it]

Response received, 203 tokens used, 37553 tokens used in total this run.


 93%|█████████▎| 154/165 [05:17<00:19,  1.77s/it]

Response received, 208 tokens used, 37761 tokens used in total this run.


 94%|█████████▍| 155/165 [05:18<00:15,  1.56s/it]

Response received, 206 tokens used, 37967 tokens used in total this run.


 95%|█████████▍| 156/165 [05:19<00:14,  1.57s/it]

Response received, 201 tokens used, 38168 tokens used in total this run.


 95%|█████████▌| 157/165 [05:21<00:13,  1.74s/it]

Response received, 198 tokens used, 38366 tokens used in total this run.


 96%|█████████▌| 158/165 [05:24<00:12,  1.84s/it]

Response received, 184 tokens used, 38550 tokens used in total this run.


 96%|█████████▋| 159/165 [05:24<00:09,  1.58s/it]

Response received, 204 tokens used, 38754 tokens used in total this run.


 97%|█████████▋| 160/165 [05:26<00:07,  1.51s/it]

Response received, 335 tokens used, 39089 tokens used in total this run.


 98%|█████████▊| 161/165 [05:28<00:07,  1.80s/it]

Response received, 303 tokens used, 39392 tokens used in total this run.


 98%|█████████▊| 162/165 [05:30<00:05,  1.83s/it]

Response received, 305 tokens used, 39697 tokens used in total this run.


 99%|█████████▉| 163/165 [05:32<00:03,  1.74s/it]

Response received, 333 tokens used, 40030 tokens used in total this run.


 99%|█████████▉| 164/165 [05:35<00:02,  2.12s/it]

Response received, 192 tokens used, 40222 tokens used in total this run.


100%|██████████| 165/165 [05:37<00:00,  2.05s/it]

Response received, 184 tokens used, 40406 tokens used in total this run.
Processing completed.





In [113]:
# Reload the results CSV (same path the description run above writes to).
described_prepared_columns = pd.read_csv('./data/prepared_columns_processed_v200.csv')

def safe_json_loads(s):
    """Parse ``s`` as JSON, returning ``None`` instead of raising on bad input.

    Args:
        s: The value to parse. Anything ``json.loads`` accepts (str, bytes,
            bytearray) is parsed; any other value is treated as unparseable.

    Returns:
        The decoded Python object, or ``None`` when ``s`` is not valid JSON
        or is not a type ``json.loads`` accepts (e.g. ``None`` or a float NaN).
    """
    try:
        return json.loads(s)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass (malformed JSON);
        # TypeError is what json.loads raises for non-string input such as
        # None or NaN.
        return None
    
def extract_content(output):
    """Return the first choice's message content from a JSON response string.

    ``output`` is expected to be a JSON document shaped like
    ``{"choices": [{"message": {"content": ...}}]}``. When it is, the value
    stored under ``content`` is returned unchanged (including ``None`` if the
    document holds a JSON ``null`` there).

    Args:
        output: The raw value to inspect, normally a JSON string.

    Returns:
        The extracted content, or ``output`` itself whenever extraction is not
        possible: the value is not a string, is not valid JSON, or lacks a
        non-empty ``choices`` list with the expected nesting.
    """
    try:
        choices = json.loads(output)['choices']
        return choices[0]['message']['content']
    except (ValueError, TypeError, KeyError, IndexError):
        # ValueError  -> not valid JSON (JSONDecodeError is a subclass)
        # TypeError   -> non-string input, or a level that is not subscriptable
        # KeyError    -> 'choices' / 'message' / 'content' key is missing
        # IndexError  -> 'choices' is present but empty
        # In every case fall back to the untouched input rather than None.
        return output
    
# Drop rows whose 'output' is an empty / whitespace-only string. Take an
# explicit copy so the column assignments below act on an independent frame
# instead of a filtered slice (avoids pandas' SettingWithCopyWarning).
described_prepared_columns = described_prepared_columns[
    described_prepared_columns['output'].str.strip() != ""
].copy()

# NOTE: a previous `described_prepared_columns['output'].apply(safe_json_loads)`
# statement was removed here; its result was discarded, so it had no effect.

# Pull the message content out of each 'output' value.
extracted_content = described_prepared_columns['output'].apply(extract_content)

# Where extraction produced a null or an empty string, fall back to the raw
# 'output' value for that row.
extracted_content = extracted_content.mask(
    extracted_content.isna() | (extracted_content == ""),
    described_prepared_columns['output'],
)

# Strip one pair of surrounding double quotes from string values.
extracted_content = extracted_content.apply(
    lambda x: x[1:-1] if isinstance(x, str) and x.startswith('"') and x.endswith('"') else x
)

# Store the cleaned text as 'definition' and discard the raw 'output' column.
described_prepared_columns['definition'] = extracted_content
described_prepared_columns = described_prepared_columns.drop(columns=['output'])
described_prepared_columns.columns.values

# described_prepared_columns.to_csv('./data/prepared_columns_described_v200.csv', index=False)

In [36]:
# import pandas as pd
# import matplotlib.pyplot as plt
# from matplotlib.backends.backend_pdf import PdfPages
# import textwrap

# class StretchyPDF:
#     """
#     A class for generating a PDF document from a Pandas DataFrame. Each row of the DataFrame is converted to text and added as a page in the PDF document.
    
#     Attributes:
#         pdf_pages (PdfPages): A PdfPages object from matplotlib.backends.backend_pdf, used to save figures to a PDF file.
    
#     Methods:
#         estimate_fig_dimensions(text, char_width=0.1, line_height=0.2):
#             Estimate the dimensions of the figure based on the text content.
            
#         add_text_to_pdf(doc_text):
#             Create a matplotlib figure, add text to it, and save the figure to the PDF file.
            
#         create_document(row):
#             Convert a row from the DataFrame to a formatted text document.
            
#         create_pdf(data):
#             Iterate through rows of a Pandas DataFrame, convert each row to text, and add it to the PDF document.
            
#         close_pdf():
#             Close the PDF file after all pages have been added.

#     Example:
#         >>> df = pd.DataFrame({'Name': ['Alice', 'Bob'], 'Age': [30, 25], 'Occupation': ['Engineer', 'Doctor']})
#         >>> stretchy_pdf = StretchyPDF('output.pdf')
#         >>> stretchy_pdf.create_pdf(df)
#         >>> stretchy_pdf.close_pdf()
        
#     The above example will create a PDF file named 'output.pdf' with two pages, one for Alice and one for Bob, including their respective details.
#     """
#     def __init__(self, output_pdf):
#         self.pdf_pages = PdfPages(output_pdf)
        
#     @staticmethod
#     def estimate_fig_dimensions(text, char_width=0.1, line_height=0.2):
#         lines = text.split('\n')
#         max_line_length = max(len(line) for line in lines)
#         num_lines = len(lines)
#         fig_width = max_line_length * char_width
#         fig_height = num_lines * line_height
#         return fig_width, fig_height
    
#     def add_text_to_pdf(self, doc_text):
#         fig_width, fig_height = self.estimate_fig_dimensions(doc_text)
#         fig, ax = plt.subplots(figsize=(max(8.5, fig_width), max(11, fig_height)))
#         half_inch_in_points = 0.5 * fig.dpi
#         x_position = half_inch_in_points / fig.bbox.width
#         ax.text(x_position, 1.0, doc_text, fontsize=10, ha='left', va='top', transform=ax.transAxes, family='monospace')
#         ax.axis('off')
#         self.pdf_pages.savefig(fig)
#         plt.close(fig)
    
#     @staticmethod
#     def create_document(row):
#         document = []
#         for col_name, col_value in row.items():
#             if col_name == 'code':
#                 document.append(f"\n# {col_name}:\n{col_value}\n###")
#             else:
#                 commented_value = '\n'.join(f'{line}' for line in str(col_value).split('\n'))
#                 document.append(f"\n{col_name}:\n{commented_value}\n")
#         return '\n'.join(document)
    
#     def create_pdf(self, data):
#         for index, row in data.iterrows():
#             doc_text = self.create_document(row)
#             self.add_text_to_pdf(doc_text)
    
#     def close_pdf(self):
#         self.pdf_pages.close()


In [111]:
# stretchy_pdf = StretchyPDF('./data/prepared_columns.pdf')
# stretchy_pdf.create_pdf(described_prepared_columns)
# stretchy_pdf.close_pdf()