In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so that imported
# modules are reloaded before the code in each cell is executed (edits to the
# project's .py files are picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [None]:
from loguru import logger
from typing import List
from parsers.nb_parser import NotebookParser
from analyze_nb_logs import load_object, get_all_file_with_extension_in_dir_recursively, logger, LogParser


# Imported once, ahead of the scan, instead of on every iteration of the log loop.
from nb_progress import get_notebook_progress_using_log, InvalidLogError, NotebookStateLogMismatchError

# Input locations: the notebooks and the raw log files that are matched against them.
notebooks_dir = 'data/tac_notebooks'
logs_dir = 'data/tac_raw_logs'

# The scan stops at the first notebook whose progress has at least this many steps.
MIN_PROGRESS_STEPS = 4

# Every *.log file below logs_dir, in sorted order, skipping paths that contain "baseline".
all_log_filepathes = sorted(
    log_filepath
    for log_filepath in get_all_file_with_extension_in_dir_recursively(logs_dir, ".log")
    if "baseline" not in log_filepath
)
logger.success(f'There are {len(all_log_filepathes)} log files in {logs_dir} directory')

# Bound up front so that later cells see an empty list (rather than raising NameError)
# when no notebook could be processed at all.
nb_states: List[NotebookParser] = []
num_skipped = 0  # notebooks dropped because computing their progress raised a known error
halt = False  # set once a notebook satisfying MIN_PROGRESS_STEPS has been found
for selected_log_filepath in all_log_filepathes:
    log_parser = LogParser(selected_log_filepath).parse()
    nb_sublog_dict = log_parser.attach_notebooks(notebooks_dir, verbose=False)

    for nb_filepath, (nb_log_parser, nb_parser) in nb_sublog_dict.items():
        try:
            nb_progress = get_notebook_progress_using_log(nb_parser, nb_log_parser)
        except (InvalidLogError, NotebookStateLogMismatchError):
            # Best-effort scan: such notebooks are skipped on purpose, but they are
            # counted so the skip is reported once after the scan instead of vanishing.
            num_skipped += 1
            continue

        # Flatten the progress steps into one ordered list of notebook states:
        # an empty step contributes its own state, a non-empty step contributes
        # every state it yields when iterated.
        nb_states = []
        for step in nb_progress:
            step.reset()
            if len(step) == 0:
                nb_states.append(step.nb_parser_state)
            else:
                # TODO (from the original author): should prev_msgs be reset upon each completed step?
                nb_states.extend(step)

        num_progress_steps = len(nb_progress)

        # TODO DEBUGGING HERE
        if num_progress_steps >= MIN_PROGRESS_STEPS:
            logger.info(f'Notebook: {nb_parser.filepath}')
            logger.info(f'Log: {nb_log_parser.filepath}')
            logger.info(f'Number of progress steps: {num_progress_steps}')
            halt = True
            break

    if halt:
        break

if num_skipped:
    logger.info(
        f'Skipped {num_skipped} notebook(s) that raised InvalidLogError or NotebookStateLogMismatchError'
    )
if not halt:
    # Without this warning the cells below would silently operate on whichever
    # notebook happened to be processed last (or on an empty nb_states).
    logger.warning(
        f'No notebook with at least {MIN_PROGRESS_STEPS} progress steps was found; '
        'nb_states holds the states of the last processed notebook, if any'
    )


In [None]:
# Write each reconstructed notebook state to the '__nb_states' directory,
# using the state's position in nb_states as the filename postfix.
for state_idx in range(len(nb_states)):
    saved_path = nb_states[state_idx].to_notebook(
        directory='__nb_states',
        filepath_postfix=f'_{state_idx}',
    )
    logger.info(f'Notebook state saved to: {saved_path}')

In [None]:
# Display which notebook / log pair the scan above ended on.
(nb_parser.filepath, nb_log_parser.filepath)

In [None]:
# from prompts.generate_questions_per_changes import _get_nb_states_updates
# _get_nb_states_updates(nb_states[0], nb_states[1])

In [None]:
from prompts.code_explain_change import get_diff_nb_states

# Diff every (earlier, later) pair of notebook states, collecting the results in
# `diffs` and printing each one between two rules.
rule = '=' * 80
diffs = []
for earlier_idx, earlier_state in enumerate(nb_states):
    for later_idx in range(earlier_idx + 1, len(nb_states)):
        print(f"Comparing {earlier_idx} and {later_idx}")
        print(rule)
        pair_diff = get_diff_nb_states(earlier_state, nb_states[later_idx])
        diffs.append(pair_diff)
        print(pair_diff)
        print(rule)

In [None]:
# import json
# from tqdm import tqdm
# from prompts.code_explain_change import (
#     get_diff_nb_states,
#     code_explain_change
# )

# changes_exps_dict = {}
# for i in range(len(nb_states)):
#     for j in range(i+1, len(nb_states)):
#         changes_exps_dict[(i, j)] = None

# with tqdm(total=len(changes_exps_dict), desc='Computing changes between states') as pbar:
#     for t1, t2 in list(changes_exps_dict.keys()):
#         pbar.set_postfix_str(f'{t1} -> {t2}')

#         changes_exps_dict[(t1, t2)] = code_explain_change(
#             nb_states[t1],
#             nb_states[t2]
#         )
#         print('='*100)
#         print(f'Changes between states {t1} and {t2}')
#         print(json.dumps(changes_exps_dict[(t1, t2)], indent=4))
#         print('='*100)

#         pbar.update(1)


In [None]:
# for (t1, t2), changes_exp in changes_exps_dict.items():
#     print('='*100)
#     print(f'Changes between {t1} and {t2}')
#     print(json.dumps(changes_exp, indent=4))
#     print('='*100)

In [None]:
from prompts.generate_questions_per_changes import make_questions_prompt

# Reserve one slot per (earlier, later) pair of notebook states, then fill each
# slot with the questions produced for that pair and print them.
state_count = len(nb_states)
questions_dict = {
    (earlier, later): None
    for earlier in range(state_count)
    for later in range(earlier + 1, state_count)
}

for pair in questions_dict:
    t1, t2 = pair
    print('=' * 100)
    print(f'Question on changes between {t1} and {t2}')
    print('-' * 100)
    # TODO try to pass changes_exps_dict[(t1, t2)] as hints
    pair_questions = make_questions_prompt(nb_states[t1], nb_states[t2])
    questions_dict[pair] = pair_questions
    for number, question in enumerate(pair_questions, start=1):
        print(f'Q{number}: {question}')
    print('=' * 100)


In [None]:
from pprint import pprint
from prompts.answer_questions_per_change import answer_questions

# For every state pair, answer its questions and print them as Q/A pairs.
# answer_questions returns the answers plus two further objects (ret1, ret2),
# which are stored per pair in rets_dict.
heavy_rule = '=' * 100
light_rule = '-' * 100

rets_dict = {}
answers_dict = {}
for pair, questions in questions_dict.items():
    t1, t2 = pair
    print(heavy_rule)
    print(f'Answers between {t1} and {t2}')

    # TODO changes_exps_dict[(t1, t2)] is not passed yet
    pair_answers, ret1, ret2 = answer_questions(nb_states[t1], nb_states[t2], questions)
    answers_dict[pair] = pair_answers
    rets_dict[pair] = (ret1, ret2)

    for number, (question, answer) in enumerate(zip(questions, pair_answers), start=1):
        print(light_rule)
        print(f'Q{number}:', question)
        print(f'A{number}:', answer)

    print(heavy_rule)

In [None]:
import json
from prompts.generate_questions_per_changes import _get_nb_states_updates


def _print_retrieved_context(label, retriever, question):
    """Print the documents `retriever.invoke(question)` returns, under a '>> label' heading.

    Each document's `page_content` is evaluated into a mapping; its 'cell_type',
    'id' and newline-joined 'source' entries are printed, followed by a dashed rule.
    """
    print(f'>> {label}')
    for doc_loader in retriever.invoke(question):
        # NOTE(review): eval() executes whatever expression page_content contains.
        # If page_content is always the repr of a plain dict, ast.literal_eval would
        # be a safer drop-in; confirm the content format before switching.
        code_snippet = eval(doc_loader.page_content)
        print('cell_type', code_snippet['cell_type'])
        print('id', code_snippet['id'])
        print('source:')
        print('\n'.join(code_snippet['source']))
        print('-'*100)


# For each state pair: print the update between the two states, then every
# question/answer together with the context retrieved for it from ret1 and ret2.
for (t1, t2), (ret1, ret2) in rets_dict.items():
    questions = questions_dict[(t1, t2)]
    answers = answers_dict[(t1, t2)]
    print('='*100)
    print(f'Between {t1} and {t2}')
    print('Update:')
    print(json.dumps(_get_nb_states_updates(nb_states[t1], nb_states[t2]), indent=4))
    for i, (question, answer) in enumerate(zip(questions, answers)):
        print('>'*100)
        print(f'Q{i+1}:', question)
        print(f'A{i+1}:', answer)
        print('-'*100)
        _print_retrieved_context('context_t1', ret1, question)

        print('-'*100)
        _print_retrieved_context('context_t2', ret2, question)

        print('<'*100)



In [None]:
# def write_qa_table_to_file(questions_dict, answers_dict, nb_states):
#     from typing import List, Tuple
#     from parsers.nb_parser import CellEntry

#     from tabulate import tabulate
#     from utils import prettify_str
#     TEXT_WIDTH = 30


#     content_table = [
#         ['t1', 't2', 'q_a_table', 'diffs']
#     ]
#     for (t1, t2), questions in questions_dict.items():
#         # print('='*100)
#         # print(f'Questions and Answers between {t1} and {t2}')
#         diffs: List[Tuple[CellEntry]] = get_diff_nb_states(nb_states[t1], nb_states[t2])
#         diffs = [
#             [cell_t1.tabulate(text_width=TEXT_WIDTH), cell_t2.tabulate(text_width=TEXT_WIDTH)]
#             for cell_t1, cell_t2 in diffs
#         ]
#         diffs = tabulate([[table] for table in [
#             tabulate([diff], tablefmt='fancy_grid', headers=['t1', 't2'])
#             for diff in diffs
#         ]], tablefmt='fancy_grid')
#         # print(diffs)
#         questions = [
#             question[2:].strip()
#             for question in questions.split('\n')
#         ]
#         tabulated_question_answer = []
#         for i, (question, answer) in enumerate(zip(questions, answers_dict[(t1, t2)])):
#             # print('-'*100)
#             # print(f'Q{i}:')
#             question = prettify_str(question, TEXT_WIDTH)
#             # print(question)
#             # print(f'A{i}:')
#             answer = prettify_str(answer, TEXT_WIDTH)
#             # print(answer)

#             tabulated_question_answer.append([f'Q{i}', question, f'A{i}', answer])

#         q_a_table = tabulate(tabulated_question_answer, tablefmt='fancy_grid', headers=['Q', 'Question', 'A', 'Answer'])

#         content_table.append([t1, t2, q_a_table, diffs])

#         # print('='*100)

#     content_tabulated = tabulate(content_table, tablefmt='fancy_grid', headers='firstrow')
#     # print(content_tabulated)
#     # write to file

#     nb_parser_name = nb_parser.filepath.split('/')[-1]
#     nb_log_parser_name = nb_log_parser.filepath.split('/')[-1]
#     filepath_out = f'{nb_parser_name}_{nb_log_parser_name}_questions_answers.table'
#     with open(filepath_out, 'w') as f:
#         f.write(content_tabulated)

# write_qa_table_to_file(questions_dict, answers_dict, nb_states)
