This notebook calls the necessary functions to pass a data frame to an LLM and have it make suggestions based on the output:

In [1]:
# Loader used to read the raw dataset into a data frame.
from llm_data_checker import read_df

# If the kernel is holding a stale copy of llm_data_checker, reload it with:
#   import importlib, llm_data_checker
#   importlib.reload(llm_data_checker)
# and then re-run the `from llm_data_checker import ...` line above.

# NOTE: activate the virtual environment first (`source .venv/bin/activate`).
raw_csv = "data_test/Uncleaned_DS_jobs.csv"
data = read_df(raw_csv)

# Uncomment to preview the first rows:
# data.head()

Run this cell to check basic stats of the data frame and write them to `stats_output/stats.txt`:

In [None]:
from pathlib import Path

from llm_data_checker import df_checker_v2

# --- Optional structural-pattern dump (disabled) ---------------------------
# For the sake of transparency, get_structure_pattern was created by ChatGPT.
# It could be included, but it may cause privacy concerns if the structural
# patterns are too unique, so for now it is left out of the main function and
# only kept here for transparency. It may also cause token-limit issues when
# there are many unique patterns, which needs handling before it is enabled.
# NOTE: if re-enabled as written, it adds a "<col>_structure" column to `data`
# for every column, and those columns would then be passed to df_checker_v2.
#
# from llm_data_checker import get_structure_pattern
#
# structural_patterns = {}
# for col in data.columns:
#     data[f"{col}_structure"] = data[col].apply(get_structure_pattern)
#     structural_patterns[col] = data[f"{col}_structure"].unique().tolist()
#
# print("Structural patterns per column:")
# for col, patterns in structural_patterns.items():
#     print(f"Column '{col}': {patterns}")
# ---------------------------------------------------------------------------

check_data = df_checker_v2(data)

# Write the stats out so the prompt-building cells can read them back.
# The directory is created on demand so a fresh checkout does not fail with
# FileNotFoundError. The encoding is left at the platform default on purpose:
# the cells that read this file also use the default.
stats_file = Path("stats_output/stats.txt")
stats_file.parent.mkdir(parents=True, exist_ok=True)

with open(stats_file, "w") as f:
    for section_name, section_value in check_data.items():
        # Header line for this section.
        f.write(f"===== {section_name} =====\n")

        # The section's data, stringified as-is.
        f.write(str(section_value))

        # Blank line between sections.
        f.write("\n\n")

Structural patterns per column:
Column 'index': ['DIGIT(1)', 'DIGIT(2)', 'DIGIT(3)']
Column 'Job Title': ['ALPHA(2)-SPACE-ALPHA(4)-SPACE-ALPHA(9)', 'ALPHA(4)-SPACE-ALPHA(9)', "ALPHA(4)-SPACE-ALPHA(9)-SPACE-'/'-SPACE-ALPHA(7)-SPACE-ALPHA(8)-SPACE-ALPHA(6)", "ALPHA(5)-SPACE-ALPHA(4)-SPACE-ALPHA(9)-SPACE-'-'-SPACE-ALPHA(9)", "ALPHA(4)-SPACE-ALPHA(9)-SPACE-'-'-SPACE-ALPHA(10)-','-SPACE-ALPHA(5)-SPACE-ALPHA(6)", 'ALPHA(4)-SPACE-ALPHA(7)', 'ALPHA(11)-SPACE-ALPHA(4)-SPACE-ALPHA(9)', "ALPHA(4)-SPACE-ALPHA(9)-SPACE-'-'-SPACE-ALPHA(8)", 'ALPHA(4)-SPACE-ALPHA(7)-SPACE-ALPHA(2)', 'ALPHA(7)-SPACE-ALPHA(3)-SPACE-ALPHA(9)', "ALPHA(4)-SPACE-ALPHA(9)-'/'-ALPHA(7)-SPACE-ALPHA(8)", 'ALPHA(5)-SPACE-ALPHA(7)-SPACE-ALPHA(9)', "ALPHA(8)-SPACE-ALPHA(12)-SPACE-ALPHA(7)-SPACE-ALPHA(1)-'-'-SPACE-ALPHA(4)-SPACE-ALPHA(8)", "ALPHA(4)-SPACE-ALPHA(9)-SPACE-'-'-SPACE-ALPHA(4)", "ALPHA(4)-SPACE-ALPHA(9)-'-'-ALPHA(5)-SPACE-ALPHA(9)", "ALPHA(6)-SPACE-ALPHA(8)-SPACE-ALPHA(12)-'-'-SPACE-ALPHA(4)-SPACE-ALPHA(9)", 'ALPHA(4)-

The anonymiser for the data lives inside the df_checker function. It takes stats, shapes and patterns to inform the LLM's data cleaning suggestions.

Prompt builder:

In [3]:
from pathlib import Path

# Directory holding the prompt framework files.
framework_txt = Path("frameworks")

# Directory holding the data frame stats.
stats_txt = Path("stats_output")

# Load the system template and every fragment that gets substituted into it.
system_template = (framework_txt / "prompt.txt").read_text()

func_test_suite = (framework_txt / "func_test_suite.txt").read_text()
function_format = (framework_txt / "function_format.txt").read_text()
stats = (stats_txt / "stats.txt").read_text()
reasoning = (framework_txt / "reasoning.txt").read_text()
helper_reg = (framework_txt / "helper_reg.txt").read_text()

# Fill the template's named placeholders.
# NOTE: str.format treats every `{`/`}` in prompt.txt as a placeholder, so any
# literal braces in the template itself must be doubled (`{{`, `}}`). Braces
# inside the substituted values are inserted verbatim.
prompt = system_template.format(
    func_test_suite=func_test_suite,
    function_format=function_format,
    stats=stats,
    reasoning=reasoning,
    helper_reg=helper_reg,
)

# Persist the combined prompt (not the stats) for inspection. The directory is
# created on demand, and the file is written and read back as UTF-8 so
# non-ASCII characters in the prompt cannot fail on a non-UTF-8 locale.
combined_prompt_path = Path("final_prompt/combined_prompt.txt")
combined_prompt_path.parent.mkdir(parents=True, exist_ok=True)
combined_prompt_path.write_text(prompt, encoding="utf-8")

# Read back what was actually saved; this is what gets sent to the LLM.
combined_prompt = combined_prompt_path.read_text(encoding="utf-8")
print(len(combined_prompt))

10554


Call the LLM API with the combined prompt:


In [4]:
import os

from cerebras.cloud.sdk import Cerebras

# The API key is taken from the environment; an unset CEREBRAS_API_KEY
# raises KeyError here.
client = Cerebras(api_key=os.environ["CEREBRAS_API_KEY"])

# The combined prompt is sent as the system message; the user turn only
# tells the model to start.
chat_messages = [
    {"role": "system", "content": combined_prompt},
    {"role": "user", "content": "proceed"},
]

completion = client.chat.completions.create(
    model="zai-glm-4.7",
    messages=chat_messages,
    max_completion_tokens=8192,  # increase for complex analysis
    temperature=0.0,
    top_p=0.95,
    frequency_penalty=0.0,  # repetition penalty, left at 0.0
    presence_penalty=0.0,  # presence penalty, left at 0.0
)

# Text content of the first returned choice.
output = completion.choices[0].message.content

Write out results from LLM:

In [5]:
from pathlib import Path

# Save the raw LLM response. The target directory is created on demand so the
# write does not fail with FileNotFoundError on a fresh checkout.
out_path = Path("llm_cleaning/llm_output.txt")
out_path.parent.mkdir(parents=True, exist_ok=True)

# Last expression on purpose: the cell displays the number of characters written.
out_path.write_text(output, encoding="utf-8")

16508

Chop the .txt results from the cleaning operation (keep only the text after the marker) and save them so they can be fed into the LLM for suggestions:

In [6]:
from pathlib import Path

path = Path("llm_cleaning/llm_output.txt")
text = path.read_text(encoding="utf-8")

marker = "# === APPEND NEW TRANSFORM FUNCTIONS BELOW ==="

if marker in text:
    # Keep everything AFTER the first occurrence of the marker.
    chopped = text.split(marker, 1)[1]
else:
    # Fallback: forward the whole text, but say so, because the suggestions
    # prompt will then also contain everything that precedes the new functions.
    print(f"Warning: marker {marker!r} not found in {path}; using the full text.")
    chopped = text

# The target directory is created on demand so the write does not fail with
# FileNotFoundError on a fresh checkout.
output_path = Path("temp/llm_output_chopped.txt")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Last expression on purpose: the cell displays the number of characters written.
output_path.write_text(chopped.strip(), encoding="utf-8")

16001

Read in results from LLM for suggestion generation if desired:

In [7]:
import os
from pathlib import Path  # imported before first use so the cell runs on its own

from cerebras.cloud.sdk import Cerebras

# NOTE: this cell rebinds `system_template`, `stats`, `prompt`, `client`,
# `completion`, `output` and `out_path` from the earlier cells. Re-running the
# "write out results from LLM" cell after this one would therefore save the
# suggestions text into llm_cleaning/llm_output.txt.

# Directory holding the prompt framework files.
framework_txt = Path("frameworks")

# Directory holding the data frame stats.
stats_txt = Path("stats_output")

# Load the suggestions template and the two pieces substituted into it.
system_template = (framework_txt / "suggestions_format.txt").read_text()

stats = (stats_txt / "stats.txt").read_text()
chopped_output = Path("temp/llm_output_chopped.txt").read_text(encoding="utf-8")

# Fill the template's named placeholders.
prompt = system_template.format(
    stats=stats,
    llm_output_chopped=chopped_output,
)

# The API key is taken from the environment; an unset CEREBRAS_API_KEY
# raises KeyError here.
client = Cerebras(api_key=os.environ["CEREBRAS_API_KEY"])

completion = client.chat.completions.create(
    model="zai-glm-4.7",
    messages=[
        {"role": "system", "content": prompt},
        {"role": "user", "content": "proceed"},
    ],
    max_completion_tokens=8192,  # increase for complex analysis
    temperature=0.0,
    top_p=0.95,
    frequency_penalty=0.0,  # repetition penalty, left at 0.0
    presence_penalty=0.0,  # presence penalty, left at 0.0
)

# Text content of the first returned choice.
output = completion.choices[0].message.content

# Save the suggestions. The target directory is created on demand so the
# write does not fail with FileNotFoundError on a fresh checkout.
out_path = Path("llm_suggestions/llm_suggestions.txt")
out_path.parent.mkdir(parents=True, exist_ok=True)

# Last expression on purpose: the cell displays the number of characters written.
out_path.write_text(output, encoding="utf-8")



3265