In [1]:
import pandas as pd, numpy as np, babel as bl, docx as docx
from docx import Document
from docx.text.hyperlink import Hyperlink
import re
import os

In [2]:

# Ordered (pattern, level) pairs: the first pattern that matches a prompt decides
# which bucket ('E', 'M' or 'H') the essay is filed under.
_GRADE_LEVEL_PATTERNS = (
    ('.*3rd.*|.*5th.*', 'E'),
    ('.*6th.*|.*8th.*', 'M'),
    ('.*9th.*|.*12th.*', 'H'),
)


def _classify_level(prompt):
    """Return 'E', 'M' or 'H' for the first grade pattern matching ``prompt``, else None."""
    for pattern, level in _GRADE_LEVEL_PATTERNS:
        # re.match + '.' means the grade marker must sit on the first line of the text.
        if re.match(pattern, prompt):
            return level
    return None


def _store_essay(buckets, level, prompt, url, paragraphs):
    """Append the accumulated essay to ``buckets[level]`` when prompt, link, level and text all exist."""
    if prompt is None or url is None or level is None or not paragraphs:
        return
    essay_text = '\n\n'.join(paragraphs).strip()
    if essay_text:
        buckets[level].append({
            'Prompt_Link': url,
            'Prompt': prompt,
            'Essay': essay_text
        })


def process_docx(docx_path):
    """Parse one .docx file into prompt/link/essay records grouped by level.

    The title is derived from the file name: the name (without extension) is
    split at the first ``--``; the last run of digits found on the left side,
    if any, is joined to the right side as ``"<number>--<right>"``. Without a
    number the title is just the right side.

    The document's paragraphs are then scanned in order:

    * a paragraph starting with ``Generate`` opens a new prompt (closing the
      previous one) and its text decides the level via the grade patterns;
    * a paragraph starting with ``https://chatgpt.com/share`` is the link for
      the current prompt;
    * any other non-empty paragraph is essay text, but only once both a prompt
      and a link have been seen.

    An essay is kept only if it has a prompt, a link, a recognised level and
    non-empty text; anything else is dropped silently.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        ``{title: {'E': [...], 'M': [...], 'H': [...]}}`` where each list holds
        dicts with the keys ``'Prompt_Link'``, ``'Prompt'`` and ``'Essay'``
        (essay paragraphs joined by a blank line).
    """
    filename = os.path.splitext(os.path.basename(docx_path))[0]
    left_part, _, right_part = filename.partition('--')
    numbers = re.findall(r'\d+', left_part)
    number = numbers[-1] if numbers else ''
    title = f"{number}--{right_part}" if number else right_part

    buckets = {'E': [], 'M': [], 'H': []}
    gen_txt_dict = {title: buckets}

    doc = Document(docx_path)

    current_prompt = None
    current_url = None
    current_essay = []
    current_level = None

    for para in doc.paragraphs:
        text = para.text.strip()
        if text.startswith('Generate'):
            # A new prompt closes whatever essay was being collected.
            _store_essay(buckets, current_level, current_prompt, current_url, current_essay)
            current_prompt = text
            current_url = None
            current_essay = []
            current_level = _classify_level(current_prompt)
        elif text.startswith("https://chatgpt.com/share"):
            current_url = text
        elif current_prompt is not None and current_url is not None and text:
            current_essay.append(text)

    # The last essay has no following prompt to trigger its storage.
    _store_essay(buckets, current_level, current_prompt, current_url, current_essay)

    return gen_txt_dict

def process_all_files(base_path, total_files=43):
    """Parse the numbered ``arc-text <i>--*.docx`` files found in ``base_path``.

    For every ``i`` from 1 to ``total_files`` the first file whose name starts
    with ``"arc-text <i>--"`` and ends with ``.docx`` is passed to
    ``process_docx``; numbers without a matching file are skipped. When several
    files match the same number, the alphabetically first name is used so the
    choice does not depend on the order the operating system lists entries in.

    Args:
        base_path: Directory containing the .docx files.
        total_files: Highest file number to look for (inclusive).

    Returns:
        A single dict merging the results of every ``process_docx`` call; a
        later file overwrites an earlier one that produced the same title.
    """
    master_dict = {}
    if total_files < 1:
        # Nothing to look for, so the directory is not touched at all.
        return master_dict

    # The listing does not change between iterations: read it once, sorted.
    docx_names = sorted(
        name for name in os.listdir(base_path) if name.endswith(".docx")
    )

    for i in range(1, total_files + 1):
        prefix = f"arc-text {i}--"
        match = next((name for name in docx_names if name.startswith(prefix)), None)
        if match is None:
            continue
        master_dict.update(process_docx(os.path.join(base_path, match)))

    return master_dict

# Directory holding the source .docx files (machine-specific absolute path).
base_path = '/Users/brtelfer/Documents/Personal Stuff/Python_Data_Projects/GenAI Texts/Input Texts'
# Parse every matching document into one {title: {'E': [...], 'M': [...], 'H': [...]}} dict.
final_dict = process_all_files(base_path)
# Prints the whole parsed structure, essay text included.
print(final_dict)



In [3]:
# Lookup table from a code of the form "<N>.<M>" (N runs 1-14, M runs 1-3) to
# the title of a text.
# NOTE(review): presumably N is a unit/week number and M the position of the
# text inside it - confirm with the curriculum source.
Key = {
    "1.1": "The Brain",
    "1.2": "Community College",
    "1.3": "Mount Everest",
    "2.1": "Parts of the Human Brain",
    "2.2": "What is Financial Literacy",
    "2.3": "Spicy Food",
    "3.1": "How Does Memory Work",
    "3.2": "The Taxes We Pay",
    "3.3": "The North Pole",
    "4.1": "Our Senses and the Brain",
    "4.2": "Soft Skills Part 1 Time Management and Professionalism",
    "4.3": "The Aztecs",
    "5.1": "How Do We Learn How to Speak",
    "5.2": "Soft Skills Part 2 Teamwork and Conflict Resolution",
    "5.3": "Where Did the Internet Come From",
    "6.1": "Why Do We Sleep Why Do We Dream",
    "6.2": "Career Pathways 1 How to Find Out About Jobs and Careers",
    "6.3": "The Taj Mahal",
    "7.1": "Personality",
    "7.2": "Career Spotlight Counselor",
    "7.3": "Should We Go to Outer Space",
    "8.1": "Concussions",
    "8.2": "Career Spotlight Trucker",
    "8.3": "Smart Phones",
    "9.1": "Human Brain Facts and Myths",
    "9.2": "Career Spotlight The Trades",
    "9.3": "Different Kinds of Art and Artists",
    "10.1": "What Does it Mean to Learn Something",
    "10.2": "Career Spotlight Registered Nurse and Other Healthcare Professionals",
    "10.3": "What is Diabetes",
    "11.1": "The Human Body Muscles and Bones",
    "11.2": "Career Spotlight Computer Technology",
    "11.3": "Wikipedia",
    "12.1": "The Human Body Heart and Lungs",
    "12.2": "Career Spotlight Office Manager",
    "12.3": "Different Styles of Food",
    "13.1": "The Human Body Lets Eat! The Digestive System",
    "13.2": "Career Spotlight Restaurant Workers",
    "13.3": "Sneakerheads",
    "14.1": "The Human Body The Immune System",
    "14.2": "Career Spotlight Be Your Own Boss",
    "14.3": "The Grand Canyon",
}

In [4]:
# Re-key the parsed results by bare title: drop the leading "<number>--" that
# process_docx puts in front of each title.
# The pattern is a raw string (a plain '\d' is an invalid escape sequence) and is
# anchored so only the prefix is removed, never a digits-plus-"--" run inside a title.
new_dict = {
    re.sub(r'^\d+--', '', key): value
    for key, value in final_dict.items()
}

In [5]:
def swap_key_value(original_dict):
    """Return a new dict that maps each value of ``original_dict`` to its key.

    Args:
        original_dict: Mapping to invert; its values must be hashable.

    Returns:
        The inverted dict. If several keys share a value, the key that comes
        last in iteration order wins.
    """
    # Invert the argument itself rather than the module-level Key table.
    return {value: key for key, value in original_dict.items()}
# Title -> code lookup, built by passing the Key table to swap_key_value.
swapped_dict = swap_key_value(Key)

In [6]:
# Attach a code to each parsed text by sorting both sides alphabetically by
# title and pairing them by position.
# NOTE(review): the pairing is positional, not a lookup by title - if the two
# sides differ in length or in how a title is spelled, zip() silently truncates
# or shifts the pairs. Confirm both lists line up before trusting keyed_dict.
#
# dict(...) accepts the original dict as well as the list of pairs this cell
# leaves behind, so re-running the cell no longer fails on `.items()`.
new_dict = sorted(dict(new_dict).items())
swapped_dict = sorted(dict(swapped_dict).items())
keyed_dict = {}
for (title, levels), (_key_title, code) in zip(new_dict, swapped_dict):
    # keyed_dict is keyed by code, so this title check is not expected to ever
    # be False; it is kept to leave the pairing behaviour untouched.
    if title not in keyed_dict:
        keyed_dict[code] = levels

In [7]:
# Flatten keyed_dict into {"<code>.<n>": essay text}, where n is 1, 2 or 3 for
# the 'E', 'M' and 'H' lists respectively.
# NOTE(review): every essay in a level list writes to the same key, so when a
# level holds several essays only the last one is kept - confirm this is intended.
essay_by_diff = {}
for code, by_level in keyed_dict.items():
    if code in essay_by_diff:
        continue
    for suffix, level in enumerate(('E', 'M', 'H'), start=1):
        for entry in by_level[level]:
            essay_by_diff[f'{code}.{suffix}'] = entry['Essay']

In [8]:
import pandas as pd
# One row per essay_by_diff entry: the dict keys become the index and the essay
# text the row's value (a single unnamed column is expected for plain string values).
texts = pd.DataFrame.from_dict(essay_by_diff, orient='index')

In [9]:
import pandas as pd
import os

def dataframe_to_txt_files(df, output_dir="/Users/brtelfer/Documents/Personal Stuff/Python_Data_Projects/GenAI Texts/Cleaned Input Texts"):
    """Write each row of ``df`` to its own UTF-8 text file.

    The file for a row is ``<output_dir>/<index label>.txt`` and contains the
    ``str()`` of every column value of that row, one per line, in column order.
    ``output_dir`` is created if it does not exist; existing files with the
    same name are overwritten.

    Args:
        df: DataFrame whose rows are exported.
        output_dir: Destination directory for the ``.txt`` files.
    """
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): these calls change pandas' global display settings and stay
    # in effect after the function returns; they do not look necessary for
    # writing plain cell values - confirm before removing.
    for option in ('display.max_colwidth', 'display.max_columns'):
        pd.set_option(option, None)

    column_names = list(df.columns)
    for label, record in df.iterrows():
        target_path = os.path.join(output_dir, f"{label}.txt")
        body = "\n".join(str(record[name]) for name in column_names)
        with open(target_path, "w", encoding='utf-8') as handle:
            handle.write(body)

# Export every row of `texts` to its own .txt file in the function's default output directory.
dataframe_to_txt_files(texts)

In [10]:
import os

# Sanity check after the export: count the entries in the output directory.
# os.listdir returns every entry (any file type, and sub-directories), so the
# number is only the essay count if the directory holds nothing else.
lst = os.listdir('/Users/brtelfer/Documents/Personal Stuff/Python_Data_Projects/GenAI Texts/Cleaned Input Texts') # your directory path
number_files = len(lst)
print(number_files)

117
