We can use FLAN-T5 to generate clauses of the terms and conditions.

In [3]:
import os

# Name of the dataset directory; get_tc_files looks for
# <website>/documents folders beneath it.
DATADIR = "data_all_202503130545112"

# The dataset directory is resolved one level above the kernel's current
# working directory.
data_path = os.path.join(
    os.getcwd(),
    "../",
    DATADIR,
)


def load_text_file(file_path):
    """
    Read a UTF-8 text file and hand back everything it contains.

    Args:
        file_path (str): Location of the text file to read.

    Returns:
        str: The complete contents of the file.

    Raises:
        FileNotFoundError: If no file exists at ``file_path``.
        IOError: If opening or reading the file fails for another I/O reason.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            return handle.read()
    except FileNotFoundError:
        # Report the missing file, then let the caller see the original error.
        print(f"Error: The file at {file_path} was not found.")
        raise
    except OSError:
        # IOError is an alias of OSError in Python 3, so this catches the same
        # exceptions; the bare raise propagates the same exception object.
        print(f"Error: An error occurred while reading the file at {file_path}.")
        raise

def get_tc_files(website_name):
    """
    Retrieve the text content of terms and conditions files for a given website.

    Files are read from ``<data_path>/<website_name>/documents``. A file is
    selected when its name starts with ``TC_`` and ends with ``.txt``.

    Args:
        website_name (str): The name of the website whose terms and conditions
            files are to be retrieved.

    Returns:
        list of str: The text content of each terms and conditions file,
            ordered by file name so the result is reproducible.

    Raises:
        FileNotFoundError: If the documents directory or any TC_ file does not exist.
        IOError: If there is an error reading any of the TC_ files.
    """
    documents_path = os.path.join(data_path, website_name, 'documents')
    # os.listdir returns entries in arbitrary order; sort the matching names so
    # "Text 1", "Text 2", ... refer to the same documents on every run.
    tc_names = sorted(
        name
        for name in os.listdir(documents_path)
        if name.startswith('TC_') and name.endswith('.txt')
    )
    return [load_text_file(os.path.join(documents_path, name)) for name in tc_names]


In [12]:
# Pick one website and read the text of all of its TC_*.txt documents.
website_name = "23andMe"
sample_TC = get_tc_files(website_name=website_name)

In [14]:
# Report how many documents were loaded, then the character count of each.
print(f'{website_name} has {len(sample_TC)} TC files.')

for position, tc_text in enumerate(sample_TC, start=1):
    char_count = len(tc_text)
    print(f"Text {position} length: {char_count} characters")

23andMe has 4 TC files.
Text 1 length: 16905 characters
Text 2 length: 24197 characters
Text 3 length: 60862 characters
Text 4 length: 38579 characters


In [15]:
# Load the FLAN-T5 XXL tokenizer and seq2seq model from the Hugging Face Hub.
# (An earlier experiment used "meta-llama/Llama-2-7b-chat-hf" with
# AutoModelForCausalLM; that code path is no longer used here.)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single definition of the checkpoint id so the tokenizer and the model
# cannot drift apart.
FLAN_T5_CHECKPOINT = "google/flan-t5-xxl"

# NOTE: the first run downloads the weights as five safetensors shards
# (roughly 45 GB in total), so this cell is slow and needs ample disk space.
tokenizer = AutoTokenizer.from_pretrained(FLAN_T5_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_T5_CHECKPOINT)



tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/9.60G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/6.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Tokenize every sample TC document and report its length in tokens.
tokenized_sample_TC = []
for tc_text in sample_TC:
    tokenized_sample_TC.append(tokenizer(tc_text, return_tensors='pt'))

for position, encoding in enumerate(tokenized_sample_TC, start=1):
    # size(1) is read as the token count; presumably input_ids has shape
    # (1, num_tokens) for a single text — TODO confirm.
    token_count = encoding['input_ids'].size(1)
    print(f"Tokenized Text {position} length: {token_count} tokens")


In [None]:
# Load an image processor and an object-detection model from the Hugging Face Hub.
# NOTE(review): this rebinds `model`, which an earlier cell assigned to the
# FLAN-T5 seq2seq model — confirm that overwriting it is intended.
from transformers import AutoImageProcessor, AutoModelForObjectDetection

detection_checkpoint = "LeeRuben/cppe5_use_data_finetuning"
processor = AutoImageProcessor.from_pretrained(detection_checkpoint)
model = AutoModelForObjectDetection.from_pretrained(detection_checkpoint)

In [None]:
# # Load model directly
# from transformers import AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1", trust_remote_code=True)

