# Thought process for generating the dataset using AI
I first decided it would be best to have the AI generate a story in the form of journal entries. Once the writing was complete, we could begin the process of turning it into a dataset format.  
I found this to be a good approach, since machine learning models tend to have quite a few issues generating datasets directly. It would also allow us to create the most realistic dataset possible.  
After some research, I found the optimal size to be about 2-3 paragraphs per entry, with each paragraph being between 100 and 200 words. 


However, journal entries aren't neatly organized academic papers with well-thought-out points, correct grammar, and consistency. They're quite messy. The messiness of human writing was something I had to work with.   

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch

# Every artifact this notebook reads or writes lives under DATA_DIR;
# create the folder up front so later cells can write into it.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Location of the synthetic journal entries consumed by the load step.
RAW_ENTRIES_PATH = DATA_DIR.joinpath("journal_entries_synthetic_150.json")

# Report whether CUDA is visible to torch and, if it is, which device 0 is.
_gpu_available = torch.cuda.is_available()
print("Using GPU:", _gpu_available)
if _gpu_available:
    print("GPU name:", torch.cuda.get_device_name(0))


Using GPU: True
GPU name: NVIDIA GeForce RTX 4070 Laptop GPU


In [7]:
# Load the synthetic journal entries from the JSON file at RAW_ENTRIES_PATH.
df_entries = pd.read_json(RAW_ENTRIES_PATH)
# Sanity check: print (rows, columns), then preview the first rows as the cell output.
print(df_entries.shape)
df_entries.head()

(150, 4)


Unnamed: 0,entry_id,timestamp,theme,full_text
0,1,2024-01-01,anger,"I got frustrated over something small today, b..."
1,2,2024-01-02,anxiety,My mind kept looping over tiny details that sh...
2,3,2024-01-03,contentment,I enjoyed a quiet moment today that made me fe...
3,4,2024-01-04,anxiety,"I tried grounding techniques, but the tension ..."
4,5,2024-01-05,self_reflection,I spent some time analyzing my reactions today...


In [9]:
import spacy

# Only sentence boundaries are needed from spaCy in this notebook, so the listed
# components are switched off at load time. With the parser disabled, the
# rule-based sentencizer added below is what marks sentence boundaries.
# NOTE(review): components of the loaded pipeline that are not named in `disable`
# stay active — confirm none of them are unnecessary work for this use.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger"])  # keep it light
nlp.add_pipe("sentencizer")  # simple rule-based sentence splitter


<spacy.pipeline.sentencizer.Sentencizer at 0x2442772c8d0>

In [10]:
def _sentence_rows(entry):
    """Yield one record dict per non-empty sentence of a single entry row."""
    parsed = nlp(str(entry["full_text"]))
    # Strip each sentence span once and drop the ones that end up empty.
    stripped = (span.text.strip() for span in parsed.sents)
    for position, sentence in enumerate([s for s in stripped if s]):
        yield {
            "sentence_id": f"{entry['entry_id']}_{position}",
            "entry_id": entry["entry_id"],
            "sentence_index": position,
            "sentence_text": sentence,
            "timestamp": entry.get("timestamp"),
        }


# Collect the sentence records of every entry, then build the table in one go.
sent_records = []
entry_rows = tqdm(
    df_entries.iterrows(),
    total=len(df_entries),
    desc="Splitting into sentences",
)
for _, entry in entry_rows:
    sent_records.extend(_sentence_rows(entry))

df_sentences = pd.DataFrame(sent_records)
print(df_sentences.shape)
df_sentences.head()


Splitting into sentences: 100%|██████████| 150/150 [00:00<00:00, 154.56it/s]

(376, 5)





Unnamed: 0,sentence_id,entry_id,sentence_index,sentence_text,timestamp
0,1_0,1,0,"I got frustrated over something small today, b...",2024-01-01
1,1_1,1,1,I hate when I lose control like that.,2024-01-01
2,2_0,2,0,My mind kept looping over tiny details that sh...,2024-01-02
3,2_1,2,1,It's exhausting trying to quiet the noise.,2024-01-02
4,2_2,2,2,There was this moment where everything felt un...,2024-01-02


In [11]:
def chunk_entry_sentences(sent_df, max_sentences=4, max_words=80):
    """
    Pack the sentences of one entry into consecutive text chunks.

    Sentences are read from the ``sentence_text`` column in the row order of
    ``sent_df`` (the function does not sort; callers pass rows already
    ordered by sentence_index). The open chunk is closed before a sentence is
    added when it already holds ``max_sentences`` sentences, or when the
    sentence would push its whitespace-delimited word count above
    ``max_words``. A sentence that alone exceeds ``max_words`` still becomes
    a chunk of its own. Sentences without any words are skipped.

    Returns a list of strings, each the chunk's sentences joined by one space.
    """
    finished = []
    pending = []
    pending_words = 0

    sentences = (record["sentence_text"] for _, record in sent_df.iterrows())
    for sentence in sentences:
        n_words = len(sentence.split())
        if n_words == 0:
            continue

        # Close the open chunk first when this sentence no longer fits in it.
        if pending and not (
            len(pending) < max_sentences
            and pending_words + n_words <= max_words
        ):
            finished.append(" ".join(pending))
            pending, pending_words = [], 0

        pending.append(sentence)
        pending_words += n_words

    # Whatever is still open at the end forms the final chunk.
    if pending:
        finished.append(" ".join(pending))
    return finished

# Build chunk records entry by entry, then assemble the chunk table once.
chunk_records = []

for entry_key, entry_sents in tqdm(df_sentences.groupby("entry_id"), desc="Chunking sentences"):
    # Chunking relies on row order, so restore the original sentence order first.
    ordered = entry_sents.sort_values("sentence_index")
    entry_chunks = chunk_entry_sentences(ordered, max_sentences=4, max_words=80)
    # Every sentence of an entry carries the same timestamp; take it from the first row.
    entry_ts = ordered["timestamp"].iloc[0] if "timestamp" in ordered.columns else None

    chunk_records.extend(
        {
            "chunk_id": f"{entry_key}_c{position}",
            "entry_id": entry_key,
            "chunk_index": position,
            "chunk_text": text,
            "timestamp": entry_ts,
        }
        for position, text in enumerate(entry_chunks)
    )

df_chunks = pd.DataFrame(chunk_records)
print(df_chunks.shape)
df_chunks.head()


Chunking sentences: 100%|██████████| 150/150 [00:00<00:00, 1426.52it/s]

(150, 5)





Unnamed: 0,chunk_id,entry_id,chunk_index,chunk_text,timestamp
0,1_c0,1,0,"I got frustrated over something small today, b...",2024-01-01
1,2_c0,2,0,My mind kept looping over tiny details that sh...,2024-01-02
2,3_c0,3,0,I enjoyed a quiet moment today that made me fe...,2024-01-03
3,4_c0,4,0,"I tried grounding techniques, but the tension ...",2024-01-04
4,5_c0,5,0,I spent some time analyzing my reactions today...,2024-01-05


In [12]:
# Persist the sentence-level and chunk-level tables as Parquet files under DATA_DIR.
SENTENCES_PATH = DATA_DIR / "sentences.parquet"
CHUNKS_PATH = DATA_DIR / "chunks.parquet"

# index=False: the DataFrame index is not written to the files.
df_sentences.to_parquet(SENTENCES_PATH, index=False)
df_chunks.to_parquet(CHUNKS_PATH, index=False)

# Echo both destinations as the cell output.
SENTENCES_PATH, CHUNKS_PATH


(WindowsPath('data/sentences.parquet'), WindowsPath('data/chunks.parquet'))

In [13]:
from sentence_transformers import SentenceTransformer

# Run the embedding model on the GPU when torch can see one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Embedding device:", device)

# Load the all-mpnet-base-v2 sentence-transformers model onto the chosen device.
embed_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)


  from .autonotebook import tqdm as notebook_tqdm



Embedding device: cuda


In [14]:
from math import ceil

# Texts to embed, in the row order of df_sentences.
sentence_texts = df_sentences["sentence_text"].astype(str).tolist()

batch_size = 64
num_batches = ceil(len(sentence_texts) / batch_size)

# Encode one slice of at most `batch_size` texts per step, keeping each result
# so the pieces can be stacked back together in the original order.
sentence_batches = []
for start in tqdm(range(0, num_batches * batch_size, batch_size), desc="Embedding sentences"):
    texts = sentence_texts[start:start + batch_size]
    sentence_batches.append(
        embed_model.encode(
            texts,
            batch_size=len(texts),
            convert_to_numpy=True,
            show_progress_bar=False,
        )
    )

# One row per sentence, aligned with df_sentences.
sentence_embs = np.vstack(sentence_batches)
sentence_embs.shape


Embedding sentences: 100%|██████████| 6/6 [00:01<00:00,  3.76it/s]


(376, 768)

In [15]:
# Texts to embed, in the row order of df_chunks.
chunk_texts = df_chunks["chunk_text"].astype(str).tolist()

# Same batch size as the sentence embedding step.
num_batches = ceil(len(chunk_texts) / batch_size)

# Encode one slice of at most `batch_size` texts per step, keeping each result
# so the pieces can be stacked back together in the original order.
chunk_batches = []
for start in tqdm(range(0, num_batches * batch_size, batch_size), desc="Embedding chunks"):
    texts = chunk_texts[start:start + batch_size]
    chunk_batches.append(
        embed_model.encode(
            texts,
            batch_size=len(texts),
            convert_to_numpy=True,
            show_progress_bar=False,
        )
    )

# One row per chunk, aligned with df_chunks.
chunk_embs = np.vstack(chunk_batches)
chunk_embs.shape


Embedding chunks: 100%|██████████| 3/3 [00:00<00:00, 12.58it/s]


(150, 768)

In [16]:
# Persist both embedding matrices as .npy files under DATA_DIR.
SENT_EMB_PATH = DATA_DIR / "sentence_embeddings.npy"
CHUNK_EMB_PATH = DATA_DIR / "chunk_embeddings.npy"

# Rows were stacked in the order of df_sentences / df_chunks, so row i of each
# array belongs to row i of the matching table.
np.save(SENT_EMB_PATH, sentence_embs)
np.save(CHUNK_EMB_PATH, chunk_embs)

# Echo both destinations as the cell output.
SENT_EMB_PATH, CHUNK_EMB_PATH


(WindowsPath('data/sentence_embeddings.npy'),
 WindowsPath('data/chunk_embeddings.npy'))

In [17]:
# Row counts and embedding shapes: sentence/chunk counts should match the
# first dimension of the corresponding embedding matrix.
for label, value in (
    ("Entries:", df_entries.shape[0]),
    ("Sentences:", df_sentences.shape[0]),
    ("Chunks:", df_chunks.shape[0]),
    ("Sentence emb shape:", sentence_embs.shape),
    ("Chunk emb shape:", chunk_embs.shape),
):
    print(label, value)

# Spot-check one randomly drawn entry: original text → its sentences → its chunks.
sample_entry_id = df_entries["entry_id"].sample(1).iloc[0]
print("Sample entry_id:", sample_entry_id)

in_entries = df_entries["entry_id"] == sample_entry_id
in_sentences = df_sentences["entry_id"] == sample_entry_id
in_chunks = df_chunks["entry_id"] == sample_entry_id

print("\nOriginal text:")
print(df_entries.loc[in_entries, "full_text"].iloc[0])

print("\nSentences:")
display(df_sentences.loc[in_sentences, ["sentence_index", "sentence_text"]])

print("\nChunks:")
display(df_chunks.loc[in_chunks, ["chunk_index", "chunk_text"]])


Entries: 150
Sentences: 376
Chunks: 150
Sentence emb shape: (376, 768)
Chunk emb shape: (150, 768)
Sample entry_id: 8

Original text:
It felt like I was dragging myself through the day. Even small tasks seemed overwhelming. I felt really down today, like I was underwater emotionally.

Sentences:


Unnamed: 0,sentence_index,sentence_text
17,0,It felt like I was dragging myself through the...
18,1,Even small tasks seemed overwhelming.
19,2,"I felt really down today, like I was underwate..."



Chunks:


Unnamed: 0,chunk_index,chunk_text
7,0,It felt like I was dragging myself through the...
