# Setup

In [4]:
import praw
import pandas as pd
from dotenv import load_dotenv
import os
pd.set_option('display.max_colwidth', 100)
import openai
import time

# Reddit Scraping on Posts

In [3]:
# Read a local .env file (if one exists) into the process environment so the
# os.getenv() lookups below can find the Reddit credentials. Without this call
# the lookups only succeed when the variables were exported outside the notebook.
load_dotenv()

# Read-only Reddit client; the credentials come from the environment, never from the notebook.
reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    user_agent="Eatmosphere_User_Research"
)

In [301]:
# Search phrases. Each one is wrapped in literal double quotes before being
# sent to Reddit search (presumably to request exact-phrase matching).
keywords = [
    f'"{phrase}"'
    for phrase in (
        "relationship with food",
        "eating disorder",
        "binge eating",
        "binge eat",
        "stress eating",
        "stress eat",
    )
]

# Scrape thresholds used by the collection cells.
submission_search_limit = 100   # `limit` passed to each keyword search
submission_score = 200          # a submission must score strictly above this
submission_comment_num = 20     # a submission must have strictly more comments than this
comment_score = 100             # a comment must score at least this

In [302]:
# Collect self-posts that match any keyword and clear the score / comment-count
# thresholds, keeping each submission only once across keywords.
SUBMISSION_COLUMNS = ["submission_title", "body", "submission_id", "created_utc"]

data = []                     # one dict per accepted submission
unique_submissions = set()    # IDs already accepted, used to skip repeats across keywords

for keyword in keywords:
    for submission in reddit.subreddit("all").search(keyword, limit=submission_search_limit):
        if submission.id in unique_submissions:
            continue
        if not submission.is_self:
            continue
        if submission.score <= submission_score or submission.num_comments <= submission_comment_num:
            continue

        unique_submissions.add(submission.id)  # mark as seen
        data.append({
            "submission_title": submission.title,
            "body": submission.selftext,
            "submission_id": submission.id,
            "created_utc": submission.created_utc
        })

# Passing the columns explicitly keeps the frame well-formed when nothing passes
# the filters; otherwise the created_utc lookup raises KeyError on an empty frame.
# The chain builds a fresh frame instead of mutating one in place.
df_submission = (
    pd.DataFrame(data, columns=SUBMISSION_COLUMNS)
    .assign(created_time=lambda frame: pd.to_datetime(frame["created_utc"], unit='s'))
    .drop(columns=["created_utc"])
)

In [303]:
# Report how many submissions were kept, then preview the first rows.
print(f"Total unique submissions collected: {df_submission.shape[0]}")
df_submission.head()

Total unique submissions collected: 143


Unnamed: 0,submission_title,body,submission_id,created_time
0,"LPT: If you want to stop overeating and improve your relationship with food, only eat in your di...","The rule is that you can eat however much you want, but you can't be watching videos / scrolling...",iv8h0t,2020-09-18 15:42:57
1,My sister just said in conversation: “It’s disturbing. I’ve never met a woman who doesn’t have a...,Damn.\n\nThinking of the third graders I knew on diets. The women starving themselves for their ...,kjqocu,2020-12-25 01:25:09
2,"The “relationship with food” narrative is a scam, and we have been gaslit for years",I am so tired of hearing about “healing your relationship with food.” Food is not a person. Ther...,1ijttfp,2025-02-07 12:25:19
3,"My therapist said the #1 thing her ADHD clients seek help for is food. So, what’s your relations...",This blew my mind. It soo doesn’t get talked about enough. \n\nI joked with her that I have an e...,1c6u8e8,2024-04-18 04:10:11
4,[DISCUSSION] The United States has a strange relationship with food.,"Hello everyone! My name is Andrew, I'm a 20's-something game developer from the United States an...",drvyh0,2019-11-05 07:37:06


In [304]:
# For every collected submission, gather the comments whose score reaches
# `comment_score`.
COMMENT_COLUMNS = ["submission_title", "body", "submission_id", "comment_id", "created_utc"]

submission_ids = df_submission['submission_id'].unique().tolist()
data = []

for sid in submission_ids:
    submission = reddit.submission(id=sid)
    # limit=0 drops every unexpanded "MoreComments" placeholder without fetching
    # it, so only the comments already loaded with the submission are scanned.
    submission.comments.replace_more(limit=0)

    for comment in submission.comments.list():
        if comment.score >= comment_score:  # filter by comment upvotes
            data.append({
                "submission_title": submission.title,
                'body': comment.body,
                'submission_id': sid,
                'comment_id': comment.id,
                "created_utc": comment.created_utc
            })

# Passing the columns explicitly keeps the frame well-formed when no comment
# qualifies; otherwise the created_utc lookup raises KeyError on an empty frame.
df_comment = (
    pd.DataFrame(data, columns=COMMENT_COLUMNS)
    .assign(created_time=lambda frame: pd.to_datetime(frame["created_utc"], unit='s'))
    .drop(columns=["created_utc"])
)

In [305]:
# Report how many comments were kept, then preview the first rows.
print(f"Total unique comments collected: {df_comment.shape[0]}")
df_comment.head()

Total unique comments collected: 2646


Unnamed: 0,submission_title,body,submission_id,comment_id,created_time
0,"LPT: If you want to stop overeating and improve your relationship with food, only eat in your di...",Food was not always affordable for my Latino parents growing up. For this reason I was taught to...,iv8h0t,g5py817,2020-09-18 17:09:03
1,"LPT: If you want to stop overeating and improve your relationship with food, only eat in your di...",Putting the fork down between bites is a really good tip to slow down and let your brain catch u...,iv8h0t,g5q0y26,2020-09-18 17:30:08
2,"LPT: If you want to stop overeating and improve your relationship with food, only eat in your di...",Completely doesn't work for me. I've always had meals separate from computer time etc. My prob...,iv8h0t,g5pok6c,2020-09-18 15:58:02
3,"LPT: If you want to stop overeating and improve your relationship with food, only eat in your di...",I think a big thing too is simply limit portions. Buying smaller bowls and plates helps. Also ...,iv8h0t,g5q89wt,2020-09-18 18:30:52
4,"LPT: If you want to stop overeating and improve your relationship with food, only eat in your di...","My mom once told me that, when you take a deep breath during a meal, that's your body realizing ...",iv8h0t,g5pucto,2020-09-18 16:40:30


In [306]:
# Submissions have no comment ID: fill the column with a placeholder and align
# the column order with the comment frame so the two can be stacked.
df_submission = df_submission.assign(comment_id='n/a')[df_comment.columns]

# Comment rows first, submission rows after, with a fresh 0..n-1 index.
df = pd.concat([df_comment, df_submission], ignore_index=True)

In [307]:
# Preview the combined frame; comment rows come first because df_comment was
# concatenated ahead of df_submission.
df.head()

Unnamed: 0,submission_title,body,submission_id,comment_id,created_time
0,"LPT: If you want to stop overeating and improve your relationship with food, only eat in your di...",Food was not always affordable for my Latino parents growing up. For this reason I was taught to...,iv8h0t,g5py817,2020-09-18 17:09:03
1,"LPT: If you want to stop overeating and improve your relationship with food, only eat in your di...",Putting the fork down between bites is a really good tip to slow down and let your brain catch u...,iv8h0t,g5q0y26,2020-09-18 17:30:08
2,"LPT: If you want to stop overeating and improve your relationship with food, only eat in your di...",Completely doesn't work for me. I've always had meals separate from computer time etc. My prob...,iv8h0t,g5pok6c,2020-09-18 15:58:02
3,"LPT: If you want to stop overeating and improve your relationship with food, only eat in your di...",I think a big thing too is simply limit portions. Buying smaller bowls and plates helps. Also ...,iv8h0t,g5q89wt,2020-09-18 18:30:52
4,"LPT: If you want to stop overeating and improve your relationship with food, only eat in your di...","My mom once told me that, when you take a deep breath during a meal, that's your body realizing ...",iv8h0t,g5pucto,2020-09-18 16:40:30


In [308]:
# Encode the scrape thresholds in the file name so runs with different
# settings do not overwrite each other.
filename = (
    f"df_subLimit{submission_search_limit}"
    f"_subScore{submission_score}"
    f"_subComments{submission_comment_num}"
    f"_comScore{comment_score}.csv"
)
df.to_csv(filename, index=False)

# Identifying Struggles, Causes of Struggles, Solutions, and Pain Points with an LLM

## Parsing

In [5]:
# Read a local .env file (if one exists) so OPENAI_API_KEY is available even
# when this cell is the first one run after the imports on a fresh kernel.
load_dotenv()

# The key is read from the environment; do not paste it into the notebook.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [312]:
# Reload the scraped posts and comments from disk. The hard-coded name matches
# the file written by the export cell for the thresholds 100 / 200 / 20 / 100.
# NOTE(review): `input` shadows the built-in input(); the name is kept because
# the request loop further down reads input["body"].
input = pd.read_csv("df_subLimit100_subScore200_subComments20_comScore100.csv")
# input = input.sample(n=10)  # Randomly select n rows

In [313]:
# System prompt for the per-post extraction call. The four "Label: a; b; ..."
# lines it requests are exactly what the parsing cell splits on (": " between
# label and values), so the labels here and there must stay in sync.
prompt = """

You are a researcher analyzing Reddit posts about people's relationship with food.

For each post, extract the following insights using short, specific sentences only (no full/long sentences):

1. Struggle — What challenges does the person face in their relationship with food? (e.g., binge eating, food guilt, anorexia)
2. Cause of Struggle — What thoughts, emotions, or behaviors make them feel out of control, guilty, or anxious?
3. Solutions — What tools or strategies have they tried? (e.g., apps, communities, books, therapy)
4. Pain Points — What are the limitations or frustrations with those solutions?

Return the result in this exact format:

Struggle: struggle_1; struggle_2; ...
Cause of Struggle: cause_1; cause_2; ...
Solutions: solution_1; solution_2; ...
Pain Points: pain_1; pain_2; ...

If a section is not mentioned or not clear, return:
Struggle: none
Cause of Struggle: none
Solutions: none
Pain Points: none


"""

In [314]:
# Ask the model to extract struggles / causes / solutions / pain points for
# every row. `responses` holds one (row_position, raw_text) tuple per input
# row, in order; raw_text is '' when nothing usable came back.
MAX_ATTEMPTS = 3           # tries per row before giving up on it
RETRY_WAIT_SECONDS = 20    # pause after a failed request

responses = []

for i, row in enumerate(input["body"]):
    # Empty bodies come back from read_csv as float NaN; sending the text
    # "nan" to the model would only waste a request.
    if not isinstance(row, str) or not row.strip():
        responses.append((i, ''))
        continue

    txt = ''
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": f'"""{row}"""'}
                ]
            )
            # content may be None; normalise so the parsing cell can call .split()
            txt = response.choices[0].message.content or ''
            break
        except Exception as e:
            print(f"Error at {i} (attempt {attempt}/{MAX_ATTEMPTS}): {e}")
            time.sleep(RETRY_WAIT_SECONDS)  # back off before retrying or moving on

    responses.append((i, txt))

In [315]:
# Show only the first few raw responses; echoing the whole list puts thousands
# of tuples into the saved notebook output.
responses[:5]

[(0,
  'Struggle: Overeating; Food waste avoidance\nCause of Struggle: Fear of wasting food; Feeling the need to finish everything; Justification to prevent food from going bad\nSolutions: Trying out new strategies; Seeking support and ideas from others\nPain Points: Mental obstacles; Feeling stuck in the behavior; Lack of knowledge on alternative approaches'),
 (1,
  'Struggle: none\nCause of Struggle: none\nSolutions: Putting the fork down between bites\nPain Points: none'),
 (2,
  'Struggle: Overeating; Lack of feeling fullness\nCause of Struggle: No sense of fullness until overeating; Difficulty recognizing portion sizes\nSolutions: Portion control; Monitoring food intake\nPain Points: Lack of internal cues to stop eating; Difficulty in self-regulating portion sizes'),
 (3,
  'Struggle: Overeating; Snacking after meals\nCause of Struggle: Lack of portion control; Snacking habit\nSolutions: Using smaller bowls and plates; Avoiding second helpings; Cleaning up after eating\nPain Poin

In [316]:
# Turn each raw "Label: value" response into one row with the four expected
# fields; a label that is missing from the response becomes "none".
parsed = []
for _, entry in responses:
    fields = {}
    for line in entry.split("\n"):
        label, separator, value = line.partition(": ")
        if separator:  # only lines that contain ": " carry a field
            fields[label] = value
    parsed.append({
        name: fields.get(name, "none")
        for name in ("Struggle", "Cause of Struggle", "Solutions", "Pain Points")
    })

df_parsed_responses = pd.DataFrame(parsed)

In [317]:
# Display the parsed frame: one row per response, one column per extracted field.
df_parsed_responses

Unnamed: 0,Struggle,Cause of Struggle,Solutions,Pain Points
0,Overeating; Food waste avoidance,Fear of wasting food; Feeling the need to finish everything; Justification to prevent food from ...,Trying out new strategies; Seeking support and ideas from others,Mental obstacles; Feeling stuck in the behavior; Lack of knowledge on alternative approaches
1,none,none,Putting the fork down between bites,none
2,Overeating; Lack of feeling fullness,No sense of fullness until overeating; Difficulty recognizing portion sizes,Portion control; Monitoring food intake,Lack of internal cues to stop eating; Difficulty in self-regulating portion sizes
3,Overeating; Snacking after meals,Lack of portion control; Snacking habit,Using smaller bowls and plates; Avoiding second helpings; Cleaning up after eating,
4,none,none,none,none
...,...,...,...,...
2784,weight gain; self-image issues; lack of intimacy,stress eating/drinking; feeling unattractive; self-consciousness,self-reflection; communication with spouse,feeling unworthy; misunderstanding spouse's intentions; negative comments from others
2785,none,none,none,none
2786,weight fluctuations; stress eating; blood sugar control,stress from life changes; emotional eating in response to stress; societal pressure on body image,following a keto diet; increased physical activity; setting boundaries in social situations to c...,negative comments from friends about weight loss; feeling judged for dietary choices; struggle w...
2787,stress eating; depression; weight gain,grief from his mom's death; existential crisis,seeking professional therapy; encouraging healthier coping mechanisms,lack of communication and understanding between partners; difficulty in addressing the issue sen...


In [318]:
# Persist the parsed fields so the summarising section can start from disk.
filename = "parsed_output_2.csv"
df_parsed_responses.to_csv(filename, index=False)

## Summarizing Parsed Data

In [10]:
# keep_default_na=False keeps every cell a string. With the default NA
# handling, blank cells and model outputs such as "None" or "N/A" are read
# back as float NaN, which breaks the string methods used when flattening.
df_parsed_responses = pd.read_csv('parsed_output_2.csv', keep_default_na=False)

In [11]:
# def simple_counts(column):
#     flat_str = ";".join(df_parsed_responses[column].dropna().astype(str))
#     list = [s.strip() for s in flat_str.split(";") if s.strip().lower() not in {"none", "", "unclear", "not applicable", "not specified", "not mentioned"}]
#     list = [s.strip().lower() for s in list]
#     df = pd.Series(list).value_counts().reset_index()
#     df.columns = [column, 'Count']
#     return df
# struggle_counter = simple_counts("Struggle")
# cause_counter = simple_counts("Cause of Struggle")
# solution_counter = simple_counts("Solutions")
# pain_counter = simple_counts("Pain Points")

In [12]:
def flatten(column, frame=None):
    """Return every ';'-separated item of `column` as one flat list.

    Parameters
    ----------
    column : str
        Name of the column to flatten.
    frame : mapping of column name -> iterable of cells, optional
        Source table; defaults to the notebook-level `df_parsed_responses`.

    Returns
    -------
    list of str
        Stripped, non-empty items in row order. Cells that are not strings
        (e.g. NaN produced by read_csv for blank fields) and cells equal to
        "none" (case-insensitive, surrounding whitespace ignored) are skipped.
    """
    source = df_parsed_responses if frame is None else frame
    items = []
    for cell in source[column]:
        # Blank CSV fields are read back as float NaN, which has no .lower()
        if not isinstance(cell, str):
            continue
        if cell.strip().lower() == "none":
            continue
        for piece in cell.split(";"):
            piece = piece.strip()
            if piece:  # drop empties left by trailing or doubled ';'
                items.append(piece)
    return items

# One flat item list per extracted field, produced in the column order below.
struggles, causes, solutions, pain_points = (
    flatten(column_name)
    for column_name in ("Struggle", "Cause of Struggle", "Solutions", "Pain Points")
)


In [13]:
def summarize_with_gpt(label, items, model="gpt-3.5-turbo"):
    """Ask the chat model to group `items` into ranked themes.

    Parameters
    ----------
    label : str
        Human-readable description of what the items are; inserted into the prompt.
    items : iterable of str
        Raw items, sent to the model one per line in a single request.
    model : str, optional
        Chat model name; defaults to the model used elsewhere in this notebook.

    Returns
    -------
    str
        The model's reply, or "" when `items` is empty (no request is made)
        or the reply carries no text.
    """
    items = list(items)
    if not items:
        # Nothing to summarise: skip the API call instead of prompting on an empty list
        return ""

    raw = "\n".join(items)
    prompt = f"""
    You are analyzing a list of '{label}'.

    Here are the raw items:
    {raw}

    Your task:
    - Group similar items under common themes
    - Count how many times each theme appears
    - Rank the themes from most to least common
    - Provide 1-2 example items for each theme

    Format:
    - [Theme] - [count]
        - Example: [item]
        - Example: [item]
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    # content may be None; callers write the result to a text file
    return response.choices[0].message.content or ""


In [14]:
# Summarise three of the four item lists; each (label, items) pair triggers
# one request, in the order listed.
output_struggles, output_solutions, output_pain_points = (
    summarize_with_gpt(theme_label, theme_items)
    for theme_label, theme_items in (
        ("struggles of people in their relationship with food", struggles),
        ("solutions to the struggles of people in their relationship with food", solutions),
        ("pain points of the solutions to people's struggle in their relationship with food", pain_points),
    )
)

In [15]:
import random

CAUSES_TO_DROP = 200       # number of cause items left out of the summary prompt
CAUSES_SAMPLE_SEED = 42    # fixed seed so a re-run drops the same items

# Choose the positions to drop exactly once. Calling random.sample() inside the
# comprehension's condition draws a new sample for every element, which makes
# the filter random per item, removes items by value rather than by position,
# and raises ValueError when fewer than CAUSES_TO_DROP items exist.
# Lists that are already no longer than CAUSES_TO_DROP are sent unchanged.
_n_drop = CAUSES_TO_DROP if len(causes) > CAUSES_TO_DROP else 0
_dropped_positions = set(random.Random(CAUSES_SAMPLE_SEED).sample(range(len(causes)), _n_drop))
causes_subset = [item for position, item in enumerate(causes) if position not in _dropped_positions]

output_causes = summarize_with_gpt("causes of the people's struggles in their relationship with food", causes_subset)

In [16]:
# Write each summary to its own UTF-8 text file, in the order listed.
summary_files = {
    "summary_struggle.txt": output_struggles,
    "summary_causes.txt": output_causes,
    "summary_solutions.txt": output_solutions,
    "summary_pain_points.txt": output_pain_points,
}
for summary_path, summary_text in summary_files.items():
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(summary_text)