# Message generation


## Setup

In [42]:
# Define constants
# File locations (relative paths, resolved against the current working directory)
INPUT_FILE = "../data/sentiment_results.csv"  # sentiment-labelled posts, loaded with pd.read_csv
USER_TOPIC_OUTPUT_FILE = "../data/user_topic.csv"  # receives the user/topic pairs
TOPICS_OUTPUT_FILE = "../data/topics.csv"  # receives the top words (and later the template message) of each topic
OUTPUT_FILE = "../data/generated_messages.csv"  # receives the final message of each user

# Trial description; it is interpolated into the prompt sent to the LLM when the template messages are generated
TRIAL_CONTEXT = """Chronic Pain Management Study: This clinical trial investigates an innovative non-pharmacological treatment combining targeted neurostimulation with cognitive behavioral therapy for chronic pain management. The study aims to evaluate the effectiveness of this integrated approach in reducing pain intensity and improving quality of life for participants with various chronic pain conditions. The treatment protocol involves weekly sessions over a 12-week period, with follow-up assessments at 3 and 6 months. Participants will have access to state-of-the-art facilities and experienced medical professionals throughout the study. The trial has been approved by the Institutional Review Board and follows all ethical guidelines for human subject research."""
N_TOPICS = 5    # number of topics to use for topic modeling, this is also the number of template messages to generate


In [25]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from openai import OpenAI
import json

# Download the NLTK data used by the preprocessing step: the stopword lists
# and the WordNet data needed by WordNetLemmatizer (run this once)
nltk.download('stopwords')
nltk.download('wordnet')

# Set OpenAI API key
# The key is read from ../config/credentials.json, which is expected to look
# like {"openai": {"api_key": "..."}} (see the lookup below)
with open('../config/credentials.json', 'r') as f:
    credentials = json.load(f)

# Client used for the chat-completion requests that generate the template messages
client = OpenAI(api_key=credentials['openai']['api_key'])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charliesun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charliesun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load data and preprocess

In [6]:
# Load the sentiment-labelled posts from INPUT_FILE ('../data/sentiment_results.csv')
data = pd.read_csv(INPUT_FILE)

# Preview the first rows
data.head()

Unnamed: 0,user,text,sentiment_label,sentiment_score
0,jarchivistswag,"I’ve had chronic nerve pain and severe, bilate...",Negative,0.961714
1,hotheadnchickn,"Hey, I am so sorry you are dealing with this. ...",Negative,0.697534
2,stradamus,"Yeah it’s always frustrating, I get the commen...",Negative,0.58759
3,manicpixietrainwreck,I’m applying the the UDN (Undiagnosed ) progra...,Neutral,0.500012
4,,[deleted],Neutral,0.652426


In [11]:
# Keep a row only when all of the following hold:
#   - its sentiment label is Neutral or Positive
#   - its text is present and is not the '[deleted]' marker
#   - its user is present and is not the '[deleted]' marker
wanted_sentiment = data['sentiment_label'].isin(['Neutral', 'Positive'])
usable_text = data['text'].notna() & (data['text'] != '[deleted]')
usable_user = data['user'].notna() & (data['user'] != '[deleted]')

# Apply the three conditions at once and renumber the surviving rows from 0
data = data[wanted_sentiment & usable_text & usable_user].reset_index(drop=True)

data.head()

Unnamed: 0,user,text,sentiment_label,sentiment_score
0,manicpixietrainwreck,I’m applying the the UDN (Undiagnosed ) progra...,Neutral,0.500012
1,Clawhands2022,You can also look at prestigious universities ...,Neutral,0.648384
2,TheIdealHominidae,"Hi,\n\nMany of you suffer from the lack of dia...",Neutral,0.595388
3,Bbkingml13,Unfortunately I don’t think any insurance comp...,Positive,0.41657
4,Foxxinsocks,My doctor just informed me of this resource. I...,Positive,0.932868


In [15]:
# Preprocess the text data
# The stopword set and the lemmatizer are built once, on first use, and then
# shared by every call to preprocess_text.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        # stopwords.words() hands back a list; evaluating it per token would
        # rebuild that list and scan it linearly for every single word.
        # A frozenset built once gives constant-time membership tests.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize one document for topic modeling.

    Steps: drop every character that is not an ASCII letter or whitespace,
    lowercase, split on whitespace, discard English stopwords and lemmatize
    the remaining tokens.

    Args:
        text: Raw document. Anything that is not a ``str`` (e.g. NaN, None)
            is treated as an empty document.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces
        (``''`` for non-string input).
    """
    if not isinstance(text, str):
        # Handle non-string inputs (e.g., NaN, None)
        return ''
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # NOTE(review): apostrophes are stripped before the stopword check, so
    # contractions end up as tokens such as "im" / "ive" and may survive the
    # filter - confirm whether that is intended.
    stop_words, lemmatizer = _get_preprocess_resources()
    # Tokenize, remove stopwords and lemmatize
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)

# Clean every post and store the result in a new 'clean_text' column
data['clean_text'] = data['text'].map(preprocess_text)

data.head()

Unnamed: 0,user,text,sentiment_label,sentiment_score,clean_text
0,manicpixietrainwreck,I’m applying the the UDN (Undiagnosed ) progra...,Neutral,0.500012,im applying udn undiagnosed program clinical t...
1,Clawhands2022,You can also look at prestigious universities ...,Neutral,0.648384,also look prestigious university area clinical...
2,TheIdealHominidae,"Hi,\n\nMany of you suffer from the lack of dia...",Neutral,0.595388,hi many suffer lack diagnosis sometimes lack t...
3,Bbkingml13,Unfortunately I don’t think any insurance comp...,Positive,0.41657,unfortunately dont think insurance company fla...
4,Foxxinsocks,My doctor just informed me of this resource. I...,Positive,0.932868,doctor informed resource thought id share guy ...


## Perform topic modeling to group users into distinct segments

In [19]:
# Perform topic modeling to group users into distinct segments
# Create a document-term matrix of raw term counts.
# max_df=0.95 ignores terms that occur in more than 95% of the documents,
# min_df=2 ignores terms that occur in fewer than two documents.
vectorizer = CountVectorizer(max_df=0.95, min_df=2)
dtm = vectorizer.fit_transform(data['clean_text'])

# Apply Latent Dirichlet Allocation
lda = LatentDirichletAllocation(
    n_components=N_TOPICS,  # number of topics, taken from the N_TOPICS constant
    random_state=42  # fixed seed for reproducible results
)
lda.fit(dtm)

In [20]:
# Topic weights of every document: one row per post, one column per topic
topic_values = lda.transform(dtm)
# Label each post with the index of its highest-weighted topic
data['topic'] = np.argmax(topic_values, axis=1)

data.head()

Unnamed: 0,user,text,sentiment_label,sentiment_score,clean_text,topic
0,manicpixietrainwreck,I’m applying the the UDN (Undiagnosed ) progra...,Neutral,0.500012,im applying udn undiagnosed program clinical t...,4
1,Clawhands2022,You can also look at prestigious universities ...,Neutral,0.648384,also look prestigious university area clinical...,4
2,TheIdealHominidae,"Hi,\n\nMany of you suffer from the lack of dia...",Neutral,0.595388,hi many suffer lack diagnosis sometimes lack t...,3
3,Bbkingml13,Unfortunately I don’t think any insurance comp...,Positive,0.41657,unfortunately dont think insurance company fla...,0
4,Foxxinsocks,My doctor just informed me of this resource. I...,Positive,0.932868,doctor informed resource thought id share guy ...,4


In [21]:
# Extract top words for each topic
def get_topic_words(model, feature_names, n_top_words):
    """Map each topic index to its ``n_top_words`` highest-weighted words.

    Args:
        model: Fitted topic model exposing ``components_``; iterating it
            yields one array of per-word weights per topic.
        feature_names: Sequence mapping a column index of ``components_``
            to the corresponding word.
        n_top_words: Number of words to keep per topic; must be >= 0.
            Values larger than the vocabulary return every word.

    Returns:
        dict: ``{topic_index: [word, ...]}``. Words are ordered by
        increasing weight, so the strongest word of a topic comes last.

    Raises:
        ValueError: If ``n_top_words`` is negative.
    """
    if n_top_words < 0:
        raise ValueError(f"n_top_words must be non-negative, got {n_top_words}")

    topics = {}
    for idx, topic in enumerate(model.components_):
        # argsort is ascending, so the heaviest words sit at the end.
        ranked = topic.argsort()
        # Slice from an explicit start offset: a `[-n:]` slice with n == 0
        # would select every word instead of none.
        start = max(len(ranked) - n_top_words, 0)
        topics[idx] = [feature_names[i] for i in ranked[start:]]
    return topics

n_top_words = 10  # number of highest-weighted words kept per topic
# Vocabulary learned by the vectorizer; get_topic_words indexes it with the
# column positions of lda.components_
feature_names = vectorizer.get_feature_names_out()
topic_words = get_topic_words(lda, feature_names, n_top_words)

# Display top words for each topic
for topic_num, words in topic_words.items():
    print(f"Topic {topic_num}: {', '.join(words)}")

Topic 0: could, teen, depression, dysphoria, would, wound, treatment, scale, study, gender
Topic 1: ive, specialty, already, negative, insurance, company, would, im, drug, research
Topic 2: trial, back, new, time, get, good, therapy, much, mg, pain
Topic 3: case, high, treatment, study, use, many, patient, good, one, disease
Topic 4: like, time, get, patient, research, help, im, pain, clinical, trial


In [36]:
# Persist the topic assigned to each user's post
data.loc[:, ['user', 'topic']].to_csv(USER_TOPIC_OUTPUT_FILE, index=False)

# Persist the topic words: one row per topic (index column "topic") and
# one column per word ("word1", "word2", ...)
word_columns = [f'word{position}' for position in range(1, n_top_words + 1)]
topic_words_df = (
    pd.DataFrame(topic_words)
    .T  # topics become rows
    .set_axis(word_columns, axis=1)
    .rename_axis('topic')
)
topic_words_df.to_csv(TOPICS_OUTPUT_FILE)


## Generate template messages for each topic

In [44]:
def generate_invitation_message(top_words, max_tokens=400):
    """Ask the LLM for an invitation template tailored to one topic.

    The prompt combines ``TRIAL_CONTEXT`` with the topic's keywords and asks
    for a message of at most 150 words that uses ``{PLACEHOLDER}`` markers
    (participant name, trial name, institution, contact info, location,
    start date) to be filled in later.

    Args:
        top_words: Iterable of keyword strings describing the topic.
        max_tokens: Upper bound on the completion length in tokens. The
            default leaves headroom above the 150-word limit requested in
            the prompt; a tighter cap cuts the message off mid-sentence.

    Returns:
        str: The generated template with surrounding whitespace removed
        (``''`` if the response carries no text content).

    Warns:
        UserWarning: If the completion stopped because ``max_tokens`` was
            reached, i.e. the returned template is truncated.
    """
    import warnings  # local import: only needed to report truncated completions

    prompt = f"""
You are a professional assistant tasked with writing a personalized invitation message for a clinical trial.
Use this trial context to inform your message:
{TRIAL_CONTEXT}

Write a concise invitation message (max 150 words) that:
1. Relates to these keywords from the participant's discussion history: {', '.join(top_words)}
2. Highlights relevant aspects of the trial that would appeal to someone interested in these topics
3. Maintains a professional yet warm tone
4. Includes a clear call to action

Use these placeholders where appropriate:
- {{PARTICIPANT_NAME}} - for the participant's name
- {{TRIAL_NAME}} - for the specific clinical trial name
- {{INSTITUTION}} - for the research institution's name
- {{CONTACT_INFO}} - for contact information
- {{LOCATION}} - for the trial location
- {{START_DATE}} - for when the trial begins
"""

    response = client.chat.completions.create(
        model="gpt-4o",  # Changed to GPT-4o for better quality
        messages=[
            {"role": "system", "content": "You are a medical research coordinator writing personalized invitations for clinical trial participants."},
            {"role": "user", "content": prompt.strip()}
        ],
        max_tokens=max_tokens,
        temperature=0.7,
        n=1
    )

    choice = response.choices[0]
    if choice.finish_reason == 'length':
        # The model ran out of tokens before finishing the message.
        warnings.warn(
            f"Invitation message was truncated at max_tokens={max_tokens}; "
            "increase max_tokens to get the complete text."
        )

    # content may be None; fall back to an empty string instead of crashing on .strip()
    message = (choice.message.content or '').strip()
    return message

# Generate messages for each topic
# template_messages maps topic number -> template text returned by
# generate_invitation_message (one request per topic)
template_messages = {}
for topic_num, words in topic_words.items():
    message = generate_invitation_message(words)
    template_messages[topic_num] = message
    print(f"Generated message for Topic {topic_num}:\n{message}\n")

Generated message for Topic 0:
Subject: Invitation to Participate in a Pioneering Chronic Pain Management Study

Dear {PARTICIPANT_NAME},

We are excited to invite you to participate in our {TRIAL_NAME} at {INSTITUTION}, starting on {START_DATE}. This study explores an innovative treatment that combines targeted neurostimulation with cognitive behavioral therapy, potentially offering relief from chronic pain and associated conditions such as depression and gender dysphoria.

As someone interested in cutting-edge treatments, this trial presents a unique opportunity to participate in a study designed to improve pain intensity and quality of life. Our expert team will guide you through weekly sessions over 12 weeks, with follow-up evaluations to ensure comprehensive care.

Join us at {LOCATION} and contribute to groundbreaking research that could redefine chronic pain management. To learn more or enroll, please contact us at {CONTACT_INFO}.

We look forward to the possibility of your part

In [45]:
# Attach each topic's template to its row (rows are indexed by topic number)
# and rewrite the topics CSV with the extra column
topic_words_df['template_message'] = topic_words_df.index.map(template_messages)
topic_words_df.to_csv(TOPICS_OUTPUT_FILE)

## Sample usage: generate a message for each user

In [46]:
# Each user receives the template written for their topic; the placeholders
# inside the template are filled in afterwards

# Create user-message pairs
user_messages = data[['user', 'topic']].assign(
    template_message=lambda frame: frame['topic'].map(template_messages)
)

In [47]:
# Mock data for template placeholders
mock_trial_data = {
    'TRIAL_NAME': 'Chronic Pain Management Study',
    'INSTITUTION': 'Medical Research Institute',
    'CONTACT_INFO': 'research@mri.org | (555) 123-4567',
    'LOCATION': 'New York City, NY',
    'START_DATE': 'March 1, 2024'
}

# Function to fill template with user-specific data
def fill_template(row):
    """Fill the ``{PLACEHOLDER}`` markers of one row's template message.

    ``{PARTICIPANT_NAME}`` is replaced with ``row['user']``; every other
    placeholder is looked up in ``mock_trial_data``. Placeholders without a
    known value are left untouched.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``'template_message'`` and ``'user'``.

    Returns:
        str: The filled-in message, or ``''`` when the row has no template
        text (e.g. NaN because no template exists for the row's topic).
    """
    message = row['template_message']
    if not isinstance(message, str):
        # Nothing to fill in for rows without a template
        return ''

    # The user name is added last so it always wins for PARTICIPANT_NAME
    values = {**mock_trial_data, 'PARTICIPANT_NAME': row['user']}

    def substitute(match):
        key = match.group(1)
        # Unknown placeholders are kept verbatim
        return str(values[key]) if key in values else match.group(0)

    # Single pass over the template: text that was just inserted (such as a
    # user name that itself looks like a placeholder) is never re-scanned
    return re.sub(r'\{(\w+)\}', substitute, message)

# Build the final message of every user from their topic's template
user_messages['final_message'] = user_messages.apply(fill_template, axis=1)

# Write the user / final message pairs to OUTPUT_FILE
user_messages.loc[:, ['user', 'final_message']].to_csv(OUTPUT_FILE, index=False)

# Print the first two messages as a quick sanity check
print("Sample of generated messages:")
preview = user_messages.head(2)
for user, final_message in zip(preview['user'], preview['final_message']):
    print(f"\nFor user {user}:")
    print(final_message)

Sample of generated messages:

For user manicpixietrainwreck:
Subject: Invitation to Join the Chronic Pain Management Study at Medical Research Institute

Dear manicpixietrainwreck,

We're excited to invite you to participate in our Chronic Pain Management Study. This groundbreaking trial offers a unique opportunity to help advance research in chronic pain management. Combining targeted neurostimulation with cognitive behavioral therapy, our study is designed to reduce pain intensity and enhance your quality of life.

As someone who values the time and effort put into patient-centered research, we believe this trial could be a great fit for you. Our 12-week program includes weekly sessions, with follow-ups at 3 and 6 months, ensuring personalized support from our experienced medical team. You'll have access to state-of-the-art facilities at New York City, NY.

We hope you'll consider joining us in this important endeavor. To learn more or to enroll, please contact us at research@mri.or