# Read in the datasets

In [202]:
from pathlib import Path

import pandas as pd

# Single place to change if the datasets are moved: every mental-health
# corpus read in this cell lives below this folder.
MENTAL_HEALTH_DIR = Path("../datasets/custom_corpus/chatbot_sets/mental_health_sets")

# Load NLP Mental Health Conversations
nlp_convos_df = pd.read_csv(
    MENTAL_HEALTH_DIR / "NLP_mental_health_convos" / "nlp_convos.csv"
)

# Load Mental Chat 16k (Interview 6k subset)
interview_df = pd.read_csv(
    MENTAL_HEALTH_DIR / "mental_chat16k" / "interview_data_6k.csv"
)

# Load counsel_chat (clean version)
counsel_chat_df = pd.read_csv(
    MENTAL_HEALTH_DIR / "counsel_chat" / "counsel_chat-data.csv"
)

# Print (rows, columns) of each raw frame as a quick sanity check on the load
print(f"NLP Mental Health Convos: {nlp_convos_df.shape}")
print(f"Interview 6k: {interview_df.shape}")
print(f"Counsel Chat: {counsel_chat_df.shape}")


NLP Mental Health Convos: (3512, 2)
Interview 6k: (6310, 3)
Counsel Chat: (1482, 9)


## Explore NLP Mental Health Convos

### Intro to NLP Convos

In [203]:
display(nlp_convos_df.head())
display(nlp_convos_df.tail())

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


Unnamed: 0,Context,Response
3507,My grandson's step-mother sends him to school ...,Absolutely not! It is never in a child's best ...
3508,My boyfriend is in recovery from drug addictio...,I'm sorry you have tension between you and you...
3509,The birth mother attempted suicide several tim...,"The true answer is, ""no one can really say wit..."
3510,I think adult life is making him depressed and...,How do you help yourself to believe you requir...
3511,I just took a job that requires me to travel f...,hmm this is a tough one!


In [204]:
nlp_convos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3512 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   3512 non-null   object
 1   Response  3508 non-null   object
dtypes: object(2)
memory usage: 55.0+ KB


In [205]:
nlp_convos_df.describe()

Unnamed: 0,Context,Response
count,3512,3508
unique,995,2471
top,I have so many issues to address. I have a his...,Counseling ends when the client has received t...
freq,94,3


## Cleaning NLP Convos

### Remove rows with missing values

In [206]:
# Keep only rows whose 'Response' is present and is not blank/whitespace-only
nlp_convos_df = nlp_convos_df.loc[
    lambda frame: frame["Response"].notna() & frame["Response"].str.strip().ne("")
]


In [207]:
nlp_convos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3508 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   3508 non-null   object
 1   Response  3508 non-null   object
dtypes: object(2)
memory usage: 82.2+ KB


In [208]:
nlp_convos_df.describe()

Unnamed: 0,Context,Response
count,3508,3508
unique,995,2471
top,I have so many issues to address. I have a his...,Counseling ends when the client has received t...
freq,94,3


### Remove Complete Duplicates (rows where both Context AND Response are repeats)

In [209]:
# Keep the first occurrence of every (Context, Response) pair; later repeats are discarded
nlp_convos_df = nlp_convos_df.loc[
    ~nlp_convos_df.duplicated(subset=["Context", "Response"], keep="first")
]


In [210]:
nlp_convos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2740 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   2740 non-null   object
 1   Response  2740 non-null   object
dtypes: object(2)
memory usage: 64.2+ KB


In [211]:
nlp_convos_df.describe()

Unnamed: 0,Context,Response
count,2740,2740
unique,995,2471
top,I have so many issues to address. I have a his...,I'm sorry to hear that your brother has been h...
freq,47,2


## Final NLP Convo set

In [212]:
nlp_convos_df = nlp_convos_df.reset_index(drop=True)

print(f"NLP Mental Health Convos: {nlp_convos_df.shape}")


NLP Mental Health Convos: (2740, 2)


In [213]:
nlp_convos_df.sample(10)

Unnamed: 0,Context,Response
1577,We’ve been together almost three years. We arg...,"You can't fix this by yourself, but I applaud ..."
1959,What are some difficulties that a counselor ca...,"Each counselor will have their own list of ""di..."
1934,"Is it normal for people to cry during therapy,...",Crying during a therapy session is not unusual...
669,I am in my 30s with 2 kids and no job and I'm ...,I would want to know where do you put your nee...
1466,"I'm always listening to my husband, but it fee...",Does your husband want to listen to you?Find t...
2297,"I was the one who ended it, and I'm so glad I ...",EMDR therapy has shown great results for work ...
1600,"Ever since I was little, I loved the idea of l...",Attention is often something that is both want...
1207,I'm feeling rejected and frustrated. This is n...,"Every area in a relationship, including intima..."
1974,Does counseling really do anything that can he...,I love this question! Some people shy away fro...
1128,She's busy because her mom makes her clean all...,Maybe your girlfriend feels torn in her emotio...


## Explore Interview 6k

### Intro to Interview 6k

In [214]:
display(interview_df.head())
display(interview_df.tail())

Unnamed: 0,instruction,input,output
0,You are a helpful mental health counselling as...,I've been struggling with my mental health for...,I understand that you've been dealing with a s...
1,You are a helpful mental health counselling as...,I've been feeling overwhelmed with my caregivi...,"Your situation is complex, and it's important ..."
2,You are a helpful mental health counselling as...,I've been feeling constantly anxious and unabl...,I can see that you're dealing with a great dea...
3,You are a helpful mental health counselling as...,"My mom has Alzheimer's, and I've been her prim...",I'm sorry to hear that your siblings' demands ...
4,You are a helpful mental health counselling as...,"I've tried setting boundaries, but it feels li...","Your concerns are valid, and it's crucial to p..."


Unnamed: 0,instruction,input,output
6305,You are a helpful mental health counselling as...,I'm feeling overwhelmed and guilty about my un...,It's understandable that you're feeling a rang...
6306,You are a helpful mental health counselling as...,"I've been feeling overwhelmed lately, and I've...",I understand that you've been feeling overwhel...
6307,You are a helpful mental health counselling as...,"I understand what you're saying, but I can't s...",Your feelings of guilt and neglect are underst...
6308,You are a helpful mental health counselling as...,I've been thinking about my relationship with ...,It's important to acknowledge the emotional la...
6309,You are a helpful mental health counselling as...,I've been feeling stressed out lately because ...,Your words paint a vivid picture of the comple...


In [215]:
interview_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6310 entries, 0 to 6309
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  6310 non-null   object
 1   input        6310 non-null   object
 2   output       6310 non-null   object
dtypes: object(3)
memory usage: 148.0+ KB


In [216]:
interview_df.describe()

Unnamed: 0,instruction,input,output
count,6310,6310,6310
unique,1,6111,6210
top,You are a helpful mental health counselling as...,I've been dealing with a lot of logistical tas...,It's essential to recognize that caring for a ...
freq,6310,16,9


## Cleaning Interview Data

In [217]:
# Remove the "instruction" column: describe() reports a single unique value,
# i.e. the same system prompt on every row, so it adds nothing to the pairs.
# errors="ignore" makes the drop a no-op when the column is already gone
# (for example when this cell is executed a second time) instead of raising KeyError.
interview_df = interview_df.drop(columns=["instruction"], errors="ignore")

# Remove exact duplicate prompt/response pairs (first occurrence is kept)
interview_df = interview_df.drop_duplicates(subset=["input", "output"])



In [218]:
interview_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6233 entries, 0 to 6309
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   6233 non-null   object
 1   output  6233 non-null   object
dtypes: object(2)
memory usage: 146.1+ KB


In [219]:
interview_df.describe()


Unnamed: 0,input,output
count,6233,6233
unique,6111,6210
top,I've been feeling hopeless and suicidal lately...,Your concern for your mother's wellbeing is a ...
freq,8,6


## Final Interview 6k

In [220]:
interview_df = interview_df.reset_index(drop=True)
print(f"Interview 6k: {interview_df.shape}")


Interview 6k: (6233, 2)


In [221]:
interview_df.sample(10)

Unnamed: 0,input,output
3681,"I've been feeling quite isolated lately, as I'...",Your connection with [Name] is a beacon of hop...
2922,I've been struggling with the idea of starting...,Your concerns about the potential impact of st...
2593,"I'm worried about my wife, [Name]. She's not e...","Your concern for your wife, [Name], and her ea..."
3071,I've been feeling a deep sense of sadness and ...,"I hear you, and I want to validate your feelin..."
314,I've been feeling increasingly isolated and di...,It's understandable that you've been feeling i...
1,I've been feeling overwhelmed with my caregivi...,"Your situation is complex, and it's important ..."
5350,I've been given a list of lawyers to contact r...,"Your feelings are understandable, and it's ess..."
6144,I've been dealing with feelings of sadness and...,"I'm here to support you, and I want you to kno..."
4688,I've been putting off visiting my mother-in-la...,"Your concerns are valid, and it's natural to f..."
4022,I've never really thought about it that way. I...,I'm glad to hear that you're open to exploring...


## Explore Counsel Chat

### Intro to Counsel Chat

In [222]:
display(counsel_chat_df.head())
display(counsel_chat_df.tail())

Unnamed: 0,questionID,questionTitle,questionText,questionUrl,topics,therapistName,therapistUrl,answerText,upvotes
0,5566fab2a64752d71ec3ca69,Escalating disagreements between mother and wife,My wife and mother are having tense disagreeme...,https://counselchat.com/questions/escalating-d...,Family Conflict,"Kristi King-Morgan, LMSW",https://counselchat.com/therapists/kristi-king...,<p>What you are describing is something psycho...,0
1,5566f94fa64752d71ec3ca64,I'm addicted to smoking. How can I stop?,"I'm planning to have baby, so I have to quit s...",https://counselchat.com/questions/i-m-addicted...,"Substance Abuse,Addiction",Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>Hi. Good for you in planning ahead to do wh...,0
2,5567d26887a1cc0c3f3d8f46,Keeping secrets from my family,"I have secrets in my mind, and I don't know wh...",https://counselchat.com/questions/keeping-secr...,Family Conflict,Jeevna Bajaj,https://counselchat.com/therapists/jeevna-bajaj,<p>It sounds like keeping the secrets has beco...,0
3,556bed15c969ba5861709df5,The Underlying Causes of Being Possessive,I am extremely possessive in my relationships ...,https://counselchat.com/questions/the-underlyi...,"Behavioral Change,Social Relationships",Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>Hi there. It's great you are able to realiz...,0
4,556ba115c969ba5861709de6,Can I control anxiety without medication?,I had a head injury a few years ago and my min...,https://counselchat.com/questions/can-i-contro...,Anxiety,Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>You didn't say what or how many medications...,0


Unnamed: 0,questionID,questionTitle,questionText,questionUrl,topics,therapistName,therapistUrl,answerText,upvotes
1477,56d2f2aa9471b0b41ec68e4d,Is it healthy to embarrass a child as punishment?,My grandson's step-mother sends him to school ...,https://counselchat.com/questions/is-it-health...,"Parenting,Family Conflict",Candice Lawhorn,https://counselchat.com/therapists/candice-law...,<p>Absolutely not!&nbsp;</p><p>It is never in ...,0
1478,5797a411b43cd7825e26e246,How do I fix my relationship?,My boyfriend is in recovery from drug addictio...,https://counselchat.com/questions/how-do-i-fix...,"Relationships,Addiction","Sherry Katz, LCSW",https://counselchat.com/therapists/sherry-katz...,<p>I'm sorry you have tension between you and ...,0
1479,5796a111bc069dff6a5339ca,What are the long term effects of losing one's...,The birth mother attempted suicide several tim...,https://counselchat.com/questions/what-are-the...,"Family Conflict,Parenting,Children & Adolescents","Sherry Katz, LCSW",https://counselchat.com/therapists/sherry-katz...,"<p>The true answer is, ""no one can really say ...",0
1480,5795952cbc069dff6a5339aa,How do I help my 20 year old boyfriend who say...,I think adult life is making him depressed and...,https://counselchat.com/questions/how-do-i-hel...,"Relationships,Depression,Substance Abuse","Sherry Katz, LCSW",https://counselchat.com/therapists/sherry-katz...,<p>How do you help yourself to believe you req...,0
1481,5773e438b9ff751f196e8df0,I'm worried about my new job.,I just took a job that requires me to travel f...,https://counselchat.com/questions/i-m-worried-...,"Anxiety,Career Counseling",Philip Kolba,https://counselchat.com/therapists/philip-kolba,<p>hmm this is a tough one!</p>,0


In [223]:
counsel_chat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482 entries, 0 to 1481
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   questionID     1482 non-null   object
 1   questionTitle  1480 non-null   object
 2   questionText   1383 non-null   object
 3   questionUrl    1482 non-null   object
 4   topics         1472 non-null   object
 5   therapistName  1482 non-null   object
 6   therapistUrl   1482 non-null   object
 7   answerText     1482 non-null   object
 8   upvotes        1482 non-null   int64 
dtypes: int64(1), object(8)
memory usage: 104.3+ KB


In [224]:
counsel_chat_df.describe()

Unnamed: 0,upvotes
count,1482.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


### Cleaning Counsel Chat Data

In [225]:
# Keep only the question/answer text columns and discard the rest.
# errors="ignore" makes the drop a no-op for columns that are already gone,
# so executing this cell a second time does not raise KeyError.
counsel_chat_df = counsel_chat_df.drop(
    columns=[
        "questionID",
        "questionTitle",
        "questionUrl",
        "topics",
        "therapistName",
        "therapistUrl",
        "upvotes",
    ],
    errors="ignore",
)


In [226]:
counsel_chat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482 entries, 0 to 1481
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   questionText  1383 non-null   object
 1   answerText    1482 non-null   object
dtypes: object(2)
memory usage: 23.3+ KB


In [227]:
(counsel_chat_df.describe())

Unnamed: 0,questionText,answerText
count,1383,1482
unique,718,1478
top,I've gone to a couple therapy sessions so far ...,<p><br></p>
freq,22,5


In [228]:
display(counsel_chat_df.sample(10))

Unnamed: 0,questionText,answerText
2,"I have secrets in my mind, and I don't know wh...",<p>It sounds like keeping the secrets has beco...
915,I've been with a man for four years. For the l...,<p>That does sound very confusing...and hurtfu...
1462,My dad makes me feel like shit and like I'm wo...,<p>If you're actually able to afford to move o...
59,"About 3 years ago or so I was skinny, but I wa...",<p>Hey! I am so impressed with your efforts t...
1443,My fiancé and I come from a strong Christian b...,<p>You are right that his insecurities are at ...
1054,"My girlfriend was abused as a child. Now, if I...",Thank you for your question. &nbsp;I think it'...
699,I believe my partner has a masturbation and po...,<p>There is a lot of information out there rig...
1467,I am in a high stress position for a tech comp...,<p>Being in this position is tough. If seeking...
277,I feel that I am struggling with undiagnosed b...,It sounds like you are experiencing a great de...
1331,Does counseling really do anything that can he...,I love this question! Some people shy away fro...


### Drop Empty Rows

In [229]:
# Drop rows with a missing (NaN) question or answer
counsel_chat_df = counsel_chat_df.dropna(subset=["questionText", "answerText"])

# Visible text of each answer: HTML tags and &nbsp; entities are replaced by
# spaces and the result is stripped. Checking this instead of the single literal
# "<p><br></p>" also catches other markup-only answers (e.g. "<p>&nbsp;</p>" or
# repeated empty paragraphs), which would otherwise survive this step and turn
# into empty strings once the HTML is cleaned later in the notebook.
answer_visible_text = (
    counsel_chat_df["answerText"]
    .str.replace(r"<[^>]*>|&nbsp;", " ", regex=True)
    .str.strip()
)

# Keep rows where the question is non-blank and the answer has visible text
counsel_chat_df = counsel_chat_df[
    (counsel_chat_df["questionText"].str.strip() != "")
    & (answer_visible_text != "")
]

In [230]:

counsel_chat_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 1379 entries, 0 to 1481
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   questionText  1379 non-null   object
 1   answerText    1379 non-null   object
dtypes: object(2)
memory usage: 32.3+ KB


In [231]:
counsel_chat_df.describe()


Unnamed: 0,questionText,answerText
count,1379,1379
unique,718,1379
top,I've gone to a couple therapy sessions so far ...,<p>What you are describing is something psycho...
freq,22,1


### Clean HTML

In [232]:
import re
import html

def clean_html(text):
    """
    Convert an HTML fragment into plain, single-spaced text.

    Processing steps, in order:
      1. Missing values (None / NaN) become an empty string; other non-string
         values are converted with ``str()``.
      2. Block-level tags (<p>, <br>, <div>, <li>, headings, table cells, ...)
         are replaced by a space so that neighbouring paragraphs do not fuse
         into one word.
      3. HTML comments and all remaining (inline) tags are removed without
         inserting whitespace, so "un<b>believ</b>able" stays one word.
      4. HTML entities are decoded ("&amp;" -> "&", "&nbsp;" -> U+00A0).
      5. Every run of whitespace, including non-breaking spaces and newlines,
         is collapsed to a single regular space, mirroring how a browser
         renders HTML source.
      6. A missing space after sentence-ending punctuation is inserted
         ("word.Something" -> "word. Something") while decimals ("2.5") and
         single-letter initials ("U.S.A") are left untouched.

    Parameters
    ----------
    - text: str or None
        Raw string possibly containing HTML and HTML entities.

    Returns
    -------
    - cleaned_text: str
        Cleaned string, with HTML removed and sentence spacing fixed.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # Tolerate stray non-string cells (e.g. numbers) instead of crashing in re.sub
        text = str(text)

    # Block-level tags mark paragraph/line boundaries -> keep a separator
    text = re.sub(
        r'</?(?:p|br|hr|div|li|ul|ol|h[1-6]|tr|td|th|table|blockquote)\b[^>]*>',
        ' ',
        text,
        flags=re.IGNORECASE,
    )
    # Comments and remaining tags. A tag must start with a letter (optionally
    # preceded by "/" or "!"), so plain text such as "I <3 this" is preserved.
    # [^>] also matches newlines, so tags spanning several lines are removed.
    text = re.sub(r'<!--.*?-->|<[/!]?[A-Za-z][^>]*>', '', text, flags=re.DOTALL)

    # Convert HTML entities to unicode
    text = html.unescape(text)

    # str.split() with no argument splits on any Unicode whitespace (this
    # includes the U+00A0 produced by &nbsp;) and drops leading/trailing runs.
    text = " ".join(text.split())

    # Fix missing space between sentences (e.g., "word.Something" -> "word. Something").
    # The lookbehind skips single-letter initials so "U.S.A" is not split.
    text = re.sub(r'(?<!\b[A-Z])([.!?])(?=[A-Z])', r'\1 ', text)
    # Same fix before a digit ("done.2 more"), but never inside a number ("2.5").
    text = re.sub(r'(?<!\d)([.!?])(?=\d)', r'\1 ', text)

    return text.strip()

# Replace the answerText column with its HTML-free version (one clean_html call per row)
counsel_chat_df = counsel_chat_df.assign(
    answerText=counsel_chat_df["answerText"].map(clean_html)
)


In [233]:
counsel_chat_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1379 entries, 0 to 1481
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   questionText  1379 non-null   object
 1   answerText    1379 non-null   object
dtypes: object(2)
memory usage: 32.3+ KB


In [234]:
counsel_chat_df.describe()

Unnamed: 0,questionText,answerText
count,1379,1379
unique,718,1379
top,I've gone to a couple therapy sessions so far ...,What you are describing is something psycholog...
freq,22,1


## Final Counsel Chat

In [235]:
# Renumber the rows 0..n-1; the filtering steps above left gaps in the index
counsel_chat_df = counsel_chat_df.reset_index(drop=True)

print(f"Counsel Chat: {counsel_chat_df.shape}")

Counsel Chat: (1379, 2)


In [236]:
counsel_chat_df.sample(10)

Unnamed: 0,questionText,answerText
139,"Tonight, my husband seemed to put our son down...","I have had these many cases, but in situations..."
1071,We've been in a long distance relationship for...,The best way to get an answer is to just ask. ...
59,When having sex I think of other men and want ...,Sustaining passion in marriage is tough stuff....
535,I feel like every time I do something someone ...,It sounds like you have the perception that pe...
557,I am currently suffering from erectile dysfunc...,"Hello, and thank you for your question. I comp..."
759,"For the past four weeks, I've been having nigh...",Write down your nightmares and discuss them wi...
1060,My fiancé and I have been together for 3 years...,It's possible but challenging. Both partners n...
623,"It was over 20 years ago, but the pain has res...",A mediated safe talk session between. You and ...
576,I have PTSD from childhood events and other tr...,"When it comes to trauma, especially in the eve..."
264,"My mom and I have been fighting a lot now, and...",The best way to work on a relationship is for ...


# Normalize Column Names Across All 3 Mental Health Datasets

In [237]:
# Give every dataset the same two column names used by nlp_convos_df:
# 'Context' for the prompt and 'Response' for the reply
interview_df = interview_df.rename(
    columns={'input': 'Context', 'output': 'Response'}
)
counsel_chat_df = counsel_chat_df.rename(
    columns={'questionText': 'Context', 'answerText': 'Response'}
)


In [238]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() only adds a stray "None" after each report.
nlp_convos_df.info()
interview_df.info()
counsel_chat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2740 entries, 0 to 2739
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   2740 non-null   object
 1   Response  2740 non-null   object
dtypes: object(2)
memory usage: 42.9+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6233 entries, 0 to 6232
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   6233 non-null   object
 1   Response  6233 non-null   object
dtypes: object(2)
memory usage: 97.5+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1379 entries, 0 to 1378
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   1379 non-null   object
 1   Response  1379 non-null   object
dtypes: object(2)
memory usage: 21.7+ KB


None

In [239]:
# Progress toward the target corpus size: rows collected so far vs. the goal
goal = 17500
total = sum(len(frame) for frame in (nlp_convos_df, interview_df, counsel_chat_df))
print(f"Currently have {total} out of the desired {goal}.\nWe still need {goal - total} more conversations to reach our goal.")

Currently have 10352 out of the desired 17500.
We still need 7148 more conversations to reach our goal.


# Reading in the Pair Dataset

## Exploring pair_df

In [240]:
import pandas as pd

# Read CSV
pair_path = '../datasets/custom_corpus/chatbot_sets/mental_health_sets/pair_dataset/pair_data.csv'
pair_df = pd.read_csv(pair_path)

# Quick check
display(pair_df.sample(5))

Unnamed: 0,prompt,hq1,hq2,mq1,lq1,lq2,lq3,lq4,lq5
215,I don’t have a drinking problem. I drink less ...,At this point you’re not concerned about the a...,Growing up with an alcoholic as a father has h...,You're not convinced you have a problem with a...,You're convinced you're not an alcoholic. You ...,How do you know you're seeing yourself clearly?,"If your friends are alcoholics, then why would...",That's great that you aren't drinking as much ...,Are there any instances where you can think of...
49,"I smoked throughout my previous pregnancy, and...",Since you smoked during your last pregnancy an...,There's absolutely no reason for you to quit s...,You smoked during your last pregnancy so smoki...,Your child is still too young to know what the...,Do you know the long-term impacts your child m...,Wrong and you know very well what you did was ...,"Saying ""I did it and was fine"" is no excuse. T...",I'm sure it was fine but you don't realize the...
68,"I tried to quit smoking several times, however...",You weren't successful with quitting in the pa...,Getting pregnant gave you all the motivation y...,"You've tried many times to quit smoking, but r...","People who stop smoking for someone else, end ...",What are some better reasons besides the baby ...,That's great to hear. You will not regret maki...,"That's right, do it for your child, good thing...",Im really glad to hear that
100,"I don’t have to worry, the breast cancer is on...",If the history of breast cancer was on your mo...,You don't think you can inherit breast cancer ...,Since your family’s history of breast cancer i...,You don't understand how breast cancer Gene mu...,Did you know you can also inherit breast cance...,"Don't think like that and that's wrong, always...","Actually, breast cancer can be caused by a num...",I think you should speak to a doctor about this
216,What I like to do for fun is hang out with my ...,You’d only be concerned about your partying if...,Having a good time with your friends is import...,You like to hang out with your friends and par...,Next time the consequences could be worse. The...,Have you considered that something bad did hap...,That is something bad happened and this time y...,"If the accident was through no fault of yours,...","That sounds pretty bad, and expensive. I'm gla..."


In [241]:
pair_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   prompt  318 non-null    object
 1   hq1     318 non-null    object
 2   hq2     318 non-null    object
 3   mq1     318 non-null    object
 4   lq1     318 non-null    object
 5   lq2     318 non-null    object
 6   lq3     318 non-null    object
 7   lq4     318 non-null    object
 8   lq5     318 non-null    object
dtypes: object(9)
memory usage: 22.5+ KB


In [242]:
# Drop the mq1 and lq1-lq5 response columns, keeping 'prompt', 'hq1' and 'hq2'.
# errors="ignore" makes the drop a no-op for columns that are already gone, so
# executing this cell again does not raise KeyError.
pair_df = pair_df.drop(
    columns=['mq1', 'lq1', 'lq2', 'lq3', 'lq4', 'lq5'],
    errors="ignore",
)

In [243]:
display(pair_df.head())   
display(pair_df.tail())

Unnamed: 0,prompt,hq1,hq2
0,"I know I am too big, and I probably should exe...",You are starting to think it’s time to do some...,You have put a lot of effort into losing weigh...
1,I don’t trust doctors. I don’t trust the CDC....,You feel that your immune systems is strong en...,Putting your trust in others is hard for you. ...
2,"Doctor x, I don’t want the Covid vaccine for m...",You are worried about long term effects from t...,You're concerned that not enough research has ...
3,I know I should probably get the vaccine. I ha...,While you have some concerns about the vaccine...,"You want to protect your mom, but you're worri..."
4,I know because I am so heavy and have diabetes...,Although part of you knows exercise might help...,because of your crazy schedule with the kids a...


Unnamed: 0,prompt,hq1,hq2
313,"I don’t eat any lunchmeat anymore, bread hardl...",You've really tried to change your diet in hop...,You've putting a lot of effort into eating hea...
314,"Well, I read canned vegetables are better than...",You've done research to try to figure out what...,Learning about what you put in your body is im...
315,I can do that when I’m at the store.,You're committed to doing it the next time you...,You're starting to think of how you can put th...
316,"I’d like to eat less processed foods, but I gu...",While you are ready to start eating fewer proc...,You'd really like to learn more about processe...
317,"I like going to the gym, I like exercising; I ...",You know that exericse is good for you and you...,You're already exercising a couple hours a wee...


In [244]:
pair_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   prompt  318 non-null    object
 1   hq1     318 non-null    object
 2   hq2     318 non-null    object
dtypes: object(3)
memory usage: 7.6+ KB


In [245]:
pair_df.describe()

Unnamed: 0,prompt,hq1,hq2
count,318,318,318
unique,318,317,317
top,"I know I am too big, and I probably should exe...",It is difficult for you to take the medication...,"After a busy week, you really need an outlet -..."
freq,1,2,2


### Restructure Pair dataset into two columns

In [246]:
# Go from wide (prompt, hq1, hq2) to long (Context, Response): every prompt
# appears twice, once with its hq1 reply and once with its hq2 reply.
# melt() stacks all hq1 rows first, then all hq2 rows, on a fresh 0..n-1 index.
pair_df = (
    pair_df
    .melt(id_vars='prompt', value_vars=['hq1', 'hq2'], var_name='source_column', value_name='Response')
    .drop(columns='source_column')   # which hq column a reply came from is not needed
    .rename(columns={'prompt': 'Context'})
)


In [247]:
# Check result
display(pair_df.head(5))
display(pair_df.tail(5))

Unnamed: 0,Context,Response
0,"I know I am too big, and I probably should exe...",You are starting to think it’s time to do some...
1,I don’t trust doctors. I don’t trust the CDC....,You feel that your immune systems is strong en...
2,"Doctor x, I don’t want the Covid vaccine for m...",You are worried about long term effects from t...
3,I know I should probably get the vaccine. I ha...,While you have some concerns about the vaccine...
4,I know because I am so heavy and have diabetes...,Although part of you knows exercise might help...


Unnamed: 0,Context,Response
631,"I don’t eat any lunchmeat anymore, bread hardl...",You've putting a lot of effort into eating hea...
632,"Well, I read canned vegetables are better than...",Learning about what you put in your body is im...
633,I can do that when I’m at the store.,You're starting to think of how you can put th...
634,"I’d like to eat less processed foods, but I gu...",You'd really like to learn more about processe...
635,"I like going to the gym, I like exercising; I ...",You're already exercising a couple hours a wee...


In [248]:
pair_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 636 entries, 0 to 635
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   636 non-null    object
 1   Response  636 non-null    object
dtypes: object(2)
memory usage: 10.1+ KB


In [249]:
pair_df.describe()

Unnamed: 0,Context,Response
count,636,636
unique,318,632
top,"I know I am too big, and I probably should exe...","After a busy week, you really need an outlet -..."
freq,2,2


In [250]:
pair_df = pair_df.reset_index(drop=True)
pair_df.sample(10)

Unnamed: 0,Context,Response
136,I saw another text on my boyfriend’s phone fro...,Reading that message on your boyfriend’s phone...
316,"I’d like to eat less processed foods, but I gu...",While you are ready to start eating fewer proc...
168,I heard that you have to eliminate all carbs i...,You are looking for ways to lose weight that w...
503,"Well, this heart thing has really affected wha...",You're missing your favorite foods but underst...
90,"I am so busy getting oriented to college life,...",You’re still learning the ropes as a new colle...
395,This is not the time for me to talk about smok...,You have way too much on your plate right now ...
245,"So, have you ever smoked, or had to quit, how ...",You’re concerned I can’t understand you or won...
11,"Yes, I know all that TV and all those video ga...",You know that you can’t be the kind of employe...
263,Of course I want things to be different but I ...,You want things to get better but you want to ...
207,I’ve thought about it but don’t know if it’s f...,If organ donation was discussed in the bible a...


In [251]:
print(f"Pair Dataset: {pair_df.shape}")


Pair Dataset: (636, 2)


In [252]:
# Recompute the total from the frames themselves. The previous `total += len(pair_df)`
# added pair_df again every time this cell was executed, inflating the count.
total = len(nlp_convos_df) + len(interview_df) + len(counsel_chat_df) + len(pair_df)

print(f"Currently have {total} out of the desired {goal}.\nWe still need {goal - total} more conversations to reach our goal.")

Currently have 10988 out of the desired 17500.
We still need 6512 more conversations to reach our goal.
