# Read in the datasets

In [1]:
import pandas as pd

# NLP Mental Health Conversations
nlp_convos_df = pd.read_csv("../datasets/custom_corpus/NLP_mental_health_convos/nlp_convos.csv")

# Mental Chat 16k -- Interview 6k subset
interview_df = pd.read_csv("../datasets/custom_corpus/mental_chat16k/interview_data_6k.csv")

# counsel_chat (clean version)
counsel_chat_df = pd.read_csv("../datasets/custom_corpus/counsel_chat/counsel_chat-data.csv")

# Report (rows, columns) of each raw frame as a quick sanity check on the load.
for label, frame in (
    ("NLP Mental Health Convos", nlp_convos_df),
    ("Interview 6k", interview_df),
    ("Counsel Chat", counsel_chat_df),
):
    print(f"{label}: {frame.shape}")


NLP Mental Health Convos: (3512, 2)
Interview 6k: (6310, 3)
Counsel Chat: (1482, 9)


## Explore NLP Mental Health Convos

### Intro to NLP Convos

In [2]:
display(nlp_convos_df.head())
display(nlp_convos_df.tail())

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


Unnamed: 0,Context,Response
3507,My grandson's step-mother sends him to school ...,Absolutely not! It is never in a child's best ...
3508,My boyfriend is in recovery from drug addictio...,I'm sorry you have tension between you and you...
3509,The birth mother attempted suicide several tim...,"The true answer is, ""no one can really say wit..."
3510,I think adult life is making him depressed and...,How do you help yourself to believe you requir...
3511,I just took a job that requires me to travel f...,hmm this is a tough one!


In [3]:
nlp_convos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3512 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   3512 non-null   object
 1   Response  3508 non-null   object
dtypes: object(2)
memory usage: 55.0+ KB


In [4]:
nlp_convos_df.describe()

Unnamed: 0,Context,Response
count,3512,3508
unique,995,2471
top,I have so many issues to address. I have a his...,Counseling ends when the client has received t...
freq,94,3


## Cleaning NLP Convos

### Remove rows with missing values

In [5]:
# Keep only rows with a usable 'Response': discard NaN first, then discard
# strings that are empty once surrounding whitespace is stripped.
nlp_convos_df = (
    nlp_convos_df
    .loc[nlp_convos_df["Response"].notna()]
    .loc[lambda frame: frame["Response"].str.strip().ne("")]
)


In [6]:
nlp_convos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3508 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   3508 non-null   object
 1   Response  3508 non-null   object
dtypes: object(2)
memory usage: 82.2+ KB


In [7]:
nlp_convos_df.describe()

Unnamed: 0,Context,Response
count,3508,3508
unique,995,2471
top,I have so many issues to address. I have a his...,Counseling ends when the client has received t...
freq,94,3


### Remove Complete Duplicates (rows where both Context AND Response are repeats)

In [8]:
# Flag every repeat of a (Context, Response) pair after its first occurrence,
# then keep only the unflagged rows.
is_repeat = nlp_convos_df.duplicated(subset=["Context", "Response"])
nlp_convos_df = nlp_convos_df[~is_repeat]


In [9]:
nlp_convos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2740 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   2740 non-null   object
 1   Response  2740 non-null   object
dtypes: object(2)
memory usage: 64.2+ KB


In [10]:
nlp_convos_df.describe()

Unnamed: 0,Context,Response
count,2740,2740
unique,995,2471
top,I have so many issues to address. I have a his...,I'm sorry to hear that your brother has been h...
freq,47,2


## Final NLP Convo set

In [11]:
nlp_convos_df = nlp_convos_df.reset_index(drop=True)

print(f"NLP Mental Health Convos: {nlp_convos_df.shape}")


NLP Mental Health Convos: (2740, 2)


In [12]:
nlp_convos_df.sample(10)

Unnamed: 0,Context,Response
1646,My ex-boyfriend and I met over a year ago. We ...,I'm truly sorry to hear that your relationship...
1595,My boyfriend and I have a toxic relationship d...,I'm glad you recognize your own best interest ...
1464,"I'm always listening to my husband, but it fee...",Thanks for reaching out. This is a great quest...
1166,"My boyfriend won't touch me intimately, he won...",Ask him!Having a conversation about sexual sat...
156,I am a teenager. I have been experiencing majo...,I understand that talking about our feelings c...
2306,It's been like a couple of years that I've bee...,"Hello, and thank you for your question. Bipola..."
1134,My boyfriend has this close female friend. It ...,"Well, as hurtful as witnessing your boyfriend'..."
706,I am divorced and happily remarried. Our blend...,"Hi, I understand what you're going through. Yo..."
1299,I loved him more than anything. He passed away...,I am so sorry to hear of your boyfriend's pass...
1011,"Every time my partner gets angry for anything,...",One thing you could try is to talk to your par...


## Explore Interview 6k

### Intro to Interview 6k

In [13]:
display(interview_df.head())
display(interview_df.tail())

Unnamed: 0,instruction,input,output
0,You are a helpful mental health counselling as...,I've been struggling with my mental health for...,I understand that you've been dealing with a s...
1,You are a helpful mental health counselling as...,I've been feeling overwhelmed with my caregivi...,"Your situation is complex, and it's important ..."
2,You are a helpful mental health counselling as...,I've been feeling constantly anxious and unabl...,I can see that you're dealing with a great dea...
3,You are a helpful mental health counselling as...,"My mom has Alzheimer's, and I've been her prim...",I'm sorry to hear that your siblings' demands ...
4,You are a helpful mental health counselling as...,"I've tried setting boundaries, but it feels li...","Your concerns are valid, and it's crucial to p..."


Unnamed: 0,instruction,input,output
6305,You are a helpful mental health counselling as...,I'm feeling overwhelmed and guilty about my un...,It's understandable that you're feeling a rang...
6306,You are a helpful mental health counselling as...,"I've been feeling overwhelmed lately, and I've...",I understand that you've been feeling overwhel...
6307,You are a helpful mental health counselling as...,"I understand what you're saying, but I can't s...",Your feelings of guilt and neglect are underst...
6308,You are a helpful mental health counselling as...,I've been thinking about my relationship with ...,It's important to acknowledge the emotional la...
6309,You are a helpful mental health counselling as...,I've been feeling stressed out lately because ...,Your words paint a vivid picture of the comple...


In [14]:
interview_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6310 entries, 0 to 6309
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  6310 non-null   object
 1   input        6310 non-null   object
 2   output       6310 non-null   object
dtypes: object(3)
memory usage: 148.0+ KB


In [15]:
interview_df.describe()

Unnamed: 0,instruction,input,output
count,6310,6310,6310
unique,1,6111,6210
top,You are a helpful mental health counselling as...,I've been dealing with a lot of logistical tas...,It's essential to recognize that caring for a ...
freq,6310,16,9


## Cleaning Interview Data

In [16]:
# Remove the "instruction" column. errors="ignore" keeps this cell re-runnable:
# on a second execution the column is already gone and a plain drop would
# raise KeyError.
interview_df = interview_df.drop(columns=['instruction'], errors='ignore')

# Remove exact duplicate prompt/response pairs (the first occurrence is kept)
interview_df = interview_df.drop_duplicates(subset=['input', 'output'])



In [17]:
interview_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6233 entries, 0 to 6309
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   6233 non-null   object
 1   output  6233 non-null   object
dtypes: object(2)
memory usage: 146.1+ KB


In [18]:
interview_df.describe()


Unnamed: 0,input,output
count,6233,6233
unique,6111,6210
top,I've been feeling hopeless and suicidal lately...,Your concern for your mother's wellbeing is a ...
freq,8,6


## Final Interview 6k

In [19]:
interview_df = interview_df.reset_index(drop=True)
print(f"Interview 6k: {interview_df.shape}")


Interview 6k: (6233, 2)


In [20]:
interview_df.sample(10)

Unnamed: 0,input,output
827,I've been struggling with strong emotions and ...,"Your struggle is understandable, and it's impo..."
346,I've been struggling with feelings of guilt an...,"I'm sorry for your loss, and I can see how dee..."
4539,"Thank you for your guidance, counselor. I feel...","You're very welcome, patient. I'm here to help..."
500,I've been feeling overwhelmed with my workload...,Your concerns about the impact of your workloa...
3275,I've been feeling like I'm constantly failing ...,"Your feelings of guilt, inadequacy, and hopele..."
4428,I've always had a complicated relationship wit...,"Your concerns are valid, and it's clear that t..."
4418,I've been feeling lost and alone since my wife...,I can imagine how difficult it is for you to n...
1963,"Yes, it does. I appreciate your perspective, a...","Absolutely, those are excellent ideas. Revisit..."
5677,I've been having trouble keeping track of my a...,I understand that managing your schedule and r...
4202,I've been dealing with feelings of isolation a...,"Your situation is undoubtedly challenging, and..."


## Explore Counsel Chat

### Intro to Counsel Chat

In [21]:
display(counsel_chat_df.head())
display(counsel_chat_df.tail())

Unnamed: 0,questionID,questionTitle,questionText,questionUrl,topics,therapistName,therapistUrl,answerText,upvotes
0,5566fab2a64752d71ec3ca69,Escalating disagreements between mother and wife,My wife and mother are having tense disagreeme...,https://counselchat.com/questions/escalating-d...,Family Conflict,"Kristi King-Morgan, LMSW",https://counselchat.com/therapists/kristi-king...,<p>What you are describing is something psycho...,0
1,5566f94fa64752d71ec3ca64,I'm addicted to smoking. How can I stop?,"I'm planning to have baby, so I have to quit s...",https://counselchat.com/questions/i-m-addicted...,"Substance Abuse,Addiction",Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>Hi. Good for you in planning ahead to do wh...,0
2,5567d26887a1cc0c3f3d8f46,Keeping secrets from my family,"I have secrets in my mind, and I don't know wh...",https://counselchat.com/questions/keeping-secr...,Family Conflict,Jeevna Bajaj,https://counselchat.com/therapists/jeevna-bajaj,<p>It sounds like keeping the secrets has beco...,0
3,556bed15c969ba5861709df5,The Underlying Causes of Being Possessive,I am extremely possessive in my relationships ...,https://counselchat.com/questions/the-underlyi...,"Behavioral Change,Social Relationships",Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>Hi there. It's great you are able to realiz...,0
4,556ba115c969ba5861709de6,Can I control anxiety without medication?,I had a head injury a few years ago and my min...,https://counselchat.com/questions/can-i-contro...,Anxiety,Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>You didn't say what or how many medications...,0


Unnamed: 0,questionID,questionTitle,questionText,questionUrl,topics,therapistName,therapistUrl,answerText,upvotes
1477,56d2f2aa9471b0b41ec68e4d,Is it healthy to embarrass a child as punishment?,My grandson's step-mother sends him to school ...,https://counselchat.com/questions/is-it-health...,"Parenting,Family Conflict",Candice Lawhorn,https://counselchat.com/therapists/candice-law...,<p>Absolutely not!&nbsp;</p><p>It is never in ...,0
1478,5797a411b43cd7825e26e246,How do I fix my relationship?,My boyfriend is in recovery from drug addictio...,https://counselchat.com/questions/how-do-i-fix...,"Relationships,Addiction","Sherry Katz, LCSW",https://counselchat.com/therapists/sherry-katz...,<p>I'm sorry you have tension between you and ...,0
1479,5796a111bc069dff6a5339ca,What are the long term effects of losing one's...,The birth mother attempted suicide several tim...,https://counselchat.com/questions/what-are-the...,"Family Conflict,Parenting,Children & Adolescents","Sherry Katz, LCSW",https://counselchat.com/therapists/sherry-katz...,"<p>The true answer is, ""no one can really say ...",0
1480,5795952cbc069dff6a5339aa,How do I help my 20 year old boyfriend who say...,I think adult life is making him depressed and...,https://counselchat.com/questions/how-do-i-hel...,"Relationships,Depression,Substance Abuse","Sherry Katz, LCSW",https://counselchat.com/therapists/sherry-katz...,<p>How do you help yourself to believe you req...,0
1481,5773e438b9ff751f196e8df0,I'm worried about my new job.,I just took a job that requires me to travel f...,https://counselchat.com/questions/i-m-worried-...,"Anxiety,Career Counseling",Philip Kolba,https://counselchat.com/therapists/philip-kolba,<p>hmm this is a tough one!</p>,0


In [22]:
counsel_chat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482 entries, 0 to 1481
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   questionID     1482 non-null   object
 1   questionTitle  1480 non-null   object
 2   questionText   1383 non-null   object
 3   questionUrl    1482 non-null   object
 4   topics         1472 non-null   object
 5   therapistName  1482 non-null   object
 6   therapistUrl   1482 non-null   object
 7   answerText     1482 non-null   object
 8   upvotes        1482 non-null   int64 
dtypes: int64(1), object(8)
memory usage: 104.3+ KB


In [23]:
counsel_chat_df.describe()

Unnamed: 0,upvotes
count,1482.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


## Cleaning Counsel Chat Data

In [24]:
# Drop every column except questionText and answerText.
# errors="ignore" keeps this cell re-runnable: once the columns are gone a
# plain drop would raise KeyError on a second execution.
counsel_chat_df = counsel_chat_df.drop(
    columns=[
        'questionID',
        'questionTitle',
        'questionUrl',
        'topics',
        'therapistName',
        'therapistUrl',
        'upvotes',
    ],
    errors='ignore',
)


In [25]:
counsel_chat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482 entries, 0 to 1481
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   questionText  1383 non-null   object
 1   answerText    1482 non-null   object
dtypes: object(2)
memory usage: 23.3+ KB


In [26]:
(counsel_chat_df.describe())

Unnamed: 0,questionText,answerText
count,1383,1482
unique,718,1478
top,I've gone to a couple therapy sessions so far ...,<p><br></p>
freq,22,5


In [27]:
display(counsel_chat_df.sample(10))

Unnamed: 0,questionText,answerText
1087,"I was raped a couple months ago, Since then, a...",<p>I am so sorry to hear about what happened t...
216,"Back in high school, my friend and I used to m...",<p>Writing about your high school masturbation...
1396,I feel like I am internally screaming all the ...,<p>It sounds like you are trying to find a lab...
357,I have been diagnosed with posttraumatic stres...,<p>You are right on to recognize that the effe...
1033,I have been married for 11 years. Within the p...,<p>Piggybacking on the other respondent's sugg...
1266,My fiancé and I come from a strong Christian b...,<p>One of the sometimes difficult things about...
1348,How does a person start the counseling process?,<p>Hi! Great question! My suggestion would be ...
419,My husband has had issues with alcohol addicti...,"<p>As exasperated as you feel, and as obvious ..."
209,My husband always works. He does work from hom...,<p>I'm glad you're aware to expect more satisf...
450,I got involved with my best friend who is marr...,"<p>Hi Miami, I feel your sadness; you might ha..."


### Drop Empty Rows

In [28]:
# Drop rows with a missing (NaN) question or answer. Reassigning instead of
# using inplace=True keeps the cell's effect explicit.
counsel_chat_df = counsel_chat_df.dropna(subset=["questionText", "answerText"])

# Drop rows where either column is empty/whitespace-only, or where the answer
# is just the empty-paragraph placeholder "<p><br></p>".
question_stripped = counsel_chat_df["questionText"].str.strip()
answer_stripped = counsel_chat_df["answerText"].str.strip()
counsel_chat_df = counsel_chat_df[
    question_stripped.ne("") & ~answer_stripped.isin(["", "<p><br></p>"])
].copy()  # explicit copy: a later cell assigns to a column of this frame, which
          # would otherwise raise SettingWithCopyWarning on the filtered slice

In [29]:

counsel_chat_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 1379 entries, 0 to 1481
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   questionText  1379 non-null   object
 1   answerText    1379 non-null   object
dtypes: object(2)
memory usage: 32.3+ KB


In [30]:
counsel_chat_df.describe()


Unnamed: 0,questionText,answerText
count,1379,1379
unique,718,1379
top,I've gone to a couple therapy sessions so far ...,<p>What you are describing is something psycho...
freq,22,1


### Clean HTML

In [31]:
import re
import html

def clean_html(text):
    """
    Strip HTML markup from a string and return readable plain text.

    Block-level tags (<p>, <br>, <div>, list, heading, table and blockquote
    tags) are replaced by a space so that adjacent paragraphs do not fuse
    together; all remaining tags are removed outright. HTML entities are then
    decoded, missing spaces after sentence-ending punctuation are restored,
    and every run of whitespace (including the non-breaking spaces produced by
    ``&nbsp;``) is collapsed to a single space.

    Parameters
    ----------
    - text: str or None
        Raw string possibly containing HTML and HTML entities.

    Returns
    -------
    - cleaned_text: str
        Cleaned string with HTML removed and spacing normalised.
        An empty string is returned when the input is missing (None/NaN).
    """
    if pd.isna(text):
        return ""
    # Block-level tags separate paragraphs/lines, so they become a space
    # rather than vanishing (otherwise "</p><p>" glues two sentences together).
    text = re.sub(
        r'</?(?:p|br|div|li|ul|ol|h[1-6]|tr|td|th|table|blockquote)\b[^>]*>',
        ' ',
        text,
        flags=re.IGNORECASE,
    )
    # Remaining (inline) tags are dropped without a separator so that markup
    # inside a word does not split it. [^>]* also spans newlines inside a tag.
    text = re.sub(r'<[^>]*>', '', text)
    # Convert HTML entities to unicode (done after tag removal so that escaped
    # angle brackets such as "&lt;b&gt;" survive as literal text).
    text = html.unescape(text)
    # Fix a missing space between sentences ("word.Something" -> "word. Something").
    text = re.sub(r'([.!?])(?=[A-Z])', r'\1 ', text)
    # Punctuation followed by a digit only counts as a sentence break when a
    # letter precedes it, so decimals such as "3.5" are left intact.
    text = re.sub(r'(?<=[A-Za-z])([.!?])(?=\d)', r'\1 ', text)
    # Collapse whitespace runs; \s also matches the \xa0 left by "&nbsp;".
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Replace each raw answer with its HTML-free version.
cleaned_answers = counsel_chat_df['answerText'].map(clean_html)
counsel_chat_df['answerText'] = cleaned_answers


In [32]:
counsel_chat_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1379 entries, 0 to 1481
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   questionText  1379 non-null   object
 1   answerText    1379 non-null   object
dtypes: object(2)
memory usage: 32.3+ KB


In [33]:
counsel_chat_df.describe()

Unnamed: 0,questionText,answerText
count,1379,1379
unique,718,1379
top,I've gone to a couple therapy sessions so far ...,What you are describing is something psycholog...
freq,22,1


## Final Counsel Chat

In [34]:
# Renumber the surviving rows 0..n-1, discarding the old index labels.
counsel_chat_df = counsel_chat_df.reset_index(drop=True)

print(f"Counsel Chat: {counsel_chat_df.shape}")

Counsel Chat: (1379, 2)


In [35]:
counsel_chat_df.sample(10)

Unnamed: 0,questionText,answerText
836,I believe my partner has a masturbation and po...,The comments here from the other therapists ar...
3,I am extremely possessive in my relationships ...,Hi there. It's great you are able to realize t...
524,"He isn't violent, but he has anger issues and ...",Sometimes relationships just do not work. Don...
166,"I was raped by multiple men, and now I can't s...",I am very sorry to hear about your rapes. Tra...
1305,My friend is abusing her prescription medicine...,Your friend needs to admit they have a problem...
281,My mom made a lot of mistakes a couple years b...,"Hi Cleveland, I think I get what you're feelin..."
680,We have been together over a year. We spend ti...,"Hello, and thank you for your question. I am v..."
594,It's been like a couple of years that I've bee...,"Hello, and thank you for your question. Bipola..."
1188,My fiancé and I come from a strong Christian b...,"This suffering and clinging to the past, espec..."
481,Sometimes I can't stop thinking about life aft...,You need to find an outlet. Someone to talk to...


# Normalize Column Names Across All 3 Mental Health Datasets

In [36]:
# Give every dataset the same two column names, 'Context' and 'Response',
# matching the names nlp_convos_df already uses.
interview_df = interview_df.rename(
    columns={'input': 'Context', 'output': 'Response'}
)
counsel_chat_df = counsel_chat_df.rename(
    columns={'questionText': 'Context', 'answerText': 'Response'}
)


In [37]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only renders a stray "None" after each report.
nlp_convos_df.info()
interview_df.info()
counsel_chat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2740 entries, 0 to 2739
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   2740 non-null   object
 1   Response  2740 non-null   object
dtypes: object(2)
memory usage: 42.9+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6233 entries, 0 to 6232
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   6233 non-null   object
 1   Response  6233 non-null   object
dtypes: object(2)
memory usage: 97.5+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1379 entries, 0 to 1378
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   1379 non-null   object
 1   Response  1379 non-null   object
dtypes: object(2)
memory usage: 21.7+ KB


None

In [38]:
# Count the conversation pairs gathered so far and compare with the desired total.
goal = 17500
total = sum(len(frame) for frame in (nlp_convos_df, interview_df, counsel_chat_df))
print(f"Currently have {total} out of the desired {goal}.\nWe still need {goal - total} more conversations to reach our goal.")

Currently have 10352 out of the desired 17500.
We still need 7148 more conversations to reach our goal.


# Reading in the Pair Dataset

## Exploring pair_df

In [39]:
import pandas as pd

# Load the PAIR dataset from CSV (path is relative to the notebook's directory)
pair_path = '../datasets/custom_corpus/pair_dataset/pair_data.csv'
pair_df = pd.read_csv(pair_path)

# Spot-check five randomly sampled rows (the sample differs on every run)
display(pair_df.sample(5))

Unnamed: 0,prompt,hq1,hq2,mq1,lq1,lq2,lq3,lq4,lq5
80,"I know my son might have FAP, but I can’t take...",You know that the colonoscopy is really import...,You really don’t want to play a role in gettin...,You're not taking your son for a colonoscopy.,You need to know if your son has FAP. You must...,Don't you care if your son has FAP?,Do what is best indicated by an expert in the ...,I think he needs to have the medical attention...,Everything will be fine soon. Consult a doctor...
14,I have tried every diet under the sun. To be h...,You’ve put a lot of effort into trying to lose...,None of the diets you've tried have worked for...,You don't think you can lose weight on your own.,You won't lose weight with that attitude. Look...,Have you tried counting calories and weighing ...,What if we get you a dietitian that way you ca...,"No, you can do it, but this is something you m...","You must first change your mindset, diet is no..."
162,It does waste a lot of money and my friends ke...,You’re starting to see some of the reasons why...,You realize that smoking is pretty expensive a...,You know that smoking is a waste of money.,Saving money and your health while pleasing yo...,Could you find something you want to buy with ...,Wasting money is a point of view for each pers...,Listen to your friends and try to save that mo...,I think quitting would be best for your health...
241,They tell us that we have to be loyal to them ...,You feel like you put everything on the line a...,You're angry that the military isn't living up...,The military is letting you down.,You did lose a lot of time and energy. Maybe y...,Could you move closer to the VA?,Why won't they pay for your education? You sho...,Feeling as is if they failed you and you owe t...,"I i derstand your frustration, maybe we can fi..."
205,If I sign up to be an organ donor and I’m in a...,You're worried that doctors won't do everythin...,You're considering organ donation but you have...,You're unsure about organ donation.,Your doctor is legally obligated to save your ...,Do you know how many people could be saved by ...,Why do you think that? Don't be so pessimistic.,That's absolutely not true! when you are in da...,"That's a lie, because that's something that do..."


In [40]:
pair_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   prompt  318 non-null    object
 1   hq1     318 non-null    object
 2   hq2     318 non-null    object
 3   mq1     318 non-null    object
 4   lq1     318 non-null    object
 5   lq2     318 non-null    object
 6   lq3     318 non-null    object
 7   lq4     318 non-null    object
 8   lq5     318 non-null    object
dtypes: object(9)
memory usage: 22.5+ KB


In [41]:
# Drop the mq*/lq* response columns, leaving prompt, hq1 and hq2.
# errors='ignore' keeps this cell re-runnable: once the columns are gone a
# plain drop would raise KeyError on a second execution.
pair_df = pair_df.drop(
    columns=['mq1', 'lq1', 'lq2', 'lq3', 'lq4', 'lq5'],
    errors='ignore',
)

In [42]:
display(pair_df.head())   
display(pair_df.tail())

Unnamed: 0,prompt,hq1,hq2
0,"I know I am too big, and I probably should exe...",You are starting to think it’s time to do some...,You have put a lot of effort into losing weigh...
1,I don’t trust doctors. I don’t trust the CDC....,You feel that your immune systems is strong en...,Putting your trust in others is hard for you. ...
2,"Doctor x, I don’t want the Covid vaccine for m...",You are worried about long term effects from t...,You're concerned that not enough research has ...
3,I know I should probably get the vaccine. I ha...,While you have some concerns about the vaccine...,"You want to protect your mom, but you're worri..."
4,I know because I am so heavy and have diabetes...,Although part of you knows exercise might help...,because of your crazy schedule with the kids a...


Unnamed: 0,prompt,hq1,hq2
313,"I don’t eat any lunchmeat anymore, bread hardl...",You've really tried to change your diet in hop...,You've putting a lot of effort into eating hea...
314,"Well, I read canned vegetables are better than...",You've done research to try to figure out what...,Learning about what you put in your body is im...
315,I can do that when I’m at the store.,You're committed to doing it the next time you...,You're starting to think of how you can put th...
316,"I’d like to eat less processed foods, but I gu...",While you are ready to start eating fewer proc...,You'd really like to learn more about processe...
317,"I like going to the gym, I like exercising; I ...",You know that exericse is good for you and you...,You're already exercising a couple hours a wee...


In [43]:
pair_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   prompt  318 non-null    object
 1   hq1     318 non-null    object
 2   hq2     318 non-null    object
dtypes: object(3)
memory usage: 7.6+ KB


In [44]:
pair_df.describe()

Unnamed: 0,prompt,hq1,hq2
count,318,318,318
unique,318,317,317
top,"I know I am too big, and I probably should exe...",It is difficult for you to take the medication...,"After a busy week, you really need an outlet -..."
freq,1,2,2


### Restructure Pair dataset into two columns

In [45]:
# Reshape from wide (one row per prompt, with hq1 and hq2 columns) to long
# (one row per prompt/response pair). melt stacks all hq1 rows first, then all
# hq2 rows, and returns a fresh 0..n-1 index; the helper 'variable' column that
# records the source column name is not needed afterwards.
pair_df = (
    pair_df
    .melt(id_vars='prompt', value_vars=['hq1', 'hq2'], value_name='Response')
    .drop(columns='variable')
    .rename(columns={'prompt': 'Context'})
)


In [46]:
# Check result
display(pair_df.head(5))
display(pair_df.tail(5))

Unnamed: 0,Context,Response
0,"I know I am too big, and I probably should exe...",You are starting to think it’s time to do some...
1,I don’t trust doctors. I don’t trust the CDC....,You feel that your immune systems is strong en...
2,"Doctor x, I don’t want the Covid vaccine for m...",You are worried about long term effects from t...
3,I know I should probably get the vaccine. I ha...,While you have some concerns about the vaccine...
4,I know because I am so heavy and have diabetes...,Although part of you knows exercise might help...


Unnamed: 0,Context,Response
631,"I don’t eat any lunchmeat anymore, bread hardl...",You've putting a lot of effort into eating hea...
632,"Well, I read canned vegetables are better than...",Learning about what you put in your body is im...
633,I can do that when I’m at the store.,You're starting to think of how you can put th...
634,"I’d like to eat less processed foods, but I gu...",You'd really like to learn more about processe...
635,"I like going to the gym, I like exercising; I ...",You're already exercising a couple hours a wee...


In [47]:
pair_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 636 entries, 0 to 635
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   636 non-null    object
 1   Response  636 non-null    object
dtypes: object(2)
memory usage: 10.1+ KB


In [48]:
pair_df.describe()

Unnamed: 0,Context,Response
count,636,636
unique,318,632
top,"I know I am too big, and I probably should exe...","After a busy week, you really need an outlet -..."
freq,2,2


In [49]:
pair_df = pair_df.reset_index(drop=True)
pair_df.sample(10)

Unnamed: 0,Context,Response
328,I know I need to eat better. But I live alon...,You struggle finding meals for one. You'd like...
596,"I know I should quit, I know it’s bad for my b...","In order to quit, it sounds like you'd need to..."
48,I know that 2 cigarettes per day are not a pro...,Limiting the number of cigarettes you have eac...
445,I simply forget. It’s not that I don’t want to...,If you could find a way to remember to take yo...
331,I need to quit these damn cigarettes already. ...,You know that cigarettes aren't great for you ...
261,I know weed can turn you into a zombie and it ...,While you do think smoking helps you when you'...
555,"Well, I went once, I had to wait for over an h...","In the future, you might have a better experie..."
369,"We didn’t plan for this pregnancy, it was all ...",Since you feel the same now as you did before ...
458,You have to carry a knife or some protection w...,You feel safer carrying protection on you. If ...
5,"Of course, I would like to lose weight and not...",You have tried a lot of things and put a lot o...


In [50]:
print(f"Pair Dataset: {pair_df.shape}")


Pair Dataset: (636, 2)


In [51]:
# Recompute the running total from the four cleaned frames instead of
# incrementing it, so re-running this cell cannot count pair_df twice.
total = len(nlp_convos_df) + len(interview_df) + len(counsel_chat_df) + len(pair_df)

print(f"Currently have {total} out of the desired {goal}.\nWe still need {goal - total} more conversations to reach our goal.")

Currently have 10988 out of the desired 17500.
We still need 6512 more conversations to reach our goal.


# Reading in the ZahrizhalAli Dataset

In [4]:
import pandas as pd

# Load the ZahrizhalAli chatbot dataset from its Parquet file (path is relative to the notebook)
zahrizhalAli = pd.read_parquet('../datasets/custom_corpus/chatbot_sets/zahrizhalAli/zahrizhal_ali.parquet')

# Display the first five rows of the DataFrame
display(zahrizhalAli.head())


Unnamed: 0,text
0,<HUMAN>: What is a panic attack?\n<ASSISTANT>:...
1,<HUMAN>: What are symptoms of panic attack vs....
2,<HUMAN>: What are the types of Mental Illness?...
3,<HUMAN>: What does mental-illness mean?\n<ASSI...
4,<HUMAN>: How can you treat mental illness?\n<A...
