# Read in the datasets

In [None]:
import pandas as pd

# Load NLP Mental Health Conversations
nlp_convos_df = pd.read_csv(
    "../datasets/custom_corpus/chatbot_sets/mental_health_sets/NLP_mental_health_convos/nlp_convos.csv"
)

# Load Mental Chat 16k (Interview 6k subset)
interview_df = pd.read_csv(
    "../datasets/custom_corpus/chatbot_sets/mental_health_sets/mental_chat16k/interview_data_6k.csv"
)

# Load counsel_chat (clean version)
counsel_chat_df = pd.read_csv(
    "../datasets/custom_corpus/chatbot_sets/mental_health_sets/counsel_chat/counsel_chat-data.csv"
)

nlp_convos_df = pd.read_csv('nlp_convos.csv').rename(columns={'input': 'Context', 'response': 'Response'})
interview_df = pd.read_csv('interview_data_6k.csv').rename(columns={'prompt': 'Context', 'completion': 'Response'})
counsel_chat_df = counsel_chat_df.rename(columns={'questionText': 'Context', 'answerText': 'Response'})


print(f"NLP Mental Health Convos: {nlp_convos_df.shape}")
print(f"Interview 6k: {interview_df.shape}")
print(f"Counsel Chat: {counsel_chat_df.shape}")


NLP Mental Health Convos: (3512, 2)
Interview 6k: (6310, 3)
Counsel Chat: (1482, 9)


# Explore NLP Mental Health Convos

## Intro to NLP Convos

In [2]:
display(nlp_convos_df.head())
display(nlp_convos_df.tail())

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


Unnamed: 0,Context,Response
3507,My grandson's step-mother sends him to school ...,Absolutely not! It is never in a child's best ...
3508,My boyfriend is in recovery from drug addictio...,I'm sorry you have tension between you and you...
3509,The birth mother attempted suicide several tim...,"The true answer is, ""no one can really say wit..."
3510,I think adult life is making him depressed and...,How do you help yourself to believe you requir...
3511,I just took a job that requires me to travel f...,hmm this is a tough one!


In [3]:
nlp_convos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3512 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   3512 non-null   object
 1   Response  3508 non-null   object
dtypes: object(2)
memory usage: 55.0+ KB


In [4]:
nlp_convos_df.describe()

Unnamed: 0,Context,Response
count,3512,3508
unique,995,2471
top,I have so many issues to address. I have a his...,Counseling ends when the client has received t...
freq,94,3


## Cleaning NLP Convos

### Remove rows with missing values

In [5]:
# Remove rows where 'Response' is null or empty
nlp_convos_df = nlp_convos_df.dropna(subset=["Response"])
nlp_convos_df = nlp_convos_df[nlp_convos_df["Response"].str.strip().astype(bool)]


In [6]:
nlp_convos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3508 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   3508 non-null   object
 1   Response  3508 non-null   object
dtypes: object(2)
memory usage: 82.2+ KB


In [7]:
nlp_convos_df.describe()

Unnamed: 0,Context,Response
count,3508,3508
unique,995,2471
top,I have so many issues to address. I have a his...,Counseling ends when the client has received t...
freq,94,3


### Remove Complete Duplicates (rows where both Context AND Response are repeats)

In [8]:
# Remove exact duplicate prompt/response pairs
nlp_convos_df = nlp_convos_df.drop_duplicates(subset=["Context", "Response"])


In [9]:
nlp_convos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2740 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   2740 non-null   object
 1   Response  2740 non-null   object
dtypes: object(2)
memory usage: 64.2+ KB


In [10]:
nlp_convos_df.describe()

Unnamed: 0,Context,Response
count,2740,2740
unique,995,2471
top,I have so many issues to address. I have a his...,I'm sorry to hear that your brother has been h...
freq,47,2


# Final NLP Convo set

In [11]:
nlp_convos_df = nlp_convos_df.reset_index(drop=True)

print(f"NLP Mental Health Convos: {nlp_convos_df.shape}")


NLP Mental Health Convos: (2740, 2)


In [12]:
nlp_convos_df.sample(10)

Unnamed: 0,Context,Response
2204,"They don't go away, and I feel like I'm going ...",This can certainly be a side effect of some me...
505,I've been dealing with this for years. My mom ...,How old are you?Are you old enough in your Sta...
2438,I have a mother who is still running my life e...,ABSOLUTELY. Having someone who's outside the ...
1893,How do you know you have the right therapist f...,"How do you know anything is ""right"" for you? I..."
1744,Often times I find myself thinking scary thoug...,"Hi there,I first want to let you know that hav..."
933,"Maybe this is a stupid question, but I sometim...",Thanks for posting. This is a significant iss...
2615,I have a friend that who I used to be in a rel...,It's completely understandable that you would ...
2539,I have been married for 11 years. Within the p...,Absolutely. Your question shows that you are g...
2117,He said he would try and he never did. It's be...,Wow that is tough. There is nothing worse than...
1978,Does counseling really do anything that can he...,Counseling can go a long way in improving your...


# Explore Interview 6K

## Intro to Interview 6k

In [13]:
display(interview_df.head())
display(interview_df.tail())

Unnamed: 0,instruction,input,output
0,You are a helpful mental health counselling as...,I've been struggling with my mental health for...,I understand that you've been dealing with a s...
1,You are a helpful mental health counselling as...,I've been feeling overwhelmed with my caregivi...,"Your situation is complex, and it's important ..."
2,You are a helpful mental health counselling as...,I've been feeling constantly anxious and unabl...,I can see that you're dealing with a great dea...
3,You are a helpful mental health counselling as...,"My mom has Alzheimer's, and I've been her prim...",I'm sorry to hear that your siblings' demands ...
4,You are a helpful mental health counselling as...,"I've tried setting boundaries, but it feels li...","Your concerns are valid, and it's crucial to p..."


Unnamed: 0,instruction,input,output
6305,You are a helpful mental health counselling as...,I'm feeling overwhelmed and guilty about my un...,It's understandable that you're feeling a rang...
6306,You are a helpful mental health counselling as...,"I've been feeling overwhelmed lately, and I've...",I understand that you've been feeling overwhel...
6307,You are a helpful mental health counselling as...,"I understand what you're saying, but I can't s...",Your feelings of guilt and neglect are underst...
6308,You are a helpful mental health counselling as...,I've been thinking about my relationship with ...,It's important to acknowledge the emotional la...
6309,You are a helpful mental health counselling as...,I've been feeling stressed out lately because ...,Your words paint a vivid picture of the comple...


In [14]:
interview_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6310 entries, 0 to 6309
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  6310 non-null   object
 1   input        6310 non-null   object
 2   output       6310 non-null   object
dtypes: object(3)
memory usage: 148.0+ KB


In [15]:
interview_df.describe()

Unnamed: 0,instruction,input,output
count,6310,6310,6310
unique,1,6111,6210
top,You are a helpful mental health counselling as...,I've been dealing with a lot of logistical tas...,It's essential to recognize that caring for a ...
freq,6310,16,9


## Cleaning Interview Data

In [16]:
# Remove "Instruction" column
interview_df = interview_df.drop(columns=['instruction'])

# Remove exact duplicate prompt/response pairs
interview_df = interview_df.drop_duplicates(subset=['input', 'output'])



In [17]:
interview_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6233 entries, 0 to 6309
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   6233 non-null   object
 1   output  6233 non-null   object
dtypes: object(2)
memory usage: 146.1+ KB


In [18]:
interview_df.describe()


Unnamed: 0,input,output
count,6233,6233
unique,6111,6210
top,I've been feeling hopeless and suicidal lately...,Your concern for your mother's wellbeing is a ...
freq,8,6


# Final Interview 6k

In [19]:
interview_df = interview_df.reset_index(drop=True)
print(f"Interview 6k: {interview_df.shape}")


Interview 6k: (6233, 2)


In [20]:
interview_df.sample(10)

Unnamed: 0,input,output
2746,I've been dealing with the loss of my mother f...,Grief is a complex and deeply personal process...
1655,I've been struggling with feelings of guilt an...,"I'm sorry for your loss, and I can see how the..."
1913,"Yes, I've thought about that. But what's been ...",It's important to acknowledge the feelings of ...
3650,I've always prided myself on my mother's consi...,Your mother's communication style has been a s...
1863,"I appreciate your guidance, and I'm excited to...",Your proactive approach to addressing the need...
4294,I've recently been transferred to a new Hospic...,I can imagine how unsettling it is to receive ...
1896,I've been struggling to find common ground wit...,"Your situation is complex, and it's crucial to..."
5850,I've been struggling with this issue for quite...,I can see how this issue is causing you a grea...
1074,"I've hired six helpers a day, but their presen...","Your house is your refuge, a place where you s..."
2593,"I'm worried about my wife, [Name]. She's not e...","Your concern for your wife, [Name], and her ea..."


# Explore Counsel Chat

## Intro to Counsel Chat

In [21]:
display(counsel_chat_df.head())
display(counsel_chat_df.tail())

Unnamed: 0,questionID,questionTitle,questionText,questionUrl,topics,therapistName,therapistUrl,answerText,upvotes
0,5566fab2a64752d71ec3ca69,Escalating disagreements between mother and wife,My wife and mother are having tense disagreeme...,https://counselchat.com/questions/escalating-d...,Family Conflict,"Kristi King-Morgan, LMSW",https://counselchat.com/therapists/kristi-king...,<p>What you are describing is something psycho...,0
1,5566f94fa64752d71ec3ca64,I'm addicted to smoking. How can I stop?,"I'm planning to have baby, so I have to quit s...",https://counselchat.com/questions/i-m-addicted...,"Substance Abuse,Addiction",Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>Hi. Good for you in planning ahead to do wh...,0
2,5567d26887a1cc0c3f3d8f46,Keeping secrets from my family,"I have secrets in my mind, and I don't know wh...",https://counselchat.com/questions/keeping-secr...,Family Conflict,Jeevna Bajaj,https://counselchat.com/therapists/jeevna-bajaj,<p>It sounds like keeping the secrets has beco...,0
3,556bed15c969ba5861709df5,The Underlying Causes of Being Possessive,I am extremely possessive in my relationships ...,https://counselchat.com/questions/the-underlyi...,"Behavioral Change,Social Relationships",Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>Hi there. It's great you are able to realiz...,0
4,556ba115c969ba5861709de6,Can I control anxiety without medication?,I had a head injury a few years ago and my min...,https://counselchat.com/questions/can-i-contro...,Anxiety,Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>You didn't say what or how many medications...,0


Unnamed: 0,questionID,questionTitle,questionText,questionUrl,topics,therapistName,therapistUrl,answerText,upvotes
1477,56d2f2aa9471b0b41ec68e4d,Is it healthy to embarrass a child as punishment?,My grandson's step-mother sends him to school ...,https://counselchat.com/questions/is-it-health...,"Parenting,Family Conflict",Candice Lawhorn,https://counselchat.com/therapists/candice-law...,<p>Absolutely not!&nbsp;</p><p>It is never in ...,0
1478,5797a411b43cd7825e26e246,How do I fix my relationship?,My boyfriend is in recovery from drug addictio...,https://counselchat.com/questions/how-do-i-fix...,"Relationships,Addiction","Sherry Katz, LCSW",https://counselchat.com/therapists/sherry-katz...,<p>I'm sorry you have tension between you and ...,0
1479,5796a111bc069dff6a5339ca,What are the long term effects of losing one's...,The birth mother attempted suicide several tim...,https://counselchat.com/questions/what-are-the...,"Family Conflict,Parenting,Children & Adolescents","Sherry Katz, LCSW",https://counselchat.com/therapists/sherry-katz...,"<p>The true answer is, ""no one can really say ...",0
1480,5795952cbc069dff6a5339aa,How do I help my 20 year old boyfriend who say...,I think adult life is making him depressed and...,https://counselchat.com/questions/how-do-i-hel...,"Relationships,Depression,Substance Abuse","Sherry Katz, LCSW",https://counselchat.com/therapists/sherry-katz...,<p>How do you help yourself to believe you req...,0
1481,5773e438b9ff751f196e8df0,I'm worried about my new job.,I just took a job that requires me to travel f...,https://counselchat.com/questions/i-m-worried-...,"Anxiety,Career Counseling",Philip Kolba,https://counselchat.com/therapists/philip-kolba,<p>hmm this is a tough one!</p>,0


In [22]:
counsel_chat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482 entries, 0 to 1481
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   questionID     1482 non-null   object
 1   questionTitle  1480 non-null   object
 2   questionText   1383 non-null   object
 3   questionUrl    1482 non-null   object
 4   topics         1472 non-null   object
 5   therapistName  1482 non-null   object
 6   therapistUrl   1482 non-null   object
 7   answerText     1482 non-null   object
 8   upvotes        1482 non-null   int64 
dtypes: int64(1), object(8)
memory usage: 104.3+ KB


In [23]:
counsel_chat_df.describe()

Unnamed: 0,upvotes
count,1482.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


## Cleaning Counsel Chat Data

In [24]:
counsel_chat_df = counsel_chat_df.drop(columns=['questionID', 'questionTitle', 'questionUrl', 'topics', 'therapistName', 'therapistUrl', 'upvotes'])


In [25]:
counsel_chat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482 entries, 0 to 1481
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   questionText  1383 non-null   object
 1   answerText    1482 non-null   object
dtypes: object(2)
memory usage: 23.3+ KB


In [26]:
(counsel_chat_df.describe())

Unnamed: 0,questionText,answerText
count,1383,1482
unique,718,1478
top,I've gone to a couple therapy sessions so far ...,<p><br></p>
freq,22,5


In [27]:
display(counsel_chat_df.sample(10))

Unnamed: 0,questionText,answerText
1277,I've gone to a couple therapy sessions so far ...,<p>If you have only been to a couple of sessio...
903,I'm a teenager. My dad has been jail for the l...,<p>It sounds like you have a lot of weight on ...
1064,"At a friend's house, we had some drinks and wa...",<p>I am so sorry that this happened to you and...
527,We’ve been together almost three years. We arg...,<p>Try having a conversation with your boyfrie...
10,Cheating is something unacceptable for me but ...,"<p>First of all, my heart goes out to you. Inf..."
231,"When I see something I don’t like, I go off li...",<p>Sometimes we react to situations immediatel...
49,Or how to send him somewhere that can help him...,<p>More information would be needed for me to ...
532,I found out my boyfriend takes anti-depression...,<p>Give him the time and space he needs.&nbsp;...
276,I like getting attention from men. I don't hav...,"<p>Hi Michigan, Good for you for recognising a..."
1391,I need help knowing how to deal with stress. W...,<p>Something different works for each of us.</...


### Drop Empty Rows

In [28]:
# Drop rows with missing (NaN) question or answer, in place
counsel_chat_df.dropna(subset=["questionText", "answerText"], inplace=True)

# Remove rows where either column is empty, whitespace, or answer is just <p><br></p>
counsel_chat_df = counsel_chat_df[
    (counsel_chat_df["questionText"].str.strip() != "") &
    (counsel_chat_df["answerText"].str.strip() != "") &
    (counsel_chat_df["answerText"].str.strip() != "<p><br></p>")
]

In [29]:

counsel_chat_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 1379 entries, 0 to 1481
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   questionText  1379 non-null   object
 1   answerText    1379 non-null   object
dtypes: object(2)
memory usage: 32.3+ KB


In [30]:
counsel_chat_df.describe()


Unnamed: 0,questionText,answerText
count,1379,1379
unique,718,1379
top,I've gone to a couple therapy sessions so far ...,<p>What you are describing is something psycho...
freq,22,1


### Clean HTML

In [31]:
import re
import html

def clean_html(text):
    """
    Cleans HTML tags and entities from the input, then fixes spacing between sentences.

    Parameters
    ----------
    - text: str or None
        Raw string possibly containing HTML and HTML entities.

    Returns
    -------
    - cleaned_text: str
        Cleaned string, with HTML removed and sentence spacing fixed.
    """
    if pd.isna(text):
        return ""
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Convert HTML entities to unicode
    text = html.unescape(text)
    # Fix missing space between sentences (e.g., "word.Something" → "word. Something")
    text = re.sub(r'([.!?])([A-Z0-9])', r'\1 \2', text)
    # Strip leading/trailing whitespace
    return text.strip()

# Clean answerText column in-place
counsel_chat_df['answerText'] = counsel_chat_df['answerText'].apply(clean_html)


In [32]:
counsel_chat_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1379 entries, 0 to 1481
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   questionText  1379 non-null   object
 1   answerText    1379 non-null   object
dtypes: object(2)
memory usage: 32.3+ KB


In [33]:
counsel_chat_df.describe()

Unnamed: 0,questionText,answerText
count,1379,1379
unique,718,1379
top,I've gone to a couple therapy sessions so far ...,What you are describing is something psycholog...
freq,22,1


# Final Counsel Chat

In [34]:
# Reset index in place (overwrite the DataFrame with the re-indexed one)
counsel_chat_df.reset_index(drop=True, inplace=True)

print(f"Counsel Chat: {counsel_chat_df.shape}")

Counsel Chat: (1379, 2)


In [38]:
counsel_chat_df.sample(10)

Unnamed: 0,questionText,answerText
334,My boyfriend recently got a kitty. I hate cats...,Sorry for you and sorry for the cat because yo...
8,What do you do when a therapist and a parent d...,I will admit I am confused about this question...
713,Nothing we've tried has worked so far.,Sexual attraction is often discussed on a cont...
885,I found the guy I’m dating on dating websites....,I would suggest possibly talking about the typ...
549,i was addicted to porn since the second grade....,The person who can answer this question is the...
225,I'm a male in my 20s. My girlfriend is in her ...,It sounds like there are assumptions being mad...
864,I want a secure relationship with someone that...,Here are some things I'm wondering:Do you have...
223,I'm applying to private high schools. I'm play...,The situation in your family seems to place un...
1281,Does counseling really do anything that can he...,"Yes, counseling can help people. How this happ..."
1247,How does a person start the counseling process?,Hi! Great question! My suggestion would be to ...


# Normalize Column Names Across All 3 Mental Health Datasets

In [39]:
# Normalize column names across all datasets for consistency
interview_df.rename(columns={'input': 'Context', 'output': 'Response'}, inplace=True)
counsel_chat_df.rename(columns={'questionText': 'Context', 'answerText': 'Response'}, inplace=True)


In [40]:
display(nlp_convos_df.info())
display(interview_df.info())
display(counsel_chat_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2740 entries, 0 to 2739
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   2740 non-null   object
 1   Response  2740 non-null   object
dtypes: object(2)
memory usage: 42.9+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6233 entries, 0 to 6232
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   6233 non-null   object
 1   Response  6233 non-null   object
dtypes: object(2)
memory usage: 97.5+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1379 entries, 0 to 1378
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   1379 non-null   object
 1   Response  1379 non-null   object
dtypes: object(2)
memory usage: 21.7+ KB


None