# Merging Different datasets into one

In [139]:
import pandas as pd

## Dataset 1

In [140]:
data_1 = pd.read_json(
    "hf://datasets/Amod/mental_health_counseling_conversations/combined_dataset.json",
    lines=True,
)

In [141]:
print(data_1.shape)
print(data_1.head())

(3512, 2)
                                             Context  \
0  I'm going through some things with my feelings...   
1  I'm going through some things with my feelings...   
2  I'm going through some things with my feelings...   
3  I'm going through some things with my feelings...   
4  I'm going through some things with my feelings...   

                                            Response  
0  If everyone thinks you're worthless, then mayb...  
1  Hello, and thank you for your question and see...  
2  First thing I'd suggest is getting the sleep y...  
3  Therapy is essential for those that are feelin...  
4  I first want to let you know that you are not ...  


In [142]:
# Select first 2500 rows
sampled_data = data_1.iloc[:3000]

# Rename columns for consistency
sampled_data = sampled_data.rename(
    columns={"Context": "context", "Response": "response"}
)

# Save to CSV
sampled_data.to_csv("dataset.csv", index=False)

## Dataset 2

In [143]:
data_2 = pd.read_json("hf://datasets/marmikpandya/mental-health/data.jsonl", lines=True)

In [144]:
print(data_2.shape)
print(data_2)


(1, 13358)
                                               0      \
0  {'instruction': 'If you are a licensed psychol...   

                                               1      \
0  {'instruction': 'If you are a licensed psychol...   

                                               2      \
0  {'instruction': 'If you are a licensed psychol...   

                                               3      \
0  {'instruction': 'If you are a licensed psychol...   

                                               4      \
0  {'instruction': 'If you are a licensed psychol...   

                                               5      \
0  {'instruction': 'If you are a licensed psychol...   

                                               6      \
0  {'instruction': 'If you are a licensed psychol...   

                                               7      \
0  {'instruction': 'If you are a licensed psychol...   

                                               8      \
0  {'instruction': 'If you ar

In [145]:
# Transpose the DataFrame to get proper rows
data_2 = data_2.T.reset_index(drop=True)  # Fix structure

# Debugging: Print new structure
print(data_2.shape)  # Should now be (13358, 1) with properly structured rows
print(data_2.head())  # Check if each row has {'instruction', 'input', 'output'}

# Extract relevant fields and limit to 2500 rows
processed_data_2 = pd.DataFrame(
    {
        "context": data_2[0].apply(lambda x: f"{x['instruction']} {x['input']}"),
        "response": data_2[0].apply(lambda x: x["output"]),
    }
).iloc[
    :3000
]  # Take first 2500 rows

# Debugging: Check extracted data
print(processed_data_2.head())

# Load the existing dataset
dataset = pd.read_csv("dataset.csv")

# Append new data
dataset = pd.concat([dataset, processed_data_2], ignore_index=True)

# Save the updated dataset
dataset.to_csv("dataset.csv", index=False)

(13358, 1)
                                                   0
0  {'instruction': 'If you are a licensed psychol...
1  {'instruction': 'If you are a licensed psychol...
2  {'instruction': 'If you are a licensed psychol...
3  {'instruction': 'If you are a licensed psychol...
4  {'instruction': 'If you are a licensed psychol...
                                             context  \
0  If you are a licensed psychologist, please pro...   
1  If you are a licensed psychologist, please pro...   
2  If you are a licensed psychologist, please pro...   
3  If you are a licensed psychologist, please pro...   
4  If you are a licensed psychologist, please pro...   

                                            response  
0  It's common to feel anxious at times, and ther...  
1  It's understandable to feel worried and suspic...  
2  It sounds like you're going through a difficul...  
3  It's important to talk to your doctor about an...  
4  It's common to feel anxious without knowing th...  


## Dataset 3

In [146]:
splits = {
    "train": "data/train-00000-of-00001.parquet",
    "test": "data/test-00000-of-00001.parquet",
}
data_3 = pd.read_parquet(
    "hf://datasets/ramachaitanya22/mental_health_and_fitness_data/" + splits["train"]
)

In [147]:
print(data_3.shape)
print(data_3.head())

(3552, 2)
                                               Human  \
0  can you provide tips for maintaining fitness w...   
1  I start counseling/therapy in a few days (I'm ...   
2  How do you know you have the right therapist f...   
3  Every time I send a message to someone or a gr...   
4  I have so many issues to address. I have a his...   

                                           Assistant  
0  fitness on the go pack resistance bands explor...  
1  Lots of people do cry in session, but your the...  
2  This is a really important question, because y...  
3  Sorry to hear your friends aren't responding t...  
4  You do not have too many issues to address in ...  


In [148]:
# Select first 3000 rows and rename columns
processed_data_3 = data_3.iloc[:3000].rename(
    columns={"Human": "context", "Assistant": "response"}
)

# Load the existing dataset
dataset = pd.read_csv("dataset.csv")

# Append new data
dataset = pd.concat([dataset, processed_data_3], ignore_index=True)

# Save the updated dataset
dataset.to_csv("dataset.csv", index=False)

In [149]:
combined_data = pd.read_csv("dataset.csv")
combined_data.shape

(9000, 2)

In [150]:
combined_data.shape

(9000, 2)