In [None]:
import pandas as pd
import numpy as np


np.random.seed(42)


n_samples = 100                                  
genders = ['male', 'female', 'non-binary']
occupations = ['employed', 'unemployed', 'student', 'retired']
diagnoses = ['depression', 'anxiety', 'bipolar disorder', 'schizophrenia', 'PTSD', 'none']
medications = ['SSRI', 'Benzo', 'Mood stabilizer', 'None']


ages = np.random.randint(18, 65, size=n_samples)

education_levels = ['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate']
income_levels = ['No income', 'Low', 'Medium', 'High']


education = np.where(ages < 20, np.random.choice(education_levels, n_samples, p=[0.7, 0.1, 0.2, 0., 0.]),
                     np.random.choice(education_levels, n_samples, p=[0.0, 0.0, 0.5, 0.3, 0.2]))


income = np.where(ages < 18, np.random.choice(income_levels, n_samples, p=[0.6, 0.3, 0.1, 0.0]),
                  np.random.choice(income_levels, n_samples, p=[0.1, 0.2, 0.5, 0.2]))


stress_levels = np.zeros(n_samples)
for i in range(n_samples):
    if education[i] == 'High School':
        stress_levels[i] += np.random.normal(7, 1) 
    elif education[i] == 'Associate':
        stress_levels[i] += np.random.normal(6, 1)
    elif education[i] == 'Bachelor':
        stress_levels[i] += np.random.normal(5, 1)
    elif education[i] == 'Master':
        stress_levels[i] += np.random.normal(4, 1)
    elif education[i] == 'Doctorate':
        stress_levels[i] += np.random.normal(3, 1)  

 
    if income[i] == 'No income':
        stress_levels[i] += 5  
    if income[i] == 'No income':
        stress_levels[i] += 3
    elif income[i] == 'Medium':
        stress_levels[i] += 1 
    elif income[i] == 'High':
        stress_levels[i] -= 2 


    if ages[i] < 25:
        stress_levels[i] += 1 
    elif ages[i] > 50:
        stress_levels[i] -= 1 


stress_levels = np.clip(stress_levels, 1, 10)  


anxiety_levels = np.clip(stress_levels + np.random.normal(0, 1, n_samples), 1, 10)  


genders_sampled = np.random.choice(genders, n_samples, p=[0.5, 0.4, 0.1])
marital_status_sampled = np.where(ages < 30, 
                                  np.random.choice(['single', 'married'], n_samples), 
                                  np.random.choice(['married', 'divorced'], n_samples))


mood_rating = np.random.randint(1, 11, size=n_samples)
diagnosis_sampled = np.random.choice(diagnoses, n_samples)
for i in range(n_samples):
    if diagnosis_sampled[i] in ['depression', 'bipolar disorder']:
        mood_rating[i] = np.random.randint(1, 5)  
    elif diagnosis_sampled[i] == 'anxiety':
        mood_rating[i] = np.random.randint(3, 7) 
    else:
        mood_rating[i] = np.random.randint(5, 11) 


substance_use = np.where(anxiety_levels < 4, 'none',
                         np.where(anxiety_levels < 7, 'occasional', 'frequent'))
coping_mechanisms = np.where(stress_levels < 4, 'exercise',
                             np.where(stress_levels < 7, 'journaling', 'therapy'))
goal_achievement = np.where((anxiety_levels < 5) & (stress_levels < 5), 'yes', 'no')


sleep_hours = np.zeros(n_samples)
for i in range(n_samples):
    if substance_use[i] == 'none':
        sleep_hours[i] = np.clip(np.random.normal(loc=8, scale=1, size=1), 4, 10)
    elif substance_use[i] == 'occasional':
        sleep_hours[i] = np.clip(np.random.normal(loc=6, scale=1, size=1), 3, 9)
    else:  
        sleep_hours[i] = np.clip(np.random.normal(loc=5, scale=1, size=1), 2, 8)


occupations_sampled = []
for i in range(n_samples):
    if education[i] == 'High School':
        occupations_sampled.append(np.random.choice(['employed', 'unemployed'], p=[0.2, 0.8]))
    elif education[i] == 'Associate':
        occupations_sampled.append(np.random.choice(['employed', 'unemployed'], p=[0.3, 0.7]))
    elif education[i] == 'Bachelor':
        occupations_sampled.append(np.random.choice(['employed', 'unemployed'], p=[0.4, 0.6]))
    elif education[i] == 'Master':
        if income[i] == 'No income':
            occupations_sampled.append('unemployed')
        else:
            occupations_sampled.append(np.random.choice(['employed', 'unemployed'], p=[0.7, 0.3]))
    elif education[i] == 'Doctorate':
        if income[i] == 'No income':
            occupations_sampled.append('unemployed')
        else:
            occupations_sampled.append(np.random.choice(['unemployed', 'employed', 'retired'], p=[0.2, 0.5, 0.3]))


data = {
    'patient_id': range(1, n_samples + 1),
    'age': ages,
    'gender': genders_sampled,
    'occupation': occupations_sampled,
    'education_level': education,
    'marital_status': marital_status_sampled,
    'income_level': income,
    'diagnosis': diagnosis_sampled,
    'medications': np.random.choice(medications, n_samples),
    'mood_rating': mood_rating,
    'sleep_hours': sleep_hours,
    'substance_use': substance_use,
    'social_interactions': np.random.randint(0, 10, size=n_samples),
    'coping_mechanisms': coping_mechanisms,
    'anxiety_levels': anxiety_levels,
    'stress_levels': stress_levels,
    'goal_achievement': goal_achievement,
    'satisfaction_with_treatment': np.clip(np.random.normal(loc=7, scale=1.5, size=n_samples), 1, 10)
}



df = pd.DataFrame(data)

physical_activity = np.where(
    (mood_rating > 6) & (stress_levels < 5),
    np.random.choice(['high', 'moderate', 'low'], n_samples, p=[0.5, 0.4, 0.1]),
    np.random.choice(['moderate', 'low'], n_samples, p=[0.3, 0.7])
)
df['physical_activity'] = physical_activity



eating_habits = np.where(
    (anxiety_levels > 6) | (stress_levels > 6) | (substance_use == 'frequent'),
    np.random.choice(['unhealthy', 'moderate', 'healthy'], n_samples, p=[0.6, 0.3, 0.1]),
    np.random.choice(['healthy', 'moderate', 'unhealthy'], n_samples, p=[0.5, 0.4, 0.1])
)

df['eating_habits'] = eating_habits




mental_health_checkups = np.where(
    (diagnosis_sampled != 'none') | (stress_levels > 6),
    np.random.choice(['frequent', 'occasional', 'rare'], n_samples, p=[0.6, 0.3, 0.1]),
    np.random.choice(['occasional', 'rare'], n_samples, p=[0.2, 0.8])
)

df['mental_health_checkups'] = mental_health_checkups



work_life_balance = np.where(
    (occupations_sampled == 'employed') & (ages < 40) & (stress_levels < 5),
    np.random.choice(['good', 'moderate', 'poor'], n_samples, p=[0.6, 0.3, 0.1]),
    np.random.choice(['poor', 'moderate', 'good'], n_samples, p=[0.5, 0.4, 0.1])
)

df['work_life_balance'] = work_life_balance


digital_device_usage = np.where(
    (ages < 30) | (mood_rating < 5),
    np.clip(np.random.normal(loc=7, scale=1.5, size=n_samples), 2, 10),
    np.clip(np.random.normal(loc=5, scale=1.5, size=n_samples), 2, 10)  
)

df['digital_device_usage'] = digital_device_usage



exercise_frequency = np.where(
    (physical_activity == 'high') | (coping_mechanisms == 'exercise'),
    np.random.choice(['daily', 'several times a week', 'rarely'], n_samples, p=[0.5, 0.4, 0.1]),
    np.random.choice(['several times a week', 'rarely'], n_samples, p=[0.3, 0.7])
)

df['exercise_frequency'] = exercise_frequency


relationship_satisfaction = np.where(
    (marital_status_sampled == 'married') & (mood_rating > 6),
    np.random.choice(['satisfied', 'neutral', 'dissatisfied'], n_samples, p=[0.7, 0.2, 0.1]),
    np.random.choice(['neutral', 'dissatisfied', 'satisfied'], n_samples, p=[0.5, 0.4, 0.1])
)

df['relationship_satisfaction'] = relationship_satisfaction



financial_stress = np.where(
    (income == 'Low') | (occupations_sampled == 'unemployed'),
    np.random.choice(['high', 'moderate', 'low'], n_samples, p=[0.6, 0.3, 0.1]),
    np.random.choice(['low', 'moderate', 'high'], n_samples, p=[0.6, 0.3, 0.1])
)

df['financial_stress'] = financial_stress



social_media_usage = np.where(
    (ages < 30) | (anxiety_levels > 6),
    np.clip(np.random.normal(loc=4, scale=1, size=n_samples), 0, 8),  
    np.clip(np.random.normal(loc=2, scale=1, size=n_samples), 0, 8)  
)


social_interactions = np.where(
    (df['mood_rating'] > 6),
    np.clip(np.random.normal(loc=7, scale=2, size=n_samples), 0, 15),  
    np.clip(np.random.normal(loc=3, scale=2, size=n_samples), 0, 15)   
)


satisfaction_with_treatment = np.where(
   (df['mental_health_checkups'] == 'frequent') & (df['diagnosis'] != 'none'),
    np.clip(np.random.normal(loc=8, scale=1, size=n_samples), 1, 10),  
    np.clip(np.random.normal(loc=5, scale=1.5, size=n_samples), 1, 10) 
)


goal_achievement = np.where(
    (df['anxiety_levels'] < 5) & (df['stress_levels'] < 5),
    'yes',  
    'no'    
)


sleep_hours = np.where(
    (df['substance_use'] == 'none') & (df['mood_rating'] > 6) & (df['stress_levels'] < 5),
    np.clip(np.random.normal(loc=8, scale=1, size=n_samples), 4, 10),  
    np.where((df['substance_use'] == 'frequent') | (df['stress_levels'] > 7),
             np.clip(np.random.normal(loc=5, scale=1, size=n_samples), 3, 9),  
             np.clip(np.random.normal(loc=6.5, scale=1, size=n_samples), 4, 10))  
)


df['social_interactions'] = social_interactions
df['satisfaction_with_treatment'] = satisfaction_with_treatment
df['goal_achievement'] = goal_achievement
df['sleep_hours'] = sleep_hours




coping_mechanisms = np.where(
  (df['anxiety_levels'] < 5) & (df['mood_rating'] > 6),
    np.random.choice(['exercise', 'meditation', 'podcast'], n_samples),  
    np.where(df['anxiety_levels'] > 7,
             np.random.choice(['substance use', 'journaling', 'therapy'], n_samples),  
             np.random.choice(['therapy', 'journaling'], n_samples))  
)


mental_health_checkups = np.where(
    (df['anxiety_levels'] > 6) | (df['diagnosis'] != 'none') | (df['satisfaction_with_treatment'] < 5),
    np.random.choice(['frequent', 'occasional'], n_samples),  
    np.random.choice(['rare', 'occasional'], n_samples) 
)


diagnosis_sampled = np.where(
     (df['stress_levels'] > 6),
    np.random.choice(['depression', 'anxiety', 'PTSD','none'], n_samples),  
    np.where(df['substance_use'] == 'frequent',
             np.random.choice(['substance abuse', 'anxiety','none'], n_samples),  
             np.random.choice(['none', 'mild anxiety'], n_samples))  
)


mood_rating = np.where(
    (df['diagnosis'] == 'depression') | (df['coping_mechanisms'] == 'substance use'),
    np.clip(np.random.normal(loc=3, scale=1, size=n_samples), 1, 10), 
    np.where(df['goal_achievement'] == 'yes',
             np.clip(np.random.normal(loc=8, scale=1, size=n_samples), 5, 10), 
             np.clip(np.random.normal(loc=5, scale=1, size=n_samples), 1, 10)) 
)


goal_achievement = np.where(
    (df['social_interactions'] > 5) & (df['marital_status'] == 'married') & (df['income_level'] == 'High'),
    'yes',  
    'no'   
)


income = np.where(
    (ages > 30) & (df['education_level'] == 'Bachelor') & (df['occupation'] == 'employed'),
    'High', 
    np.where((ages < 25) & (df['education_level'] == 'High School') & (df['occupation'] != 'employed'),
             'No income', 
             np.random.choice(['Low', 'Medium'], n_samples))  
)


df['coping_mechanisms'] = coping_mechanisms
df['mental_health_checkups'] = mental_health_checkups
df['diagnosis'] = diagnosis_sampled
df['mood_rating'] = mood_rating
df['goal_achievement'] = goal_achievement
df['income_level'] = income


df.to_csv('synthetic_mental_health_data_correlated.csv', index=False)


  sleep_hours[i] = np.clip(np.random.normal(loc=5, scale=1, size=1), 2, 8)
  sleep_hours[i] = np.clip(np.random.normal(loc=6, scale=1, size=1), 3, 9)
  sleep_hours[i] = np.clip(np.random.normal(loc=8, scale=1, size=1), 4, 10)


In [280]:
df=pd.read_csv('synthetic_mental_health_data_correlated.csv')
df

Unnamed: 0,patient_id,age,gender,occupation,education_level,marital_status,income_level,diagnosis,medications,mood_rating,...,goal_achievement,satisfaction_with_treatment,physical_activity,eating_habits,mental_health_checkups,work_life_balance,digital_device_usage,exercise_frequency,relationship_satisfaction,financial_stress
0,1,56,female,unemployed,Master,married,Medium,depression,Benzo,5.701849,...,no,6.820960,moderate,moderate,frequent,moderate,5.461703,rarely,satisfied,moderate
1,2,46,male,employed,Doctorate,divorced,Low,mild anxiety,,9.307484,...,no,2.813673,low,moderate,occasional,poor,2.434747,rarely,dissatisfied,moderate
2,3,32,male,unemployed,Bachelor,married,Low,none,Benzo,5.654492,...,no,7.535383,low,moderate,frequent,poor,9.823037,rarely,neutral,low
3,4,60,female,employed,Bachelor,married,High,none,Mood stabilizer,5.005834,...,no,3.871765,low,moderate,frequent,poor,6.114896,rarely,dissatisfied,low
4,5,25,non-binary,employed,Doctorate,single,Low,none,Benzo,3.313538,...,no,8.283288,low,moderate,frequent,poor,9.389780,several times a week,satisfied,moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,24,female,unemployed,Bachelor,single,Medium,PTSD,Mood stabilizer,4.737704,...,no,3.791195,low,unhealthy,frequent,moderate,8.654953,several times a week,dissatisfied,moderate
96,97,26,non-binary,employed,Master,married,Low,mild anxiety,SSRI,5.373481,...,no,6.593683,low,moderate,frequent,good,7.171341,rarely,satisfied,moderate
97,98,41,male,unemployed,Master,married,Low,mild anxiety,,6.642209,...,no,7.916894,low,healthy,frequent,moderate,6.334446,rarely,dissatisfied,moderate
98,99,18,male,unemployed,High School,single,No income,anxiety,SSRI,5.433881,...,no,3.148477,low,moderate,occasional,moderate,6.454582,rarely,neutral,low


In [325]:
df['stress_levels']

0      9.103253
1      4.599093
2      5.831929
3      5.285175
4      3.528080
        ...    
95    10.000000
96     5.560226
97     4.087872
98     6.997115
99     1.764681
Name: stress_levels, Length: 100, dtype: float64

In [260]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [281]:
print("Dataset Info:")
df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   patient_id                   100 non-null    int64  
 1   age                          100 non-null    int64  
 2   gender                       100 non-null    object 
 3   occupation                   100 non-null    object 
 4   education_level              100 non-null    object 
 5   marital_status               100 non-null    object 
 6   income_level                 100 non-null    object 
 7   diagnosis                    100 non-null    object 
 8   medications                  76 non-null     object 
 9   mood_rating                  100 non-null    float64
 10  sleep_hours                  100 non-null    float64
 11  substance_use                100 non-null    object 
 12  social_interactions          100 non-null    float64
 13  coping_

In [276]:
missing_values = df.isnull().sum()
print("Missing Values per Column:")
print(missing_values[missing_values > 0])

Missing Values per Column:
medications    24
dtype: int64


In [282]:
df.columns
df.fillna('unknown',inplace=True)
missing_values = df.isnull().sum()
print("Missing Values per Column:")
print(missing_values[missing_values > 0])

Missing Values per Column:
Series([], dtype: int64)


In [283]:
num_cols = df._get_numeric_data().columns
num_cols

Index(['patient_id', 'age', 'mood_rating', 'sleep_hours',
       'social_interactions', 'anxiety_levels', 'stress_levels',
       'satisfaction_with_treatment', 'digital_device_usage'],
      dtype='object')

In [211]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [316]:
df.columns

Index(['patient_id', 'age', 'gender', 'occupation', 'education_level',
       'marital_status', 'income_level', 'diagnosis', 'medications',
       'mood_rating', 'sleep_hours', 'substance_use', 'social_interactions',
       'coping_mechanisms', 'anxiety_levels', 'stress_levels',
       'goal_achievement', 'satisfaction_with_treatment', 'physical_activity',
       'eating_habits', 'mental_health_checkups', 'work_life_balance',
       'digital_device_usage', 'exercise_frequency',
       'relationship_satisfaction', 'financial_stress'],
      dtype='object')

In [None]:


categorical_columns = [
    'gender', 'occupation', 'education_level', 'marital_status', 
    'income_level', 'coping_mechanisms', 'physical_activity', 
]

numerical_columns = [
   'age',  'sleep_hours', 'stress_levels'
]



preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first'), categorical_columns)
    ]
)
preprocessor.fit_transform(df)


pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', NearestNeighbors(n_neighbors=5, algorithm='auto'))
])


X=df.drop(columns=['patient_id','medications','substance_use','anxiety_levels','satisfaction_with_treatment','exercise_frequency','digital_device_usage','mood_rating','diagnosis','mental_health_checkups','work_life_balance','eating_habits','relationship_satisfaction','financial_stress'])


pipeline.fit(X)


user_data = {
    'patient_id': 1, 'age': 30, 'mood_rating': 7, 'sleep_hours': 6,
    'social_interactions': 5, 'anxiety_levels': 5,
    'stress_levels': 6, 'satisfaction_with_treatment': 7, 'digital_device_usage': 3,
    'gender': 'male', 'occupation': 'employed', 'education_level': 'Bachelor',
    'marital_status': 'single', 'income_level': 'Medium', 'diagnosis': 'anxiety',
    'substance_use': 'occasional', 'coping_mechanisms': 'exercise', 
    'goal_achievement': 'yes', 'physical_activity': 'low', 'eating_habits': 'healthy', 'mental_health_checkups': 'occasional',
    'work_life_balance': 'moderate', 'exercise_frequency': 'rarely', 
    'relationship_satisfaction': 'dissatisfied', 'financial_stress': 'moderate'
}


user_df = pd.DataFrame([user_data])

for col in numerical_columns:
    user_df[col] = pd.to_numeric(df[col], errors='coerce')  


user_df = user_df.dropna(subset=numerical_columns)  
user_df = user_df.fillna('Unknown') 


for col in categorical_columns:
    user_df[col] = user_df[col].astype(str)




user_transformed = pipeline.named_steps['preprocessor'].transform(user_df)
distances, indices = pipeline.named_steps['model'].kneighbors(user_transformed)
recommended_users = df.iloc[indices[0]]

print("Recommended Users:")
print(recommended_users.head(2))

Recommended Users:
    patient_id  age gender  occupation education_level marital_status  \
34          35   64   male  unemployed        Bachelor        married   
27          28   61   male  unemployed        Bachelor       divorced   

   income_level   diagnosis medications  mood_rating  ...  goal_achievement  \
34       Medium  depression        SSRI     5.041723  ...                no   
27       Medium        none       Benzo     5.249012  ...                no   

   satisfaction_with_treatment  physical_activity eating_habits  \
34                    8.709452                low     unhealthy   
27                    5.220190                low      moderate   

    mental_health_checkups  work_life_balance digital_device_usage  \
34                frequent               poor            10.000000   
27              occasional           moderate             3.866926   

    exercise_frequency relationship_satisfaction financial_stress  
34              rarely              dissat

In [322]:
import joblib
from joblib import dump
pipeline.fit(X)

dump(pipeline, 'model.pkl')
dump(preprocessor,'preprocessor.pkl')


['preprocessor.pkl']

In [323]:
pipeline = joblib.load('model.pkl')
pipeline