In [23]:
import numpy as np
import pandas as pd

# Set random seed for reproducibility. This seeds NumPy's global RNG, and every
# draw below consumes that stream in order, so reordering statements changes the data.
np.random.seed(42)

# Number of samples
n_samples = 20000

# Generate the raw feature columns. Each one is drawn independently here; the
# relationships to the PHQ-9 score are introduced when the score is built below.
age = np.random.normal(50, 12, n_samples).clip(18, 85)  # Age
chronic_disease_type = np.random.choice(['Diabetes', 'Heart Disease', 'Kidney Disease', 'Arthritis'], size=n_samples)
disease_severity = np.random.choice(['Mild', 'Moderate', 'Severe'], size=n_samples, p=[0.5, 0.35, 0.15])
duration_of_illness = np.random.normal(10, 5, n_samples).clip(1, 30)
diet_quality = np.random.choice(['Poor', 'Average', 'Good'], size=n_samples, p=[0.2, 0.5, 0.3])
exercise_frequency = np.random.normal(3, 1.5, n_samples).clip(0, 7)
social_engagement = np.random.choice(['Low', 'Medium', 'High'], size=n_samples, p=[0.3, 0.5, 0.2])
sleep_quality = np.random.choice(['Poor', 'Average', 'Good'], size=n_samples, p=[0.2, 0.5, 0.3])

# Correlate features with Baseline PHQ-9 Score.
# Start from a constant of 15, add the risk-factor terms, subtract the
# protective-factor terms (each scaled by its own uniform random weight),
# then clip the total to the PHQ-9 range.
baseline_phq9_score = (
    15
    + (age / 85 * np.random.uniform(1, 3, n_samples))  # Age correlates with baseline score
    + (np.isin(chronic_disease_type, ['Kidney Disease', 'Heart Disease']).astype(int) * np.random.uniform(2, 4, n_samples))  # Specific diseases impact score
    + (disease_severity == 'Severe').astype(int) * np.random.uniform(3, 5, n_samples)  # Severe disease increases score
    + (duration_of_illness / 30 * np.random.uniform(1, 3, n_samples))  # Longer illness duration increases score
    - (diet_quality == 'Good').astype(int) * np.random.uniform(2, 4, n_samples)  # Good diet reduces score
    - (exercise_frequency / 7 * np.random.uniform(2, 4, n_samples))  # More exercise reduces score
    - (social_engagement == 'High').astype(int) * np.random.uniform(2, 4, n_samples)  # High social engagement reduces score
    - (sleep_quality == 'Good').astype(int) * np.random.uniform(2, 4, n_samples)  # Good sleep reduces score
).clip(0, 27)  # PHQ-9 scale (0-27)

# Correlate features with Follow-up PHQ-9 Score.
# Each sample improves by a normally distributed amount limited to 0-5 points.
phq9_improvement = np.random.normal(3, 1.5, n_samples).clip(0, 5)
# Clip the result as well: a baseline smaller than the improvement would
# otherwise produce a negative score, which is outside the PHQ-9 scale (0-27).
followup_phq9_score = (baseline_phq9_score - phq9_improvement).clip(0, 27)

# Convert follow-up PHQ-9 scores to categories
def categorize_phq9(score):
    """Map a PHQ-9 score to its severity label.

    The bands are tested in ascending order by inclusive upper bound:
    <= 4 'Minimal', <= 9 'Mild', <= 14 'Moderate', <= 19 'Moderately Severe'.
    Any score that matches none of them (including a value for which every
    comparison is false, such as NaN) is labelled 'Severe'.

    Args:
        score: Numeric PHQ-9 score; fractional values are accepted.

    Returns:
        The severity label as a string.
    """
    severity_bands = (
        (4, 'Minimal'),
        (9, 'Mild'),
        (14, 'Moderate'),
        (19, 'Moderately Severe'),
    )
    for upper_bound, label in severity_bands:
        if score <= upper_bound:
            return label
    return 'Severe'

# Generate text-based symptoms: one label per sample, drawn uniformly
symptom_options = [
    'mild anxiety', 'moderate anxiety', 'severe anxiety',
    'mild depression', 'moderate depression', 'severe depression',
    'mild stress', 'moderate stress', 'severe stress',
]
symptoms = np.random.choice(symptom_options, size=n_samples)

# Generate demographic features: one label per sample, drawn uniformly
demographic_options = ['single', 'married', 'divorced', 'employed', 'unemployed']
demographic_info = np.random.choice(demographic_options, size=n_samples)

# Collect the columns; continuous values are rounded before they go into the frame
column_data = {
    'Age': age.round().astype(int),
    'Chronic_Disease_Type': chronic_disease_type,
    'Disease_Severity': disease_severity,
    'Duration_of_Illness_Years': duration_of_illness.round(1),
    'Diet_Quality': diet_quality,
    'Exercise_Frequency_Per_Week': exercise_frequency.round(1),
    'Social_Engagement': social_engagement,
    'Sleep_Quality': sleep_quality,
    'Baseline_PHQ9_Score': baseline_phq9_score.round(1),
    'Followup_PHQ9_Score': followup_phq9_score.round(1),
    'Symptoms': symptoms,
    'Demographic_Info': demographic_info,
}

# Create DataFrame
df = pd.DataFrame(column_data)

# Replace the numeric follow-up score with its severity label; from here on
# 'Followup_PHQ9_Score' holds strings, not numbers
followup_labels = df['Followup_PHQ9_Score'].apply(categorize_phq9)
df['Followup_PHQ9_Score'] = followup_labels

# Write the dataset to a CSV file without the index column
df.to_csv("cleaned_depression_dataset.csv", index=False)

print("Updated dataset with cleaned data has been saved as 'cleaned_depression_dataset.csv'.")


Updated dataset with cleaned data has been saved as 'cleaned_depression_dataset.csv'.


In [20]:
# Display the generated DataFrame as the cell output
df

Unnamed: 0,Age,Chronic_Disease_Type,Disease_Severity,Duration_of_Illness_Years,Diet_Quality,Exercise_Frequency_Per_Week,Social_Engagement,Sleep_Quality,Baseline_PHQ9_Score,Followup_PHQ9_Score,Symptoms,Demographic_Info
0,56,Arthritis,Mild,12.8,Poor,4.5,Low,Average,13.5,Moderate,moderate stress,single
1,48,Diabetes,Severe,8.4,Good,3.5,Medium,Average,16.0,Moderate,moderate stress,divorced
2,58,Heart Disease,Moderate,12.8,Average,3.7,Low,Poor,13.9,Moderate,severe depression,unemployed
3,68,Diabetes,Mild,14.9,Average,3.7,High,Average,14.4,Moderate,severe stress,single
4,47,Arthritis,Severe,5.6,Good,0.4,Medium,Good,11.8,Moderate,moderate stress,married
...,...,...,...,...,...,...,...,...,...,...,...,...
19995,55,Heart Disease,Severe,6.0,Average,0.0,High,Good,13.6,Moderate,moderate anxiety,married
19996,67,Kidney Disease,Mild,14.7,Average,2.2,Low,Average,20.8,Moderately Severe,mild depression,employed
19997,55,Arthritis,Mild,14.0,Poor,2.4,Low,Average,15.2,Moderate,mild anxiety,unemployed
19998,71,Kidney Disease,Mild,9.5,Average,2.1,Medium,Average,21.2,Moderately Severe,severe stress,single


In [None]:
# Remove outliers using the 1.5 * IQR rule
def remove_outliers(df, columns):
    """Drop rows whose values lie outside the 1.5 * IQR fences of the given columns.

    Columns are processed in the order given, and each filter is applied to the
    result of the previous one, so the quartiles of a later column are computed
    on the rows that survived the earlier filters. Rows with a missing value in
    a filtered column are dropped too, because the bound comparisons are false
    for NaN.

    Non-numeric and boolean columns are skipped: quartiles and the IQR
    arithmetic are not defined for them and would raise a TypeError.

    Args:
        df: Input DataFrame. It is not modified.
        columns: Iterable of column names to filter on.

    Returns:
        A DataFrame containing only the rows within the bounds of every
        numeric column listed (the input itself if nothing was filtered).

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``df``.
    """
    for col in columns:
        values = df[col]
        # Skip columns that have no meaningful quartiles (e.g. category labels)
        if not pd.api.types.is_numeric_dtype(values) or pd.api.types.is_bool_dtype(values):
            continue
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(values >= lower_bound) & (values <= upper_bound)]
    return df

# Remove outliers from the numeric key columns. 'Followup_PHQ9_Score' is left
# out because it holds PHQ-9 severity labels (strings) at this point, so it has
# no quartiles to compute and would make the quantile step fail.
df_clean = remove_outliers(df, ['Age', 'Duration_of_Illness_Years', 'Baseline_PHQ9_Score'])