In [2]:
import os
import random
from itertools import combinations
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

In [3]:
# ========== STEP 1: Enhanced Parameter Definitions ==========
# Catalogue of every trait and the values it may take. Each entry maps a
# trait name to the list of allowed options, in a fixed order.
parameters = dict(
    # --- Core traits: answered directly by the questionnaire ---
    sleep=["Early", "On-time", "Night owl"],
    diet=["Vegetarian", "Vegan", "Flexitarian", "Non-vegetarian", "Eggetarian"],
    sharing_comfort=["Not comfortable", "Somewhat okay", "Very open"],
    cleanliness=list(range(1, 6)),  # integer scale 1..5
    sociability=["Quiet", "Balanced", "Social/Chill"],
    # --- Derived traits: interpreted from the questionnaire answers ---
    noise_tolerance=["Low", "Medium", "High"],
    conflict_style=["Passive", "Passive-aggressive", "Assertive"],
    boundary_strength=["Rigid", "Moderate", "Flexible"],
    stress_response=["Organized", "Procrastinator", "Reactive"],
    collaboration=["Solo", "Selective", "Team"],
    communication=["Indirect", "Mixed", "Direct"],
    autonomy=["High", "Medium", "Low"],
)

In [4]:
# ========== STEP 2: Synthetic Data Generation ==========
def generate_user(id, rng=None):
    """Build one synthetic roommate profile whose traits are correlated.

    Traits are drawn in a fixed order (sleep, diet, cleanliness, sociability,
    sharing comfort, ...) and later traits are conditioned on earlier ones, so
    the resulting records are not independent uniform samples.

    Parameters
    ----------
    id : int
        Numeric identifier; rendered as ``"U"`` plus the number zero-padded
        to three digits (e.g. ``7`` -> ``"U007"``).
    rng : random.Random or None, optional
        Source of randomness exposing ``choices`` and ``choice``. When
        ``None`` (the default) the module-level ``random`` functions are used,
        so ``random.seed(...)`` controls the output exactly as before. Pass a
        dedicated ``random.Random(seed)`` for a reproducible, isolated stream.

    Returns
    -------
    dict
        Keys, in insertion order: ``id``, ``sleep``, ``diet``,
        ``cleanliness``, ``sociability``, ``noise_tolerance``,
        ``sharing_comfort``, ``boundary_strength``, ``stress_response``,
        ``conflict_style``, ``collaboration``, ``autonomy``.

    Notes
    -----
    NOTE(review): ``parameters`` also lists a ``communication`` trait, but no
    value is generated for it here -- confirm whether that is intentional.
    """
    # Fall back to the global generator so existing single-argument callers
    # keep their behaviour (including any global seeding they rely on).
    rng = random if rng is None else rng

    def pick(trait, weights):
        """Weighted draw of a single option from ``parameters[trait]``."""
        return rng.choices(parameters[trait], weights=weights)[0]

    user = {"id": f"U{id:03d}"}

    # Sleep schedule is the root trait; several others are conditioned on it.
    user["sleep"] = pick("sleep", [30, 40, 30])
    sleep = user["sleep"]

    # Diet: early risers are weighted more heavily towards vegetarian.
    if sleep == "Early":
        user["diet"] = pick("diet", [40, 20, 15, 15, 10])
    else:
        user["diet"] = pick("diet", [25, 15, 20, 30, 10])
    diet = user["diet"]

    # Cleanliness: vegetarian/vegan/eggetarian diets skew towards higher scores.
    if diet in ["Vegetarian", "Vegan", "Eggetarian"]:
        user["cleanliness"] = pick("cleanliness", [5, 15, 25, 35, 20])
    else:
        user["cleanliness"] = pick("cleanliness", [15, 25, 30, 20, 10])

    # Sociability is drawn with sleep-dependent weights; noise tolerance is
    # fully determined by the sleep schedule.
    if sleep == "Early":
        user["sociability"] = pick("sociability", [50, 35, 15])
        user["noise_tolerance"] = "Low"
    elif sleep == "Night owl":
        user["sociability"] = pick("sociability", [10, 30, 60])
        user["noise_tolerance"] = "High"
    else:
        user["sociability"] = pick("sociability", [20, 50, 30])
        user["noise_tolerance"] = "Medium"

    # Sharing comfort and boundaries: vegan/vegetarian users are modelled as
    # less open to sharing and always get rigid boundaries.
    if diet in ["Vegan", "Vegetarian"]:
        user["sharing_comfort"] = pick("sharing_comfort", [60, 30, 10])
        user["boundary_strength"] = "Rigid"
    else:
        user["sharing_comfort"] = pick("sharing_comfort", [20, 50, 30])
        user["boundary_strength"] = rng.choice(["Moderate", "Flexible"])

    # Stress response is a deterministic function of the cleanliness score.
    if user["cleanliness"] >= 4:
        user["stress_response"] = "Organized"
    elif user["cleanliness"] <= 2:
        user["stress_response"] = "Reactive"
    else:
        user["stress_response"] = "Procrastinator"

    # Conflict style: only quiet users get a random (passive-leaning) style;
    # everyone else is assertive.
    if user["sociability"] == "Quiet":
        user["conflict_style"] = rng.choices(
            ["Passive", "Passive-aggressive"], weights=[70, 30]
        )[0]
    else:
        user["conflict_style"] = "Assertive"

    # Collaboration and autonomy: social users are team players with low
    # autonomy; the rest work solo or selectively with high autonomy.
    if user["sociability"] == "Social/Chill":
        user["collaboration"] = "Team"
        user["autonomy"] = "Low"
    else:
        user["collaboration"] = rng.choice(["Solo", "Selective"])
        user["autonomy"] = "High"

    return user

# Generate the synthetic user table.
# Seeding the global RNG first makes the dataset identical on every
# Restart & Run All; change SEED to draw a different sample.
SEED = 42
N_USERS = 500
random.seed(SEED)

data = [generate_user(i) for i in range(1, N_USERS + 1)]
df = pd.DataFrame(data)

# Sanity checks: one row per requested user, and no duplicated ids.
assert len(df) == N_USERS
assert df["id"].is_unique

In [None]:
# Destination for the generated dataset. Set the MODEL_DATA_CSV environment
# variable to write somewhere else; the original path is kept as the default
# so existing runs are unaffected.
MODEL_DATA_CSV = Path(
    os.environ.get(
        "MODEL_DATA_CSV",
        r'C:\Users\Lenovo\Documents\Hackathon\Nivasa\roommate-ai-backend\data\modelData.csv',
    )
)
# Create the target folder if needed so the write does not fail on a missing directory.
MODEL_DATA_CSV.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(MODEL_DATA_CSV, index=False)