# In [6]:
import ast
import random
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Read the raw user table from disk.
df = pd.read_csv("Dating App Dataset.csv")

# Remove the "User ID" column, then renumber the rows from 0 so that
# positional and label-based indexing agree.
profiles_without_id = df.drop(columns=["User ID"])
user_profiles = profiles_without_id.reset_index(drop=True)


# ---------- Utility Functions ----------

def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections of items.

    Each argument may be either a string holding a Python literal collection
    (for example ``"['Hiking', 'Music']"``) or an already-parsed iterable of
    hashable items.

    Strings are parsed with ``ast.literal_eval`` rather than ``eval`` so that
    text read from a data file can never execute code.

    Args:
        list1: First collection, or its string representation.
        list2: Second collection, or its string representation.

    Returns:
        ``0`` when either collection is empty, otherwise
        ``len(intersection) / len(union)`` as a float in ``(0, 1]`` or ``0.0``
        when the collections are disjoint.

    Raises:
        ValueError, SyntaxError: If a string argument is not a valid Python
            literal.
    """
    def _as_set(value):
        # Only strings need parsing; parsed collections are used directly.
        if isinstance(value, str):
            value = ast.literal_eval(value)
        return set(value)

    set1, set2 = _as_set(list1), _as_set(list2)
    if not set1 or not set2:
        return 0
    return len(set1 & set2) / len(set1 | set2)

def education_similarity(e1, e2):
    """Return how close two education levels are on a fixed four-step ladder.

    The ladder, from lowest to highest, is High School, Bachelor's Degree,
    Master's Degree, Ph.D.  The result is ``1 - distance / 4`` where
    ``distance`` is the number of steps between the two levels, so identical
    levels give ``1.0`` and the two extremes give ``0.25``.

    Args:
        e1: Education level of the first user.
        e2: Education level of the second user.

    Returns:
        The similarity as a float, or ``0.5`` when either level is not on the
        ladder.
    """
    ordered_levels = ('High School', "Bachelor's Degree", "Master's Degree", 'Ph.D.')
    try:
        rank1 = ordered_levels.index(e1)
        rank2 = ordered_levels.index(e2)
    except ValueError:
        # At least one level is unknown: fall back to a neutral score.
        return 0.5
    gap = abs(rank1 - rank2)
    return 1 - gap / len(ordered_levels)

def occupation_similarity(o1, o2):
    """Return 1 when both occupations compare equal, otherwise 0."""
    if o1 == o2:
        return 1
    return 0

def calculate_compatibility(user1, user2):
    """Compute a heuristic compatibility score for two user profiles.

    Both arguments are indexed by column name (``user['Age']`` and so on).
    The score is the sum of these contributions, 100 at most:

    * 20 when 'Looking For' matches
    * 10 when the ages differ by at most 5
    * up to 30, scaled by the Jaccard similarity of 'Interests'
    * up to 10, scaled by the education-level similarity
    * 10 when 'Frequency of Usage' matches
    * 10 when 'Children' matches
    * 5 when 'Occupation' matches
    * 5 when the heights differ by less than 0.5

    Args:
        user1: First profile.
        user2: Second profile.

    Returns:
        The total score rounded to two decimal places.
    """
    # Contributions are listed, and later added, in a fixed order so the
    # floating-point total is accumulated the same way every time.
    contributions = [
        20 if user1['Looking For'] == user2['Looking For'] else 0,
        10 if abs(user1['Age'] - user2['Age']) <= 5 else 0,
        jaccard_similarity(user1['Interests'], user2['Interests']) * 30,
        education_similarity(user1['Education Level'], user2['Education Level']) * 10,
        10 if user1['Frequency of Usage'] == user2['Frequency of Usage'] else 0,
        10 if user1['Children'] == user2['Children'] else 0,
        occupation_similarity(user1['Occupation'], user2['Occupation']) * 5,
        5 if abs(user1['Height'] - user2['Height']) < 0.5 else 0,
    ]

    total = 0
    for part in contributions:
        total += part
    return round(total, 2)

# ---------- Generate Synthetic Pairs ----------

num_pairs = 2000

# Each entry maps (key for user 1, key for user 2) to the profile column the
# values are copied from.  The order here fixes the column order of pairs_df.
_PAIR_FIELDS = (
    ("User1_Gender", "User2_Gender", "Gender"),
    ("User1_Age", "User2_Age", "Age"),
    ("User1_Interests", "User2_Interests", "Interests"),
    ("User1_LookingFor", "User2_LookingFor", "Looking For"),
    ("User1_Children", "User2_Children", "Children"),
    ("User1_Education", "User2_Education", "Education Level"),
    ("User1_Occupation", "User2_Occupation", "Occupation"),
    ("User1_Usage", "User2_Usage", "Frequency of Usage"),
    ("User1_Height", "User2_Height", "Height"),
)

profile_count = len(user_profiles)
pairs = []

for _ in range(num_pairs):
    # Draw two distinct row positions, then score that pair of profiles.
    first_idx, second_idx = random.sample(range(profile_count), 2)
    first_user = user_profiles.iloc[first_idx]
    second_user = user_profiles.iloc[second_idx]
    pair_score = calculate_compatibility(first_user, second_user)

    row = {"User1_ID": first_idx, "User2_ID": second_idx}
    for first_key, second_key, column in _PAIR_FIELDS:
        row[first_key] = first_user[column]
        row[second_key] = second_user[column]
    row["CompatibilityScore"] = pair_score
    pairs.append(row)

pairs_df = pd.DataFrame(pairs)

# ---------- Preprocess & Train Model ----------

# Define target and features
# The regression target is the heuristic score; the features are everything
# else except the two row-position identifiers.
target = pairs_df["CompatibilityScore"]
features = pairs_df.drop(columns=["User1_ID", "User2_ID", "CompatibilityScore"])

# Columns are assigned to a group by substring match on their name.
_CATEGORICAL_MARKERS = ("Gender", "LookingFor", "Children", "Education", "Occupation", "Usage")
_NUMERICAL_MARKERS = ("Age", "Height")

categorical_cols = [
    name
    for name in features.columns
    if any(marker in name for marker in _CATEGORICAL_MARKERS)
]
numerical_cols = [
    name
    for name in features.columns
    if any(marker in name for marker in _NUMERICAL_MARKERS)
]

# Interest similarity as a feature
def transform_interests(df):
    """Build the per-pair interest-similarity feature.

    Args:
        df: DataFrame with "User1_Interests" and "User2_Interests" columns,
            each holding values accepted by ``jaccard_similarity``.

    Returns:
        A new single-column DataFrame named "Interest_Similarity" with one
        row per input row, in the same order, and a fresh 0-based index.
    """
    # Walk the two columns in lockstep instead of doing a positional row
    # lookup (df.iloc[i]) twice per row.
    similarities = [
        jaccard_similarity(first, second)
        for first, second in zip(df["User1_Interests"], df["User2_Interests"])
    ]
    return pd.DataFrame({"Interest_Similarity": similarities})

# Create interest similarity
interest_similarity = transform_interests(features)

# Replace the two raw interest columns with the single similarity column.
# The index is reset first so the column-wise concat lines up row by row
# with the similarity frame's 0-based index.
features_model = pd.concat(
    [
        features.drop(columns=["User1_Interests", "User2_Interests"]).reset_index(drop=True),
        interest_similarity,
    ],
    axis=1,
)

# Preprocessor: one-hot encode the categorical columns and pass the numeric
# columns plus the interest-similarity score through unchanged.
# handle_unknown='ignore' means a category not seen during fit does not raise
# at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ("num", "passthrough", numerical_cols + ["Interest_Similarity"]),
    ]
)

# Pipeline: preprocessing followed by a random-forest regressor.
# random_state is pinned so that fitting on the same data yields the same
# forest and the reported RMSE is reproducible.
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),   # includes OneHotEncoder
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42)),
])


# Hold out 20% of the pairs for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    features_model,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing + regressor pipeline on the training portion.
model_pipeline.fit(X_train, y_train)

# Report root-mean-squared error on the held-out portion.
y_pred = model_pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print(f"✅ Model trained successfully. RMSE: {rmse:.2f}")

# Persist the fitted pipeline (preprocessor and regressor together).
import joblib
joblib.dump(model_pipeline, "model.pkl")

# Output:
# ✅ Model trained successfully. RMSE: 11.66
#
# ['model.pkl']