In [1]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("cleaned_dataset.csv")
continuous_cols = ['Amount', 'Area', 'Price']
df[continuous_cols] = StandardScaler().fit_transform(df[continuous_cols])

X_cont = df[continuous_cols].values
X = df.drop('Price', axis=1).values 
y = df['Price'].values 
print(df.columns)

Index(['Bathroom', 'Balcony', 'Car_Parking', 'Amount', 'Price', 'Area',
       'Remaining_Floor', 'Garden/Park', 'Pool', 'Main Road', 'BHK',
       'Location_Other', 'Location_ahmedabad', 'Location_bangalore',
       'Location_chennai', 'Location_faridabad', 'Location_greater-noida',
       'Location_gurgaon', 'Location_hyderabad', 'Location_jaipur',
       'Location_kolkata', 'Location_new-delhi', 'Location_pune',
       'Location_surat', 'Location_thane', 'Location_vadodara',
       'Transaction_New Property', 'Transaction_Other', 'Transaction_Resale',
       'Furnishing_Furnished', 'Furnishing_Semi-Furnished',
       'Furnishing_Unfurnished', 'Facing_East', 'Facing_North',
       'Facing_North - East', 'Facing_North - West', 'Facing_South',
       'Facing_South - East', 'Facing_South - West', 'Facing_Unknown',
       'Facing_West'],
      dtype='object')


In [3]:
#train-test split
n = X.shape[0]
indices = np.random.permutation(n)
train_size = int(0.8 * n)
train_idx, test_idx = indices[:train_size], indices[train_size:]

X_train_cont, X_test_cont = X_cont[train_idx], X_cont[test_idx]
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [4]:
# E-step: Fit Gaussian Mixture, tính responsibility cho từng điểm
n_components = 5
gmm = GaussianMixture(n_components=n_components, random_state=42)
gmm.fit(X_train_cont)
respons_train = gmm.predict_proba(X_train_cont)   
respons_test = gmm.predict_proba(X_test_cont)     

# M-step: Weighted OLS cho từng cluster
from sklearn.linear_model import LinearRegression
def fit_weighted_linear(X, y, w):
    reg = LinearRegression()
    reg.fit(X, y, sample_weight=w)
    beta = np.concatenate(([reg.intercept_], reg.coef_))
    return beta

# Với mỗi cluster k, fit một hồi quy tuyến tính sử dụng responsibility làm trọng số
betas = [fit_weighted_linear(X_train, y_train, respons_train[:, k]) for k in range(n_components)]
betas = np.stack(betas)  

In [5]:
def predict_mixture(X, respons, betas):
    Xb = np.hstack([np.ones((X.shape[0], 1)), X])
    all_preds = Xb @ betas.T
    return np.sum(respons * all_preds, axis=1)

def evaluate_regression(y_true, y_pred, name=''):
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{name} RMSE: {rmse:.4f} | MAE: {mae:.4f} | R²: {r2:.4f}")

y_pred_train = predict_mixture(X_train, respons_train, betas)
y_pred_test  = predict_mixture(X_test, respons_test, betas)

#evaluate_regression(y_train, y_pred_train, 'Train')
evaluate_regression(y_test, y_pred_test, 'Test')



Test RMSE: 0.3068 | MAE: 0.1753 | R²: 0.9059
