In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import KNNImputer

In [2]:
#โหลดข้อมูล
data = pd.read_csv("train_cleaned.csv")

In [7]:
#กำหนด features
features = [
    "OverallQual",
    "TotalBsmtSF",
    "LotArea",
    "GarageCars",
    "Fireplaces",
    "BedroomAbvGr",
    "GrLivArea",
    "FullBath",
    "Neighborhood"
  ]
target = 'SalePrice'

#ถ้าบางคอลัมน์เป็น category แปลงเป็นตัวเลข
for col in ['Neighborhood']:
    if data[col].dtype == 'object':
        data[col] = data[col].astype('category').cat.codes

X = data[features]
y = data[target]

# สร้างโมเดลทำนายราคาบ้าน
print("🎯 Training Price Model...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
price_model = RandomForestRegressor(n_estimators=200, random_state=42)
price_model.fit(X_train, y_train)

#บันทึกโมเดล
joblib.dump(price_model, "ml_model/price_model.joblib")
print("✅ Saved: price_model.joblib")

# สร้าง Reverse Models (เดา feature จากราคา)
#print("🎯 Training Reverse Models...")
#for f in features:
#    rev_X = y.values.reshape(-1, 1)  # ใช้ SalePrice เป็น input
#    rev_y = X[f].values
#    rev_model = LinearRegression()
#    rev_model.fit(rev_X, rev_y)
#    filename = f"ml_model/reverse_{f}.joblib"
#    joblib.dump(rev_model, filename)
#    print(f"✅ Saved: {filename}")

#สร้าง Imputer Model (เติมค่าที่ขาด)
print("Training Imputer Model...")
imputer = KNNImputer(n_neighbors=5)
imputer.fit(X)
joblib.dump(imputer, "ml_model/imputer_model.joblib")
print("Saved: imputer_model.joblib")

print("🎉 All models trained and saved successfully!")

🎯 Training Price Model...
✅ Saved: price_model.joblib
Training Imputer Model...
Saved: imputer_model.joblib
🎉 All models trained and saved successfully!


In [9]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score

# ===========================================================
# (Price Model)
# ===========================================================
print("\n Testing Price Model...")
price_model = joblib.load("ml_model/price_model.joblib")

y_pred = price_model.predict(X)

mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f" Mean Absolute Error (MAE): {mae:,.2f}")
print(f" R² Score: {r2:.3f}")

# ตัวอย่างทำนาย 1 บ้าน
sample = X.iloc[0:1]
pred_price = price_model.predict(sample)[0]
print("\n Example Prediction:")
print("Input features:", sample.to_dict(orient='records')[0])
print(f"Predicted SalePrice: {pred_price:,.2f}")
print(f"Actual SalePrice: {y.iloc[0]:,.2f}")

# ===========================================================
#  Reverse Models (ทำนาย features จากราคา)
# ===========================================================
#print("\n Testing Reverse Models...")
#for f in features:
#    reverse_model = joblib.load(f"ml_model/reverse_{f}.joblib")
#    sample_price = [[200000]]  # ทดสอบด้วยราคาบ้าน 200,000
#    predicted_value = reverse_model.predict(sample_price)[0]
#   print(f"{f}: {predicted_value:.2f}")

# ===========================================================
#  Imputer
# ===========================================================
print("\n Testing Imputer Model...")
imputer = joblib.load("ml_model/imputer_model.joblib")

# สร้างข้อมูลที่มี missing value
sample_with_missing = X.iloc[0:1].copy()
sample_with_missing.iloc[0, [2, 5]] = None  # ลบ GrLivArea และ FullBath

print("Before imputing:")
print(sample_with_missing)

filled = imputer.transform(sample_with_missing)
filled_df = pd.DataFrame(filled, columns=X.columns)

print("\nAfter imputing:")
print(filled_df)



 Testing Price Model...
 Mean Absolute Error (MAE): 9,703.72
 R² Score: 0.956

 Example Prediction:
Input features: {'OverallQual': 7, 'TotalBsmtSF': 856, 'LotArea': 8450, 'GarageCars': 2, 'Fireplaces': 0, 'BedroomAbvGr': 3, 'GrLivArea': 1710, 'FullBath': 2, 'Neighborhood': 5}
Predicted SalePrice: 201,029.75
Actual SalePrice: 208,500.00

 Testing Imputer Model...
Before imputing:
   OverallQual  TotalBsmtSF  LotArea  GarageCars  Fireplaces  BedroomAbvGr  \
0            7          856      NaN           2           0           NaN   

   GrLivArea  FullBath  Neighborhood  
0       1710         2             5  

After imputing:
   OverallQual  TotalBsmtSF  LotArea  GarageCars  Fireplaces  BedroomAbvGr  \
0          7.0        856.0   9379.8         2.0         0.0           3.2   

   GrLivArea  FullBath  Neighborhood  
0     1710.0       2.0           5.0  


# Testing

In [15]:
import pandas as pd
import numpy as np
import joblib

# -----------------------------
# 1 โหลดโมเดล
# -----------------------------
#price_model = joblib.load("ml_model/price_model.joblib")
imputer_model = joblib.load("ml_model/imputer_model.joblib")

features = [
    "OverallQual",
    "TotalBsmtSF",
    "LotArea",
    "GarageCars",
    "Fireplaces",
    "BedroomAbvGr",
    "GrLivArea",
    "FullBath",
    "Neighborhood"
]

# คอลัมน์ที่เป็น integer ต้องปัด
int_features = [
    "OverallQual", "TotalBsmtSF", "LotArea", "GarageCars",
    "Fireplaces", "BedroomAbvGr", "GrLivArea", "FullBath"
]

#reverse_models = {f: joblib.load(f"ml_model/reverse_{f}.joblib") for f in features}

# -----------------------------
# 2 Mapping Neighborhood
# -----------------------------
neighborhood_mapping = {
    0: "Unknown",
    1: "Bloomington Heights",
    2: "Bluestem",
    3: "Briardale",
    4: "Brookside",
    5: "Clear Creek",
    6: "College Creek",
    7: "Crawford",
    8: "Edwards",
    9: "Gilbert",
    10: "Iowa DOT and Rail Road",
    11: "Meadow Village",
    12: "Mitchell",
    13: "North Ames",
    14: "Northridge",
    15: "Northpark Villa",
    16: "Northridge Heights",
    17: "Northwest Ames",
    18: "Old Town",
    19: "South & West of Iowa State University",
    20: "Sawyer",
    21: "Sawyer West",
    22: "Somerset",
    23: "Stone Brook",
    24: "Timberland",
    25: "Veenker"
}

# -----------------------------
# 3 ข้อมูลจากผู้ใช้ (กรอกไม่ครบ)
# -----------------------------
user_input = {
    "OverallQual": 7,
    "GrLivArea": 1500,
    "GarageCars": None,
    "TotalBsmtSF": None,
    "FullBath": 2,
    "Neighborhood": None,
    "LotArea": 8000,
    "Fireplaces": None,
    "BedroomAbvGr": None,
    "SalePrice": None
}

input_df = pd.DataFrame([user_input])

# -----------------------------
# 4 Imputer Prediction
# -----------------------------
filled_imputer = input_df.drop(columns=["SalePrice"])
filled_imputer = filled_imputer[features]  # จัดลำดับคอลัมน์

# เติมค่าที่ขาดด้วย Imputer
filled_imputer_array = imputer_model.transform(filled_imputer)
filled_imputer = pd.DataFrame(filled_imputer_array, columns=features)

# ปัด integer ฟีเจอร์
for col in int_features:
    filled_imputer[col] = filled_imputer[col].round().astype('Int64')

# เติมค่า missing ของ Neighborhood เป็น 0
filled_imputer["Neighborhood"] = filled_imputer["Neighborhood"].fillna(0).round().astype(int)

# ทำนายราคาด้วย Price Model
pred_price_imputer = price_model.predict(filled_imputer)[0]

# แปลง Neighborhood เป็นชื่อจริงสำหรับแสดงผล
filled_imputer["Neighborhood"] = filled_imputer["Neighborhood"].map(lambda x: neighborhood_mapping.get(x, "Unknown"))

# -----------------------------
# 5 Reverse Prediction (ทำนาย features จาก SalePrice)
# -----------------------------
#if user_input["SalePrice"] is None:
#    guessed_features = {}
#    for f in features:
#        guessed_value = reverse_models[f].predict([[pred_price_imputer]])[0]
#        if f in int_features:
#            guessed_value = int(round(guessed_value))
#        if f == "Neighborhood":
#            guessed_value = int(round(guessed_value))
#        guessed_features[f] = guessed_value

    # สร้าง DataFrame สำหรับ price_model
#    filled_reverse = pd.DataFrame([guessed_features])
#    pred_price_reverse = price_model.predict(filled_reverse)[0]

    # แปลง Neighborhood เป็นชื่อจริง
#    filled_reverse["Neighborhood"] = filled_reverse["Neighborhood"].map(lambda x: neighborhood_mapping.get(x, "Unknown"))
#else:
#    filled_reverse = None
#    pred_price_reverse = None

# -----------------------------
#  แสดงผล
# -----------------------------
print("===  RESULT COMPARISON (Single Prediction with Neighborhood names) ===")
print(f"1 Imputer Prediction:    Price = {pred_price_imputer:,.0f}, Features =\n{filled_imputer.iloc[0].to_dict()}")
#print(f"2 Reverse Prediction:    Price = {pred_price_reverse:,.0f}, Features =\n{filled_reverse.iloc[0].to_dict()}")


===  RESULT COMPARISON (Single Prediction with Neighborhood names) ===
1 Imputer Prediction:    Price = 184,703, Features =
{'OverallQual': 7, 'TotalBsmtSF': 1125, 'LotArea': 8000, 'GarageCars': 2, 'Fireplaces': 0, 'BedroomAbvGr': 3, 'GrLivArea': 1500, 'FullBath': 2, 'Neighborhood': 'North Ames'}


In [21]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error

# -----------------------------
# โหลดโมเดล Imputer
# -----------------------------
try:
    imputer_model = joblib.load("ml_model/imputer_model.joblib")
    print("Imputer Model loaded successfully.\n")
except FileNotFoundError:
    print("ERROR: imputer_model.joblib not found.")
    exit()
neighborhood_list = [
    "Blmngtn", "Blueste", "Briardl", "Brooksd", "ClearCr",
    "CollgCr", "Crawfor", "Edwards", "Gilbert", "IDOTRR",
    "MeadowV", "Mitchel", "NWAmes", "NoRidge", "NPkVill",
    "NridgHt", "NwAmes", "OldTown", "SWISU", "Sawyer",
    "SawyerW", "Somerst", "StoneBr", "Timber", "Veenker"
]

neighborhood_to_num = {name: i for i, name in enumerate(neighborhood_list)}
# -----------------------------
# โหลด "ข้อมูลจริงที่สมบูรณ์"
# -----------------------------
try:
    full_data = pd.read_csv("train_cleaned.csv")
except FileNotFoundError:
    print("ERROR: ไม่พบไฟล์ข้อมูลดิบสำหรับใช้เป็น 'เฉลย'")
    print("โปรดระบุตำแหน่งไฟล์ข้อมูลที่ถูกต้อง")
    exit()

# คอลัมน์ที่โมเดล Imputer ถูกเทรนมา
features = [
    "OverallQual", "TotalBsmtSF", "LotArea", "GarageCars",
    "Fireplaces", "BedroomAbvGr", "GrLivArea", "FullBath", "Neighborhood"
]

# คัดลอกข้อมูลเฉพาะคอลัมน์ที่เกี่ยวข้อง
data_truth = full_data[features].copy()

print("Converting 'Neighborhood' strings to numbers...")
data_truth['Neighborhood'] = data_truth['Neighborhood'].map(neighborhood_to_num)

data_truth['Neighborhood'] = data_truth['Neighborhood'].fillna(0).astype(int)
# -----------------------------
# สร้าง เจาะจงลบข้อมูล
# -----------------------------
data_with_holes = data_truth.copy()

# สมมติทดสอบ 2 ฟีเจอร์: GarageCars และ TotalBsmtSF
# สุ่มลบข้อมูล 20% ของ 2 คอลัมน์นี้
features_to_test = ['GarageCars', 'TotalBsmtSF']
percent_missing = 0.2

print(f"--- Artificially creating {percent_missing*100}% missing values in {features_to_test} ---")

# เก็บเฉลยและindex ของแถวที่เราลบ
original_values = {}
missing_indices = {}

for col in features_to_test:
    # สุ่ม index ที่จะลบ
    np.random.seed(42) # เพื่อให้ผลลัพธ์คงที่
    n_missing = int(len(data_with_holes) * percent_missing)
    idx_to_remove = np.random.choice(data_with_holes.index, n_missing, replace=False)

    # เก็บ "เฉลย" (เฉพาะแถวที่ถูกลบ)
    original_values[col] = data_truth.loc[idx_to_remove, col].copy()
    missing_indices[col] = idx_to_remove

    # สั่งลบข้อมูล (แทนที่ด้วย NaN)
    data_with_holes.loc[idx_to_remove, col] = np.nan
    print(f"Created {len(idx_to_remove)} missing values in '{col}'")

print("--------------------------------------------------\n")

# -----------------------------
# ใช้ Imputer เติมค่าที่หายไป
# -----------------------------
print("Running imputer_model.transform()...")
# Imputer รับ DataFrame และคืนค่าเป็น NumPy array
imputed_array = imputer_model.transform(data_with_holes)

# แปลงกลับเป็น DataFrame
imputed_data = pd.DataFrame(imputed_array, columns=features, index=data_truth.index)
print("Imputation complete.\n")

# -----------------------------
# เปรียบเทียบ "ค่าที่เติม" กับ "ค่าจริง"
# -----------------------------
print("--- Imputer Performance Metrics ---")

for col in features_to_test:
    # "เฉลย"
    truth = original_values[col]

    # "ค่าที่โมเดลเติม" (เฉพาะแถวที่เคยหายไป)
    idx = missing_indices[col]
    predicted = imputed_data.loc[idx, col]

    # คำนวณ Error
    # (ต้องปัดเศษ predicted ก่อน ถ้า feature นั้นเป็น int)
    if col == 'GarageCars':
        predicted = predicted.round()

    mae = mean_absolute_error(truth, predicted)
    rmse = np.sqrt(mean_squared_error(truth, predicted))

    print(f"Feature: '{col}'")
    print(f"  MAE (Mean Absolute Error): {mae:.4f}")
    print(f"  RMSE (Root Mean Squared Error): {rmse:.4f}\n")

print("MAE :เฉลี่ยแล้ว โมเดล เดาค่าที่หายไปผิดพลาดไปเท่าไหร่")
print("โมเดลเดาจำนวนที่จอดรถผิดพลาดไปเฉลี่ยประมาณ 0.-- คัน")

Imputer Model loaded successfully.

Converting 'Neighborhood' strings to numbers...
--- Artificially creating 20.0% missing values in ['GarageCars', 'TotalBsmtSF'] ---
Created 292 missing values in 'GarageCars'
Created 292 missing values in 'TotalBsmtSF'
--------------------------------------------------

Running imputer_model.transform()...
Imputation complete.

--- Imputer Performance Metrics ---
Feature: 'GarageCars'
  MAE (Mean Absolute Error): 0.3836
  RMSE (Root Mean Squared Error): 0.6621

Feature: 'TotalBsmtSF'
  MAE (Mean Absolute Error): 237.9986
  RMSE (Root Mean Squared Error): 335.7427

--------------------------------------------------
MAE หมายถึง: โดยเฉลี่ยแล้ว โมเดล 'เดา' ค่าที่หายไปผิดพลาดไปเท่าไหร่
เช่น ถ้า MAE ของ 'GarageCars' = 0.45 หมายความว่า
โมเดลเดาจำนวนที่จอดรถผิดพลาดไปเฉลี่ยประมาณ 0.45 คัน


โหลดโมเดล
เตรียมข้อมูล
จำลองสถานการณ์ ลบข้อมูลจริงทิ้งไป 20% (จำนวน 292 แถว) จาก 2 คอลัมน์ คือ GarageCars (จำนวนที่จอดรถ) และ TotalBsmtSF (ขนาดชั้นใต้ดิน)
สั่งให้โมเดล"เดา" หรือ "เติม" ค่าในช่องว่าง 292 ช่องนั้น
วัดผล เช่น ถ้า MAE ของ 'GarageCars' = 0.45