<a href="https://colab.research.google.com/github/Anshu366651/House-price-prediction-/blob/main/house_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install pandas scikit-learn joblib matplotlib



In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import joblib

# Reproducibility settings.
# RANDOM_STATE is the seed passed to train_test_split further down.
RANDOM_STATE = 42
# Seed NumPy's global legacy generator for any code that draws from np.random
# directly. generate_synthetic uses its own RandomState and does not consume
# this global stream.
np.random.seed(0)

In [5]:
def generate_synthetic(n=120, coeff_area=0.0265, coeff_bed=6.8, loc_effects=None, noise_scale=4.5,
                       random_state=0):
    """Generate a synthetic house-price dataset from a noisy linear model.

    Every row is drawn independently and priced as::

        Price = coeff_area * Area + coeff_bed * Bedrooms
                + loc_effects[Location] + Normal(0, noise_scale)

    Parameters
    ----------
    n : int
        Number of rows to generate. ``0`` (or a negative value) yields an
        empty frame that still has the four expected columns.
    coeff_area : float
        Price contribution per unit of area.
    coeff_bed : float
        Price contribution per bedroom.
    loc_effects : dict or None
        Mapping ``location -> additive price offset``. ``None`` selects the
        built-in four-city table.
    noise_scale : float
        Standard deviation of the Gaussian noise added to each price.
    random_state : int, numpy.random.RandomState or None
        Seed (or an existing ``RandomState``) driving all random draws.
        The default ``0`` reproduces the data this function generated when
        the seed was hard-coded; ``None`` gives a non-deterministic dataset.

    Returns
    -------
    pandas.DataFrame
        Columns ``Area`` (integer drawn from [800, 2500)), ``Bedrooms``
        (1-4), ``Location`` (a key of ``loc_effects``) and ``Price``.

    Raises
    ------
    ValueError
        If rows are requested but ``loc_effects`` is empty.
    """
    if loc_effects is None:
        loc_effects = {"Chennai": 12, "Bangalore": 15, "Pune": 8, "Mumbai": 20}

    # Loop-invariant: build the list of candidate locations once.
    locations = list(loc_effects)
    if n > 0 and not locations:
        raise ValueError("loc_effects must contain at least one location")

    # Reuse a caller-supplied generator, otherwise create a private one so the
    # global np.random stream is never touched.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    rows = []
    for _ in range(n):
        # The draw order (area, bedrooms, location, noise) is deliberately kept
        # per-row: changing it would change the generated data for a given seed.
        area = int(rng.uniform(800, 2500))
        bed = int(rng.choice([1, 2, 3, 4]))
        loc = rng.choice(locations)
        price = coeff_area * area + coeff_bed * bed + loc_effects[loc] + rng.normal(0, noise_scale)
        rows.append((area, bed, loc, price))
    return pd.DataFrame(rows, columns=["Area", "Bedrooms", "Location", "Price"])

# Create the dataset with the generator's default settings (Price is in Lakhs).
df = generate_synthetic()
# Sanity check: preview the first rows and confirm the (rows, columns) shape.
print("Sample rows:\n", df.head())
print("Shape:", df.shape)


Sample rows:
    Area  Bedrooms   Location      Price
0  1732         2    Chennai  75.902321
1  1520         2       Pune  71.964019
2  1543         3    Chennai  72.825015
3  2145         4       Pune  93.890193
4  1765         2  Bangalore  77.369885
Shape: (120, 4)


In [6]:
# Separate the frame into the feature matrix X (every column except the target)
# and the target vector y (the Price column).
TARGET = "Price"
X = df.drop(columns=[TARGET])
y = df[TARGET]

In [8]:
# Each column is routed to a preprocessing branch according to its type.
numeric_features = ["Area", "Bedrooms"]
categorical_features = ["Location"]

# Numeric branch: fill missing values with the column median.
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown="ignore" keeps the pipeline robust when a
# location that was not seen during fit shows up at predict time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each branch to its own columns and concatenate the results.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE
)

# Report the (rows, columns) shape of each feature split.
print("Train:", X_train.shape, "Test:", X_test.shape)


Train: (96, 3) Test: (24, 3)


In [10]:
# Chain preprocessing and the regressor into a single estimator so the same
# transformations are applied at fit time and at predict time.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

# Fit on the training split only; the test split is reserved for evaluation.
model.fit(X_train, y_train)


In [11]:
# Score the fitted pipeline on the held-out test split.
y_pred_test = model.predict(X_test)

# R^2 is only informative with at least two test samples; otherwise report NaN.
r2 = r2_score(y_test, y_pred_test) if len(y_test) >= 2 else float("nan")

# RMSE: square root of the mean squared error, in the same unit as Price.
rmse = mean_squared_error(y_test, y_pred_test) ** 0.5

print("Model Performance on Test set:")
print(f"R^2 Score: {r2:.2f}")
print(f"RMSE    : {rmse:.2f}")


Model Performance on Test set:
R^2 Score: 0.88
RMSE    : 5.49


In [12]:
# Single source of truth for the example house: the values echoed below are
# read from this dict, so the printed input cannot drift from what the model
# actually receives.
sample_input = {"Area": 1300, "Bedrooms": 3, "Location": "Chennai"}
sample = pd.DataFrame([sample_input])

pred_price = model.predict(sample)[0]  # predicted price in Lakhs

print("\nFor input:")
print(
    f"Area : {sample_input['Area']}, "
    f"Bedrooms: {sample_input['Bedrooms']}, "
    f"Location: {sample_input['Location']}"
)
print(f"Predicted House Price: {pred_price:.2f} Lakh")
# Note: R^2 is the share of target variance explained on the test split, not a
# classification-style accuracy. The label text is kept as-is so the printed
# output stays unchanged.
print(f"Model Accuracy (R^2 Score): {r2:.2f}")



For input:
Area : 1300, Bedrooms: 3, Location: Chennai
Predicted House Price: 67.53 Lakh
Model Accuracy (R^2 Score): 0.88


In [13]:
# Persist the fitted pipeline (preprocessing + regressor) to disk with joblib.
# The path is relative, so the file lands in the current working directory.
MODEL_PATH = "house_price_model.joblib"
joblib.dump(model, MODEL_PATH)
print("Saved model to", MODEL_PATH)


Saved model to house_price_model.joblib


In [14]:
# Round-trip check: reload the saved pipeline and predict for an example house.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
loaded = joblib.load(MODEL_PATH)
demo_sample = {"Area": 1300, "Bedrooms": 3, "Location": "Chennai"}
demo_df = pd.DataFrame([demo_sample])
# float(...) turns the NumPy scalar into a plain Python float for printing.
print("Loaded model predicts:", float(loaded.predict(demo_df)[0]), "Lakh")


Loaded model predicts: 67.5343395347423 Lakh
