# Practice Project: House Price Prediction using ML Pipelines 🚀
This project will use Linear Regression to predict house prices based on multiple features. The dataset will contain both numerical and categorical variables, requiring preprocessing with:

1. Ordinal Encoding (for ordered categories like condition: Poor < Fair < Good)
2. One-Hot Encoding (for nominal categories like location)
3. SimpleImputer (to handle missing values)
4. Scaler (to standardize numerical features)
5. Pipeline (to automate preprocessing + model training)

In [28]:
import pandas as pd 
import numpy as np  

In [29]:
# Step 1: Create a Sample Dataset
data = {
    "area": [1200, 1500, 1800, np.nan, 2500, 3000, 3500],
    "bedrooms": [2, 3, 3, 2, 4, np.nan, 4],
    "age": [5, 10, np.nan, 15, 20, 25, 30],
    "condition": ["Good", "Fair", "Poor", "Good", "Fair", "Poor", "Good"],
    "location": ["New York", "Paris", "London", "Paris", "New York", "London", "Paris"],
    "price": [250000, 320000, 400000, 280000, 500000, 600000, 700000]
}

df = pd.DataFrame(data)

df.head()

Unnamed: 0,area,bedrooms,age,condition,location,price
0,1200.0,2.0,5.0,Good,New York,250000
1,1500.0,3.0,10.0,Fair,Paris,320000
2,1800.0,3.0,,Poor,London,400000
3,,2.0,15.0,Good,Paris,280000
4,2500.0,4.0,20.0,Fair,New York,500000


In [30]:
df.isnull().sum()

area         1
bedrooms     1
age          1
condition    0
location     0
price        0
dtype: int64

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['price']), df['price'], test_size=0.2, random_state=42)

X_train

Unnamed: 0,area,bedrooms,age,condition,location
5,3000.0,,25.0,Poor,London
2,1800.0,3.0,,Poor,London
4,2500.0,4.0,20.0,Fair,New York
3,,2.0,15.0,Good,Paris
6,3500.0,4.0,30.0,Good,Paris


In [32]:
y_train

5    600000
2    400000
4    500000
3    280000
6    700000
Name: price, dtype: int64

# Pre Processing
1. bedrooms,age,areas = simpleimputer
2. condition = Ordianl
3.location = ohe

In [33]:
df['price']

0    250000
1    320000
2    400000
3    280000
4    500000
5    600000
6    700000
Name: price, dtype: int64

In [34]:
df

Unnamed: 0,area,bedrooms,age,condition,location,price
0,1200.0,2.0,5.0,Good,New York,250000
1,1500.0,3.0,10.0,Fair,Paris,320000
2,1800.0,3.0,,Poor,London,400000
3,,2.0,15.0,Good,Paris,280000
4,2500.0,4.0,20.0,Fair,New York,500000
5,3000.0,,25.0,Poor,London,600000
6,3500.0,4.0,30.0,Good,Paris,700000


In [35]:
num_features = ['bedrooms', 'age', 'area']

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [37]:
preproccesor = ColumnTransformer([
    ("num_features", Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), num_features),
    ('ordinal', OrdinalEncoder(categories=[['Poor','Fair','Good']]), ['condition']),
    ('ohe', OneHotEncoder(handle_unknown='ignore'), ['location'])
])

In [38]:
pipeline = Pipeline([
    ('preproccesor', preproccesor),
    ('model', LinearRegression())
])

In [39]:
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))

In [40]:
pipeline.fit(X_train,y_train_scaled)

In [41]:
y_pred = pipeline.predict(X_test)

In [42]:
y_pred

array([[-3.4312096 ],
       [-2.23590502]])

In [43]:
y_test

0    250000
1    320000
Name: price, dtype: int64

In [44]:
# Step 8: Inverse Transform Numerical Features (So Predictions Make Sense)
scaler = pipeline.named_steps["preproccesor"].named_transformers_["num_features"].named_steps["scaler"]

# Convert scaled numerical values back to original scale
X_test_original = X_test.copy()  # Keep original X_test for comparison
X_test_original[num_features] = scaler.inverse_transform(X_test[num_features])

# Step 9: Display Predictions in an Understandable Format
predictions_df = pd.DataFrame(X_test_original)
predictions_df["Condition"] = X_test["condition"]  # Restore condition column
predictions_df["Location"] = X_test["location"]  # Restore location column
predictions_df["Actual Price"] = y_test.values
predictions_df["Predicted Price"] = y_pred

# Show results
print(predictions_df)

            area  bedrooms   age condition  location Condition  Location  \
0  677266.527483   4.73324  47.5      Good  New York      Good  New York   
1  845908.159353   5.47486  72.5      Fair     Paris      Fair     Paris   

   Actual Price  Predicted Price  
0        250000        -3.431210  
1        320000        -2.235905  
