### HOUSE PRICE PREDICTION MODEL

Installing Libraries

In [None]:
pip install pandas numpy matplotlib seaborn

Importing Pandas Library and reading the csv file

In [None]:
import pandas as pd

# Read the training data into a DataFrame; the path is relative to the
# notebook's working directory.
TRAIN_CSV = "train.csv"
df = pd.read_csv(TRAIN_CSV)

Studying basic info of the csv file

In [None]:
# Dimensions of the frame and a preview of its first rows.
print(df.shape)
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

Counting missing (null) values for each feature

In [None]:
# Count nulls per column, then keep only the columns that actually have gaps,
# ordered from the most to the fewest missing entries.
null_counts = df.isna().sum()
missing = null_counts[null_counts > 0].sort_values(ascending=False)
print(missing)


Summary statistics

In [None]:
# Numerical summary of the whole frame
numeric_summary = df.describe()
print(numeric_summary)

# Same summary restricted to the target variable
target_summary = df["SalePrice"].describe()
print(target_summary)

Importing visualization libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

Plotting Graphs

In [None]:
# Distribution of the target, with a kernel density estimate drawn on top
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["SalePrice"], kde=True, ax=ax)
ax.set_title("Distribution of SalePrice")
plt.show()

# Correlation heatmap (numerical features only)
numeric_corr = df.select_dtypes(include="number").corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_corr, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# Example: how SalePrice is spread within each OverallQual level
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x="OverallQual", y="SalePrice", data=df, ax=ax)
ax.set_title("Overall Quality vs SalePrice")
plt.show()

Preprocessing: Filling Missing Values, Encoding Categoricals and Scaling Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Target (kept as raw, unscaled values) and the feature matrix without it
y = df["SalePrice"]
X = df.drop(columns=["SalePrice"])

# Column groups by dtype: numeric vs. string/object (treated as categorical)
num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object']).columns

# Impute gaps: median for numeric columns, most frequent value for categoricals.
# NOTE(review): these statistics (and the scaler below) are computed on every
# row, before the train/validation split performed later in the notebook, so
# validation rows influence the preprocessing - confirm this is acceptable.
fill_values = {col: X[col].median() for col in num_features}
fill_values.update({col: X[col].mode()[0] for col in cat_features})
X = X.fillna(fill_values)

# One-hot encode categoricals, dropping the first level of each
X = pd.get_dummies(X, columns=cat_features, drop_first=True)

# Standardise the numeric columns only (the dummy columns and y are untouched)
scaler = StandardScaler()
X[num_features] = scaler.fit_transform(X[num_features])

# Show that the target still holds its original values
print(y.head())
print(y.describe())

Encoding Categorical Features

In [None]:
# One-hot encoding for categorical columns.
# The result is bound to a new name instead of overwriting `df`: replacing `df`
# would remove the columns listed in `cat_features`, so running this cell a
# second time would raise a KeyError, and the raw frame would be lost.
df_encoded = pd.get_dummies(df, columns=cat_features, drop_first=True)
print(df_encoded.shape)

Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns on a copy, using a separately named scaler.
# Rebinding `scaler` here would discard the scaler already fitted on `X` (the
# features the models are trained on), and writing into `df` would alter the
# frame that earlier cells displayed.
df_scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[num_features] = df_scaler.fit_transform(df[num_features])


Feature and Target Selections

In [None]:
# Dimensions of the feature matrix and of the target, printed side by side
feature_shape, target_shape = X.shape, y.shape
print(feature_shape, target_shape)

Train/Validation Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# the same from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Fit ordinary least squares on the training split (fit() returns the estimator)
lr = LinearRegression().fit(X_train, y_train)

# Predict the held-out validation rows
y_pred_lr = lr.predict(X_val)

# RMSE = square root of the mean squared error on the validation split
mse_lr = mean_squared_error(y_val, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a single regression tree; the seed fixes its random choices
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out validation rows
y_pred_dt = dt.predict(X_val)

# RMSE on the validation split
mse_dt = mean_squared_error(y_val, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)

Comparing RMSE Values of Linear Regression and Decision Tree

In [None]:
# Print the validation RMSE of both baseline models, one per line
for label, rmse in (
    ("Linear Regression RMSE:", rmse_lr),
    ("Decision Tree RMSE:", rmse_dt),
):
    print(label, rmse)

### Ensemble Models

Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 100-tree forest with a fixed seed, trained on the training split
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Validation predictions and their RMSE
y_pred_rf = rf.predict(X_val)
mse_rf = mean_squared_error(y_val, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
print("Random Forest RMSE:", rmse_rf)

XGBoost Regressor

In [None]:
pip install xgboost

In [None]:
from xgboost import XGBRegressor

# Boosting configuration used for the first XGBoost fit
xgb_params = dict(n_estimators=500, learning_rate=0.05, max_depth=4, random_state=42)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

# Validation predictions and their RMSE
y_pred_xgb = xgb.predict(X_val)
mse_xgb = mean_squared_error(y_val, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
print("XGBoost RMSE:", rmse_xgb)

Comparison of All Models

In [None]:
# Print the validation RMSE of every model trained so far, one per line
for label, rmse in (
    ("Linear Regression RMSE:", rmse_lr),
    ("Decision Tree RMSE:", rmse_dt),
    ("Random Forest RMSE:", rmse_rf),
    ("XGBoost RMSE:", rmse_xgb),
):
    print(label, rmse)

Feature Importance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Random Forest feature importance, labelled with the feature column names
feat_importances = pd.Series(data=rf.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(n=20)

# Horizontal bars: importance on x, feature name on y
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_features, y=top_features.index, ax=ax)
ax.set_title("Top 20 Important Features (Random Forest)")
plt.show()


### Cross Validation And Hyperparameter Tuning

CV For Random Forest

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Fresh (unfitted) forest to be evaluated with 5-fold cross-validation
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# The scorer yields negated MSE per fold; flip the sign before taking the root
scores = cross_val_score(rf, X, y, cv=5, scoring="neg_mean_squared_error")
rmse_scores = np.sqrt(-scores)

# Per-fold values, then their mean and spread
for label, value in (
    ("Cross-validation RMSE scores:", rmse_scores),
    ("Mean RMSE:", rmse_scores.mean()),
    ("Std Dev:", rmse_scores.std()),
):
    print(label, value)

Hyperparameter Tuning For RandomForest

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the random search samples from
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf = RandomForestRegressor(random_state=42)

# 10 randomly drawn configurations, each scored with 3-fold CV on negated MSE
search_options = {
    "param_distributions": param_grid,
    "n_iter": 10,
    "cv": 3,
    "scoring": "neg_mean_squared_error",
    "n_jobs": -1,
    "random_state": 42,
}
search = RandomizedSearchCV(rf, **search_options)

search.fit(X, y)

# best_score_ is a negated MSE, so negate it again before taking the root
best_rmse = np.sqrt(-search.best_score_)
print("Best parameters:", search.best_params_)
print("Best RMSE:", best_rmse)

Hyperparameter Tuning For XGB

In [None]:
# Candidate values the random search samples from
param_grid_xgb = dict(
    n_estimators=[300, 500, 800],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.7, 0.8, 1.0],
)

xgb = XGBRegressor(random_state=42)

# 10 randomly drawn configurations, each scored with 3-fold CV on negated MSE
search_options_xgb = {
    "param_distributions": param_grid_xgb,
    "n_iter": 10,
    "cv": 3,
    "scoring": "neg_mean_squared_error",
    "n_jobs": -1,
    "random_state": 42,
}
search_xgb = RandomizedSearchCV(xgb, **search_options_xgb)

search_xgb.fit(X, y)

# best_score_ is a negated MSE, so negate it again before taking the root
best_rmse_xgb = np.sqrt(-search_xgb.best_score_)
print("Best parameters (XGB):", search_xgb.best_params_)
print("Best RMSE (XGB):", best_rmse_xgb)

Model Leaderboard

In [None]:
import pandas as pd

# Validation-split RMSE of each model, listed in the same order as the names
model_names = ["Linear Regression", "Decision Tree", "Random Forest", "XGBoost"]
holdout_rmse = [rmse_lr, rmse_dt, rmse_rf, rmse_xgb]
results = {"Model": model_names, "RMSE": holdout_rmse}

# Lowest RMSE first
df_results = pd.DataFrame(results).sort_values(by="RMSE")
print(df_results)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

# Unfitted candidate estimators keyed by display name; insertion order is the
# order in which they are evaluated.
models = {}
models["Linear Regression"] = LinearRegression()
models["Decision Tree"] = DecisionTreeRegressor(random_state=42)
models["Random Forest"] = RandomForestRegressor(n_estimators=200, random_state=42)
models["XGBoost"] = XGBRegressor(
    n_estimators=500, learning_rate=0.05, max_depth=4, random_state=42
)

In [None]:
# Mean 5-fold cross-validated RMSE per candidate model
results = {}
for name in models:
    scores = cross_val_score(
        models[name], X, y, cv=5, scoring="neg_mean_squared_error", n_jobs=-1
    )
    rmse_scores = np.sqrt(-scores)  # scorer returns negated MSE
    results[name] = rmse_scores.mean()

# Sort results: rebuild the dict with the lowest mean RMSE first
ranked_names = sorted(results, key=results.get)
results = {name: results[name] for name in ranked_names}

print("Model Leaderboard (lower RMSE is better):")
for name in results:
    print(f"{name}: {results[name]:.2f}")

In [None]:
# `results` was sorted by ascending RMSE in the leaderboard cell, so its first
# key names the best-scoring model.
best_model_name = list(results)[0]
best_model = models[best_model_name]

# Train best model on full dataset
best_model.fit(X, y)

print(f"✅ Best model selected: {best_model_name}")

In [None]:
import joblib

# Serialise the fitted best model to disk, next to the notebook
model_path = "best_model.pkl"
joblib.dump(best_model, model_path)
print("Model saved as best_model.pkl")

In [None]:
# Example of loading and using the model later.
# NOTE: joblib.load unpickles the file; only load model files you created or trust.
loaded_model = joblib.load("best_model.pkl")

sample = X.iloc[[0]]  # double brackets keep the first row as a one-row DataFrame
# Predict with the reloaded model (not the in-memory `best_model`) so this cell
# actually exercises the file that was saved.
print("Prediction:", loaded_model.predict(sample))