In [386]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

In [387]:
# Load the housing dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("housing.csv")

In [388]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [389]:
# Column dtypes and non-null counts; in the output below only total_bedrooms has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
 9   median_house_value  20640 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [390]:
# Separate the prediction target from the feature columns.
y = df["median_house_value"]
X = df.drop(columns="median_house_value")

In [391]:
# Display the feature matrix (every column except the target).
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,INLAND
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,INLAND
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,INLAND
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,INLAND


In [392]:
# Display the target series (median_house_value).
y

0        452600
1        358500
2        352100
3        341300
4        342200
          ...  
20635     78100
20636     77100
20637     92300
20638     84700
20639     89400
Name: median_house_value, Length: 20640, dtype: int64

In [393]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [394]:
# Column groups routed to the numeric and categorical preprocessing branches.
# `include="number"` picks up every numeric dtype rather than only int64/float64,
# and the categorical columns are excluded via `cat_features` so the column name
# is spelled out in exactly one place.
cat_features = ["ocean_proximity"]
num_features = [
    col
    for col in X.select_dtypes(include="number").columns
    if col not in cat_features
]

In [395]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [396]:
# Categorical branch: one-hot encode; categories unseen during fit are ignored at predict time.
categorical_pipeline = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [397]:
# Route each column group through its own preprocessing branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_features),
        ("cat", categorical_pipeline, cat_features),
    ]
)

In [398]:
# Full model pipeline: preprocessing followed by a random forest regressor
# (100 trees, fixed seed for reproducible fits).
reg_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

In [399]:
# Fit the preprocessing steps and the regressor on the training split only.
reg_pipeline.fit(X_train, y_train)

In [400]:
# Predict on the held-out test split.
y_pred = reg_pipeline.predict(X_test)

In [401]:
# Hold-out evaluation metrics; RMSE is derived from MSE so it is in the target's units.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [402]:
# Report the hold-out metrics, one per line.
for metric_label, metric_value in (
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("Mean Absolute Error:", mae),
    ("R² Score:", r2),
):
    print(metric_label, metric_value)

Mean Squared Error: 2395290032.473151
Root Mean Squared Error: 48941.70034309343
Mean Absolute Error: 31628.40731104651
R² Score: 0.8172104989933294


In [403]:
# GUI

In [404]:
#import tkinter to build GUI
import tkinter as tk
from tkinter import ttk, messagebox

In [405]:
# Distinct ocean_proximity categories (in order of first appearance); these populate the GUI dropdown.
cat_options = df['ocean_proximity'].unique().tolist()

In [427]:
# Default values for inputs.
# The keys double as the on-screen field labels and as the lookup keys used by
# predict() and clear_fields(); the values pre-fill the numeric entry widgets.
default_values = {
    "Longitude": -122.23,
    "Latitude": 37.88,
    "Housing Median Age": 30,
    "Total Rooms": 1500,
    "Total Bedrooms": 300,
    "Population": 800,
    "Households": 300,
    "Median Income": 3.5
}

In [429]:
# Input validation
def is_float(value):
    """Return True if `value` parses as a finite floating-point number.

    Parameters
    ----------
    value : object
        Typically the raw text of an entry widget.

    Returns
    -------
    bool
        False for text that `float()` rejects, for values of an unsupported
        type such as None, and for inputs that parse to NaN or +/-infinity
        ("nan", "inf", or overflowing literals such as "1e999"), because those
        are not usable numeric feature values.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric objects such as None.
        return False
    # float() happily parses "nan" / "inf"; reject them explicitly.
    return bool(np.isfinite(number))

In [431]:
# Predict function
def predict():
    """Validate the form, run the fitted pipeline on it, and show the estimate.

    Reads the numeric entry widgets in `entries` and the selected
    `ocean_proximity_var` value, builds a one-row DataFrame keyed by the
    dataset's column names, and writes the formatted prediction into
    `result_label`. A non-numeric field aborts with an "Invalid Input" dialog;
    any exception raised while building the row or predicting is reported in a
    "Prediction Error" dialog.
    """
    # Stop at the first field whose text does not parse as a number.
    for field_name, widget in entries.items():
        if is_float(widget.get()):
            continue
        messagebox.showerror("Invalid Input", f"'{field_name}' must be a numeric value.")
        return

    # (form label, dataset column) pairs, in the order the row is assembled.
    label_to_column = (
        ("Longitude", 'longitude'),
        ("Latitude", 'latitude'),
        ("Housing Median Age", 'housing_median_age'),
        ("Total Rooms", 'total_rooms'),
        ("Total Bedrooms", 'total_bedrooms'),
        ("Population", 'population'),
        ("Households", 'households'),
        ("Median Income", 'median_income'),
    )

    try:
        row = {
            column: [float(entries[field_label].get())]
            for field_label, column in label_to_column
        }
        row['ocean_proximity'] = [ocean_proximity_var.get()]

        estimate = reg_pipeline.predict(pd.DataFrame(row))[0]
        result_label.config(text=f"✅ Predicted Median House Value: ${estimate:,.2f}")

    except Exception as err:
        messagebox.showerror("Prediction Error", str(err))


In [433]:
# Clear fields
def clear_fields():
    """Reset the form to its initial state.

    Each entry widget is emptied and refilled with its value from
    `default_values`, the dropdown goes back to the first category, and any
    previously shown prediction text is removed.
    """
    for field_name in entries:
        widget = entries[field_name]
        widget.delete(0, tk.END)
        widget.insert(0, str(default_values[field_name]))

    ocean_proximity_var.set(cat_options[0])
    result_label.config(text="")


In [435]:
# Show help
def show_help():
    """Open an information dialog describing each input field.

    The wording follows the form's field labels and describes every feature at
    block level, matching the "In the block (not just one house)" note; Median
    Income is described as a median rather than an average.
    """
    help_text = (
        "💡 Input Help:\n"
        "- Longitude / Latitude: Coordinates of the block\n"
        "- Housing Median Age: Median age of homes in the block\n"
        "- Total Rooms / Bedrooms: In the block (not just one house)\n"
        "- Population: Number of people in the block\n"
        "- Households: Number of households in the block\n"
        "- Median Income: Median income in the area (in tens of thousands)\n"
        "- Ocean Proximity: How close the block is to the ocean"
    )
    messagebox.showinfo("Feature Help", help_text)

In [437]:
# GUI setup: main window with its title, initial size and background colour.
root = tk.Tk()
root.title("🏠 House Price Predictor")
root.geometry("450x550")
root.config(background="#f0f8ff")

In [439]:
# Style: shared fonts/colours for the ttk widgets used in the form.
style = ttk.Style()
style.configure("TLabel", background="#f0f8ff", font=("Arial", 10))
style.configure("TButton", font=("Arial", 10, "bold"))
# NOTE(review): ttk.Entry normally takes its font from the widget's own `font`
# option rather than from the style — confirm this setting has a visible effect.
style.configure("TEntry", font=("Arial", 10))
# ttk.OptionMenu is a ttk.Menubutton subclass, so it is styled through
# "TMenubutton"; "TOptionMenu" is not a style name any widget looks up.
style.configure("TMenubutton", font=("Arial", 10))

In [441]:
# Input fields: `entries` maps each field label to its Entry widget (filled in below);
# `labels` fixes the top-to-bottom order of the rows.
labels = list(default_values)
entries = dict()

In [443]:
# Build one "label + entry" row per numeric feature, pre-filled with its default value.
cell_padding = {"padx": 10, "pady": 5}
for i, label_text in enumerate(labels):
    label = ttk.Label(root, text=label_text)
    label.grid(row=i, column=0, sticky='e', **cell_padding)

    entry = ttk.Entry(root)
    entry.insert(0, str(default_values[label_text]))
    entry.grid(row=i, column=1, **cell_padding)

    entries[label_text] = entry

In [445]:
# Ocean proximity dropdown, placed on the row directly below the numeric fields.
dropdown_row = len(labels)
ocean_proximity_var = tk.StringVar(value=cat_options[0])

label = ttk.Label(root, text="Ocean Proximity")
label.grid(row=dropdown_row, column=0, padx=10, pady=5, sticky='e')

dropdown = ttk.OptionMenu(root, ocean_proximity_var, cat_options[0], *cat_options)
dropdown.grid(row=dropdown_row, column=1, padx=10, pady=5)

In [447]:
# Buttons and result label, stacked on the rows below the dropdown.
# Widgets are created in the same order as before so keyboard traversal is unchanged.
first_action_row = len(labels) + 1

predict_button = ttk.Button(root, text="Predict", command=predict)
predict_button.grid(row=first_action_row, column=0, columnspan=2, pady=10)

clear_button = ttk.Button(root, text="Clear", command=clear_fields)
clear_button.grid(row=first_action_row + 1, column=0, columnspan=2, pady=5)

# The help button sits to the right of the first input row.
help_button = ttk.Button(root, text="?", width=3, command=show_help)
help_button.grid(row=0, column=2, padx=5)

# Result label: empty until predict() writes the estimate into it.
result_label = ttk.Label(root, text="", font=("Arial", 12, "bold"), foreground="green")
result_label.grid(row=first_action_row + 2, column=0, columnspan=2, pady=15)

In [449]:
# Start the Tk event loop; this call blocks the cell until the window is closed.
root.mainloop()