In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [3]:
#Loading the data

df = fetch_california_housing(as_frame=True)
df = df.frame

In [4]:
df 

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


**Goal:** Predict MedHouseVal (Median House Value).

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [6]:
df.columns = df.columns.str.lower()
df.columns

Index(['medinc', 'houseage', 'averooms', 'avebedrms', 'population', 'aveoccup',
       'latitude', 'longitude', 'medhouseval'],
      dtype='object')

In [7]:
df

Unnamed: 0,medinc,houseage,averooms,avebedrms,population,aveoccup,latitude,longitude,medhouseval
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [12]:
#Testing and training split

y=df['medhouseval']
X=df.drop('medhouseval',axis=1)

X_test,X_train,y_test,y_train = train_test_split(X,y,test_size=0.2,random_state=42)

In [13]:
#Applying the RandomForestRegressor model

model = RandomForestRegressor(random_state=42)
model.fit(X_train,y_train)
predictions = model.predict(X_test)

mse = mean_squared_error(y_test,predictions)
score = r2_score(y_test, predictions)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Model R2 Score: {score:.4f}")

Mean Squared Error: 0.31
Model R2 Score: 0.7685


In [14]:
#Applying XGBPegressor model

model = XGBRegressor()
model.fit(X_train,y_train)
predictions = model.predict(X_test)

mse = mean_squared_error(y_test,predictions)
score = r2_score(y_test, predictions)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Model R2 Score: {score:.4f}")

Mean Squared Error: 0.27
Model R2 Score: 0.7969


In [26]:
#predicting a requirement

cheap_home = pd.DataFrame({
    'medinc': [2.1],       # Low Income ($21k)
    'houseage': [20.0],
    'averooms': [4.5],
    'avebedrms': [1.05],
    'population': [1200.0],
    'aveoccup': [3.1],
    'latitude': [36.5],    # Central Valley (Cheaper area)
    'longitude': [-119.5]
})


prediction = model.predict(cheap_home)

print(f"Cheap House Prediction: ${prediction[0] * 100_000:,.2f}")

custom_home = pd.DataFrame({
    'medinc': [8.3252],
    'houseage': [41.0],
    'averooms': [6.984],
    'avebedrms': [1.023],
    'population': [322.0],
    'aveoccup': [2.555],
    'latitude': [37.88],   # Near Berkeley/San Francisco
    'longitude': [-122.23]
})

prediction = model.predict(custom_home)

price_in_dollars = prediction[0] * 100_000

print(f"Custom House Price: ${price_in_dollars:,.2f}")

Cheap House Prediction: $73,298.98
Custom House Price: $449,262.62
