# California Housing Data Analysis

# Dataset Description

# Dataset Overview:

# Features:

# Target Variable:

# Relationship Between Features and Target Variable:

# Step 1: Data Import and Exploration

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing

In [2]:
# Load the California housing dataset.
# The loader returns a dict-like object exposing the feature matrix (`data`),
# the target vector (`target`), `feature_names`, `target_names` and a `DESCR` text.
data = fetch_california_housing()

In [3]:
# Inspect the raw loader result (arrays, feature/target names, description text).
# NOTE(review): this repr is long; showing only the keys or DESCR would be easier to read.
data

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [4]:
# Build a single pandas DataFrame (features + target) for easier exploration
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    Target=data.target  # target variable (house values), appended as the last column
)

In [6]:
# Preview the dataset: the first rows are enough to sanity-check the column
# names and the scale of each feature.
df.head()  # head() with no argument shows the first five rows

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [7]:
# Data types, non-null counts and memory usage for every column.
# df.info() writes its report directly to stdout and returns None, so it is
# called on its own; wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   Target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB
None


In [8]:
# Statistical summary (count, mean, std, min, quartiles, max) of every column.
# Left as the cell's last expression so Jupyter renders the DataFrame as a
# table instead of the line-wrapped plain text that print() produces.
df.describe()

             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.000000   
25%        2.563400     18.000000      4.440716      1.006079    787.000000   
50%        3.534800     29.000000      5.229129      1.048780   1166.000000   
75%        4.743250     37.000000      6.052381      1.099526   1725.000000   
max       15.000100     52.000000    141.909091     34.066667  35682.000000   

           AveOccup      Latitude     Longitude        Target  
count  20640.000000  20640.000000  20640.000000  20640.000000  
mean       3.070655     35.631861   -119.569704      2.068558  
std       10.386050      2.135952      2.003532      1.153956  
min        0.692308     32.54000

# Step 2: Data Preprocessing

In [9]:
# Count missing entries per column (a column of zeros means nothing to impute)
print(df.isna().sum())

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64


# Feature Scaling:

In [10]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Split the frame into model inputs and the value to predict
x = df.drop(columns=['Target'])  # every column except the target -> features
y = df['Target']  # house value (regression target)

In [17]:
# Sanity check: (number of rows, number of feature columns)
x.shape

(20640, 8)

In [18]:
# Sanity check: one target value per row, so the length must match x's row count
y.shape

(20640,)

In [20]:
# Scaling the features with StandardScaler so they share a comparable scale.
# The feature frame is created under the lowercase name `x`, so it must be
# referenced by that name here; `X` is never defined in this notebook and
# raises NameError on a fresh kernel (Restart & Run All).
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test-set statistics leak into the scaling; fitting on the training rows only
# would avoid that.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [21]:
# Show the scaled feature matrix (NumPy abbreviates the large array with "...")
print(x_scaled)

[[ 2.34476576  0.98214266  0.62855945 ... -0.04959654  1.05254828
  -1.32783522]
 [ 2.33223796 -0.60701891  0.32704136 ... -0.09251223  1.04318455
  -1.32284391]
 [ 1.7826994   1.85618152  1.15562047 ... -0.02584253  1.03850269
  -1.33282653]
 ...
 [-1.14259331 -0.92485123 -0.09031802 ... -0.0717345   1.77823747
  -0.8237132 ]
 [-1.05458292 -0.84539315 -0.04021111 ... -0.09122515  1.77823747
  -0.87362627]
 [-0.78012947 -1.00430931 -0.07044252 ... -0.04368215  1.75014627
  -0.83369581]]


# Step 3: Splitting the Dataset

In [22]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test).
# The scaled feature matrix is stored under the lowercase name `x_scaled`;
# `X_scaled` is never defined in this notebook and raises NameError on a fresh
# kernel. The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_scaled, y, test_size=0.2, random_state=42
)

# Step 4: Model Selection and Training

In [23]:
# We will now initialize and train multiple regression models on the training data.
# Importing Regression Models:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [24]:
# Initialize one instance of each candidate regressor, keyed by its display name.
# The tree-based estimators are randomized, so they get a fixed seed (the same
# value the train/test split uses) to make the reported scores reproducible.
RANDOM_STATE = 42

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    # NOTE(review): Lasso and ElasticNet run with their default regularization
    # strength; consider tuning alpha before comparing them with the others.
    "Lasso Regression": Lasso(),
    "ElasticNet Regression": ElasticNet(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest Regressor": RandomForestRegressor(random_state=RANDOM_STATE),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "Support Vector Regressor": SVR(),
}

In [25]:
# Fit every candidate on the training split and keep the fitted estimators
# under the same display names (scikit-learn's fit() returns the estimator itself).
trained_models = {
    name: estimator.fit(X_train, y_train)
    for name, estimator in models.items()
}

# Step 5: Model Evaluation

In [26]:
from sklearn.metrics import mean_squared_error

# Score each fitted model on the held-out test set: predict, then compute the
# mean squared error against the true targets, keyed by model name.
mse_results = {
    name: mean_squared_error(y_test, fitted.predict(X_test))
    for name, fitted in trained_models.items()
}

In [27]:
# Tabulate the test MSE of each model as (Model, MSE) rows and display the table
mse_rows = list(mse_results.items())
mse_df = pd.DataFrame(mse_rows, columns=["Model", "MSE"])
mse_df

Unnamed: 0,Model,MSE
0,Linear Regression,0.555892
1,Ridge Regression,0.555851
2,Lasso Regression,1.310696
3,ElasticNet Regression,1.042981
4,Decision Tree Regressor,0.490209
5,Random Forest Regressor,0.256284
6,Gradient Boosting Regressor,0.29408
7,Support Vector Regressor,0.355198


# Step 6: Model Interpretation

# Step 7: Conclusions and Recommendations

In [28]:
# Interpret the MSE table: the model with the smallest test MSE is reported as best.
best_idx = mse_df["MSE"].idxmin()  # row label of the lowest error, looked up once
best_model_name = mse_df.at[best_idx, "Model"]
best_model_mse = mse_df.at[best_idx, "MSE"]
print(f"The best model is {best_model_name} with MSE = {best_model_mse:.4f}")

The best model is Random Forest Regressor with MSE = 0.2563


# Data Analysis Report on California Housing Dataset

# Statistical Analysis Report on the California Housing Dataset