## Part 1: Data Loading and Exploration

In [2]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

#Part 1: Data Loading and Exploration
# Load the housing dataset
housing = fetch_california_housing()


X = pd.DataFrame(housing.data, columns=housing.feature_names) 
y = pd.Series(housing.target, name='med_house_value')

# Display the first 5 rows of the dataset
print(X.head(5))

# Check for missing values
print(X.isnull().sum())

# Display Summary Statistics
print(X.describe())



   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64
             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122

## Part 2: Linear Regression on Unscaled Data

In [3]:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

y_pred = lin_reg.predict(X_test)
y_pred



array([0.71912284, 1.76401657, 2.70965883, ..., 4.46877017, 1.18751119,
       2.00940251])

In [4]:
# Evaluate Model Performance
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score

mse_lin = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2_lin = r2_score(y_test, y_pred)

print("Data Model:")
print(f"Mean Squared Error: {mse_lin:.2f}")
print(f"Root Squared Error: {rmse:.2f}")
print(f"R² Score: {r2_lin:.2f}")

# View our model's coefficients
coef_series = pd.Series(lin_reg.coef_, index=  X.columns)
print(f'Model Coefficients: \n{coef_series}')

Data Model:
Mean Squared Error: 0.56
Root Squared Error: 0.75
R² Score: 0.58
Model Coefficients: 
MedInc        0.448675
HouseAge      0.009724
AveRooms     -0.123323
AveBedrms     0.783145
Population   -0.000002
AveOccup     -0.003526
Latitude     -0.419792
Longitude    -0.433708
dtype: float64


### Interpretation Questions
- The R² score tells us that thee proportion of the regression model that can be explained by the predictor variables is 0.58 

- The feature that seems to have the stronger impact on the prediction is the Average Bedrooms ('AveBedrms'). If you look at the coefficients, that is what each feautre, or X variable, will be multiplied by. So, whichever coefficient with the greatest magnitude will have the strongest impact

- The predicted values on average deviated from the observed values by .75 . Since the range of the feature variables vary by each label, a RMSE of .75 is high for some of the labels while for others it represents a smaller error. 

## Part 4: Feature Selection and Simplified Model

In [None]:
# This simplified model will only include MedInc, AveRooms, and AveBedrms
# I believe these three features are very important aspects of information surrounding housing prices or just understanding the socio-economics of certain areas.
simple_model = X.loc[:, ['MedInc', 'AveRooms', 'AveBedrms']]
print(simple_model)

X_simple = simple_model

X_train_simple, X_test_simple, y_train_simple, y_test_simple = train_test_split(X_simple, y, test_size= 0.2, random_state=42)
lin_reg_simple = LinearRegression()
lin_reg_simple.fit(X_train_simple, y_train_simple)

y_pred_simple = lin_reg_simple.predict(X_test_simple)
y_pred_simple

       MedInc  AveRooms  AveBedrms
0      8.3252  6.984127   1.023810
1      8.3014  6.238137   0.971880
2      7.2574  8.288136   1.073446
3      5.6431  5.817352   1.073059
4      3.8462  6.281853   1.081081
...       ...       ...        ...
20635  1.5603  5.045455   1.133333
20636  2.5568  6.114035   1.315789
20637  1.7000  5.205543   1.120092
20638  1.8672  5.329513   1.171920
20639  2.3886  5.254717   1.162264

[20640 rows x 3 columns]


array([1.09794478, 1.55080224, 2.31795302, ..., 4.35435412, 1.38076318,
       2.17579031])

In [49]:
# Evaluate Model Performance
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score

mse_lin_s = mean_squared_error(y_test_simple, y_pred_simple)
rmse_s = root_mean_squared_error(y_test_simple, y_pred_simple)
r2_lin_s = r2_score(y_test_simple, y_pred_simple)


print(f"Mean Squared Error: Simplified Model:{mse_lin_s:.2f} \nFull Model:{mse_lin:.2f} ")
print(f"\nRoot Squared Error: Simplified Model:{rmse_s:.2f} \nFull Model:{rmse:.2f} ")
print(f"\nR² Score: Simplified Model:{r2_lin_s:.2f} \nFull Model:{r2_lin:.2f} ")



Mean Squared Error: Simplified Model:0.68 
Full Model:0.56 

Root Squared Error: Simplified Model:0.82 
Full Model:0.75 

R² Score: Simplified Model:0.48 
Full Model:0.58 


### Comparison Between Simplified and Full Model
### How does the simplified model compare to the full model
- The full model seems to have smaller errors between the predictions and actual y values
- The full model seems to have a higher porportion of the regression model be explained by the predictor variables
### Would you use this simplified model in practice
-  I would use the simplified model in practice because it can be used to understand datasets that may be very complex and to find what exact features contribute to the accuracy of the model

## Part 3: Scaled Data

In [14]:
from sklearn.preprocessing import StandardScaler

# Intialize the scaler and apply it to the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns = X.columns)

#Split scaled data
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled, y, test_size = 0.2, random_state = 42)

#Fit the scaled data
lin_reg_scaled = LinearRegression()
lin_reg_scaled.fit(X_train_scaled, y_train_scaled)

#Make predictions 
y_pred_scaled = lin_reg_scaled.predict(X_test_scaled)
y_pred_scaled

# Evaluate Model Performance
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score
mse_lin_s = mean_squared_error(y_test_scaled, y_pred_scaled)
rmse_s = root_mean_squared_error(y_test_scaled, y_pred_scaled)
r2_lin_s = r2_score(y_test_scaled, y_pred_scaled)

print("Scaled Data Model:")
print(f"Mean Squared Error: {mse_lin_s:.2f}")
print(f"Root Squared Error: {rmse_s:.2f}")
print(f"R² Score: {r2_lin_s:.2f}")

coef_series = pd.Series(lin_reg_scaled.coef_, index=  X.columns)
print(f'Model Coefficients: \n{coef_series}')

Scaled Data Model:
Mean Squared Error: 0.56
Root Squared Error: 0.75
R² Score: 0.58
Model Coefficients: 
MedInc        0.852382
HouseAge      0.122382
AveRooms     -0.305116
AveBedrms     0.371132
Population   -0.002298
AveOccup     -0.036624
Latitude     -0.896635
Longitude    -0.868927
dtype: float64
