# California House Price Prediction

## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the dataset

In [None]:
# Read the Kaggle California housing CSV into a DataFrame.
housing_csv = '/kaggle/input/california-housing-prices/housing.csv'
df = pd.read_csv(filepath_or_buffer=housing_csv)

## Display dataset info and first few rows

In [None]:
# Summarise the DataFrame (column dtypes and non-null counts), then show the
# first rows.
print("Dataset Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.head())

## Check for missing values

In [None]:
# Count the missing (NaN) entries in every column.
missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

## Handling the missing values

In [None]:
# Fill the gaps in total_bedrooms with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the in-place form is
# chained assignment, which pandas 2.x warns about and which no longer
# updates `df` once Copy-on-Write is the default.
# NOTE(review): the mean is computed over all rows, i.e. before the
# train/test split, so test rows influence the fill value.
bedrooms_mean = df['total_bedrooms'].mean()
df['total_bedrooms'] = df['total_bedrooms'].fillna(bedrooms_mean)

## Inspect the `ocean_proximity` categories to encode

In [None]:
# Frequency of each ocean_proximity category (shown as the cell output).
proximity = df['ocean_proximity']
proximity.value_counts()

## One-hot encoding step

In [None]:
# One indicator column per ocean_proximity category, stored as integers.
dummy_data = pd.get_dummies(df['ocean_proximity'], dtype=int)
print(dummy_data)

## Concatenate the new data

In [None]:
# Append the indicator columns to the right of the existing columns.
frames = [df, dummy_data]
df = pd.concat(frames, axis='columns')
df

In [None]:
# The categorical source column is redundant once it has been encoded.
df = df.drop(columns=['ocean_proximity'])
df

## Define features (X) and target (y)

In [None]:
# Separate the target column from the remaining columns, both as NumPy arrays.
target_column = "median_house_value"
y = df[target_column].to_numpy()
x = df.drop(columns=[target_column]).to_numpy()

## Split data into training and test sets

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Standardizing the features

In [None]:
# Fit the scaler on the training rows only, then apply the same fitted
# transformation to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

## Train Linear Regression

In [None]:
# Fit a linear regression on the training split and score it on the test split.
LR = LinearRegression().fit(x_train, y_train)
y_pred = LR.predict(x_test)

# Error metrics on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mean_abs_error = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nLinear Regression Model:")
for label, value in (
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R2 Score:", r2),
    ("Mean Absolute Error:", mean_abs_error),
):
    print(label, value)

## Train KNN Regressor

In [None]:
# Fit a k-nearest-neighbours regressor (library defaults) on the training
# split and score it on the test split.
KNN = KNeighborsRegressor().fit(x_train, y_train)
y_pred_knn = KNN.predict(x_test)

# Error metrics on the held-out rows.
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)
mean_absolute_error_knn = mean_absolute_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

print("\nK-Nearest Neighbors Model:")
for label, value in (
    ("Mean Squared Error:", mse_knn),
    ("Root Mean Squared Error:", rmse_knn),
    ("R2 Score:", r2_knn),
    ("Mean Absolute Error:", mean_absolute_error_knn),
):
    print(label, value)

## Visualization

In [None]:
# Draw both panels on one figure inside a single cell. With Jupyter's inline
# backend a figure is rendered and closed when its cell finishes, so creating
# the figure in one cell and the subplots in later cells produces an empty
# figure followed by two separate default-sized plots instead of the intended
# side-by-side comparison.
fig, (ax_lr, ax_knn) = plt.subplots(1, 2, figsize=(12, 6))

# End points of the dashed "perfect prediction" diagonal (predicted == actual).
price_range = [y_test.min(), y_test.max()]

# Linear Regression predictions
sns.scatterplot(x=y_test, y=y_pred, color="blue", alpha=0.6, ax=ax_lr)
ax_lr.plot(price_range, price_range, color='red', linestyle='--')
ax_lr.set_title("Linear Regression Predictions")
ax_lr.set_xlabel("Actual Prices")
ax_lr.set_ylabel("Predicted Prices")

# KNN Regression predictions
sns.scatterplot(x=y_test, y=y_pred_knn, color="green", alpha=0.6, ax=ax_knn)
ax_knn.plot(price_range, price_range, color='red', linestyle='--')
ax_knn.set_title("KNN Regression Predictions")
ax_knn.set_xlabel("Actual Prices")
ax_knn.set_ylabel("Predicted Prices")

fig.tight_layout()
plt.show()