## Define Constants

In [None]:
import os
import random

# Single seed value; it is also passed as `random_state` to train_test_split
# further down so the train/test partition is reproducible.
random_seed = 32
# Seeds Python's built-in `random` module only.
random.seed(random_seed)

## Load Data

In [16]:
import pandas as pd
import os

# List the entries under /content.
# NOTE(review): the returned list is neither assigned nor printed, and this is
# not the last expression of the cell, so nothing is displayed for it.
os.listdir('/content')
# Show the current working directory.
print(os.getcwd())
# NOTE(review): the CSV load is commented out, so `housing` is not defined by
# any visible cell; the cells below that use `housing` need it loaded first.
# TODO confirm the correct path to housing.csv before re-enabling this line.
#housing = pd.read_csv("/housing/housing.csv")

/content


## Analyzing the Data

In [None]:
# Preview the first rows (assumes `housing` is a pandas DataFrame -- TODO confirm).
housing.head()

In [None]:
# Column names, dtypes and non-null counts.
housing.info()

In [None]:
# Number of missing values per column.
housing.isna().sum()

In [None]:
# Distinct values found in the "ocean_proximity" column.
set(housing["ocean_proximity"].values)

## Define Target and Explanatory Variables

In [None]:
import numpy as np

# Column the models will predict.
target_variable = "median_house_value"

# Every other column of `housing` is used as a model input: start from the
# full list of column names and drop the target from it.
explanatory_variables = housing.columns.tolist()
explanatory_variables.remove(target_variable)

## Define Training and Test Sets

In [None]:
# Feature frame (all explanatory columns) and target series.
X, y = housing[explanatory_variables], housing[target_variable]

# One shape per line: features first, then target.
print(X.shape, y.shape, sep="\n")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
)

# One shape per line, in the order: X_train, y_train, X_test, y_test.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

## Define Preprocessing Pipeline

#### Define Numerical and Categorical Variables

In [None]:
# Columns that get one-hot encoded.
categorical_variables = ["ocean_proximity"]

# Everything else among the explanatory columns is treated as numeric;
# the original column order is preserved.
numerical_variables = [
    column
    for column in explanatory_variables
    if column not in categorical_variables
]

# One list per line: categorical first, then numerical.
print(categorical_variables, numerical_variables, sep="\n")

#### Define Custom Transformer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class MyTransformer(BaseEstimator, TransformerMixin):
    """Append per-household ratio features to a numeric feature matrix.

    Three columns are appended to the right of the input, in this order:
    rooms per household, bedrooms per household, population per household.

    The ``*_IDX`` class attributes are positional column indices into the
    matrix given to :meth:`transform`.
    NOTE(review): they presumably follow the column order of
    ``numerical_variables`` -- confirm whenever that order changes.
    """

    HOUSEHOLDS_IDX = 6
    TOTAL_ROOMS_IDX = 3
    TOTAL_BEDROOMS_IDX = 4
    POPULATION_IDX = 5

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return ``X`` with the three ratio columns appended.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric matrix that has columns at the ``*_IDX`` positions.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features + 3)
            The input columns followed by the three ratio columns.

        Notes
        -----
        Rows whose households value is 0 yield ``inf``/``nan`` ratios
        (plain NumPy division); no special handling is applied.
        """
        # Coerce to an ndarray so positional slicing also works when the
        # input arrives as a DataFrame; an ndarray passes through untouched.
        X = np.asarray(X)

        households = X[:, self.HOUSEHOLDS_IDX]
        rooms_per_household = X[:, self.TOTAL_ROOMS_IDX] / households
        bedrooms_per_household = X[:, self.TOTAL_BEDROOMS_IDX] / households
        population_per_household = X[:, self.POPULATION_IDX] / households

        # column_stack turns each 1-D ratio vector into a column and appends
        # it after the original columns.
        return np.column_stack((X,
                                rooms_per_household,
                                bedrooms_per_household,
                                population_per_household))

#### Define Numerical Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing, applied in order:
#   1. fill missing values with the column mean,
#   2. append the per-household ratio features,
#   3. standardize every column.
numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("my_transformer", MyTransformer()),
        ("scaler", StandardScaler()),
    ]
)

#### Define Preprocessing Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Route the numeric columns through the numerical pipeline and one-hot encode
# the categorical columns; the two outputs are concatenated in this order.
preprocessing_pipeline = ColumnTransformer([
    ("num_pipeline", numerical_pipeline, numerical_variables),
    # handle_unknown="ignore": a category that was not present when the
    # encoder was fitted is encoded as all zeros at transform time instead of
    # raising an error (a rare category may occur only in the test split).
    ("encoder", OneHotEncoder(handle_unknown="ignore"), categorical_variables),
])

In [None]:
# Fit the preprocessing on the training features and transform them in one
# step, then display the result.
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)
X_train_transformed

In [None]:
# Shape of the transformed training matrix.
print(X_train_transformed.shape)

In [None]:
# Count of NaN entries left after preprocessing.
np.isnan(X_train_transformed).sum()

## Define Final Pipeline

In [None]:
from sklearn.linear_model import LinearRegression

# End-to-end model: preprocessing followed by ordinary least-squares
# linear regression.
regressor_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing_pipeline),
        ("regressor", LinearRegression()),
    ]
)

In [None]:
# Fit preprocessing and regressor together on the training split.
regressor_pipeline.fit(X_train, y_train)

In [None]:
# Score on the training split (the regressor's default score is R^2).
regressor_pipeline.score(X_train, y_train)

In [None]:
# Score on the held-out test split.
regressor_pipeline.score(X_test, y_test)

In [None]:
# Predictions for the training split.
predictions_train = regressor_pipeline.predict(X_train)
predictions_train

In [None]:
# Predictions for the test split.
predictions_test = regressor_pipeline.predict(X_test)
predictions_test

In [None]:
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Root mean squared error on the training split.
np.sqrt(mse(y_train, predictions_train))

In [None]:
# Root mean squared error on the test split.
np.sqrt(mse(y_test, predictions_test))

In [None]:
# Coefficients learned by the linear regression step.
regressor_pipeline.named_steps["regressor"].coef_

In [None]:
from sklearn.linear_model import LinearRegression
# KNeighborsRegressor lives in sklearn.neighbors; without this import the
# cell fails with a NameError.
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regression on top of the same preprocessing used by
# the linear model.
# NOTE: `preprocessing_pipeline` is the same object used in
# `regressor_pipeline`, so fitting here refits that shared preprocessing
# (on the same training data).
Knn = Pipeline([
    ("preprocessing", preprocessing_pipeline),
    # The number of neighbours is set with `n_neighbors`; `n` is not an
    # accepted keyword.
    ("regressor", KNeighborsRegressor(n_neighbors=5)),
])

Knn.fit(X_train, y_train)