In [1]:
import sys
from pathlib import Path

notebook_path = Path.cwd()
project_root_path = notebook_path.parent
sys.path.append(str(project_root_path))

In [2]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from predict_house_value.config.config import FilePathConstants
from predict_house_value.preprocess.preprocess_data import prepare_data_to_train, preprocess_data, write_processed_data
from predict_house_value.train.train_model import train_model, save_model
from predict_house_value.prediction.prediction import load_regressor_model, predict

## Prepare Data

In [3]:
raw_df = pd.read_csv(FilePathConstants.RAW_DATA_PATH / 'housing.csv', na_values=['Null'])

In [4]:
raw_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,agency
0,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,YES
1,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,YES
2,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,YES
3,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,YES
4,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY,YES
...,...,...,...,...,...,...,...,...,...,...,...
20634,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,YES
20635,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,YES
20636,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,YES
20637,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,YES


In [5]:
preprocessed_data = preprocess_data(raw_df)

In [6]:
preprocessed_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,agency,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,YES,0,0,0,1,0
1,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,YES,0,0,0,1,0
2,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,YES,0,0,0,1,0
3,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,YES,0,0,0,1,0
4,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,YES,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20634,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,YES,0,1,0,0,0
20635,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,YES,0,1,0,0,0
20636,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,YES,0,1,0,0,0
20637,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,YES,0,1,0,0,0


In [7]:
x_training_data, x_test_data, y_training_data, y_test_data = prepare_data_to_train(preprocessed_data)

In [8]:
x_training_data, x_test_data, y_training_data, y_test_data

(       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
 15627    -122.40     37.80                52.0       2094.0           568.0   
 19881    -119.24     36.33                 9.0       3289.0           621.0   
 3882     -118.52     34.22                35.0       1620.0           272.0   
 19289    -122.84     38.42                29.0       2756.0           551.0   
 872      -122.02     37.56                35.0       1716.0           312.0   
 ...          ...       ...                 ...          ...             ...   
 16488    -121.18     38.07                21.0       2333.0           377.0   
 80       -122.27     37.81                52.0        210.0            56.0   
 12261    -116.95     33.78                24.0       3409.0           804.0   
 14308    -117.13     32.71                35.0        614.0           180.0   
 5725     -118.22     34.19                31.0       4704.0           920.0   
 
        population  households  median

In [9]:
regressor = train_model(x_training_data, y_training_data)

In [10]:
save_model(regressor, FilePathConstants.MODEL_FILE_PATH / 'model.joblib')

## Predict

In [11]:
model = load_regressor_model(FilePathConstants.MODEL_FILE_PATH / 'old' / 'model.joblib')

In [12]:
y_predicted_train_data = predict(x_training_data, model)

In [13]:
y_predicted_test_data = predict(x_test_data, model)

In [14]:
train_error = mean_absolute_error(y_training_data, y_predicted_train_data)
test_error = mean_absolute_error(y_test_data, y_predicted_test_data)

In [15]:
print(train_error)
print(test_error)

25983.21837106039
25753.94549131189
