Install the packages imported below before running and testing this notebook locally.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
import modules as md
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import bronze_to_silver_cleaning as btc
import preprocessing as pp
import feature_engineering as fe
import geopandas as gpd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline



We have decided to run with Random Forest.

In [2]:
# Modelling branch used by the later cells, which compare this value against
# "Random Forest" and "Ridge Regression".
model_choice = "Random Forest"

The following cell builds the final consolidated table for ML; see the code for details.

In [3]:
# Build the consolidated Waterloo housing table that feeds the ML steps below.
path = "data/housesigmadata"
combined_df = pp.combine_dataframes(path)

# Keep only rows whose 'city' contains "Waterloo" (case-insensitive; missing
# cities are excluded). The explicit .copy() detaches the filtered frame from
# the original so the column assignment below does not write to a slice, which
# is what makes pandas emit SettingWithCopyWarning.
combined_df = combined_df[combined_df['city'].str.contains('Waterloo', case=False, na=False)].copy()
# Remove the literal " - Waterloo" marker from the address strings.
combined_df['address'] = combined_df['address'].str.replace(' - Waterloo', '', regex=False)

# Reference tables. NOTE(review): judging by the file name, the geojson maps
# addresses to neighbourhoods — confirm against pp.process_housing.
output = gpd.read_file('data/good_data/address_dictionary_neighbourhoods.geojson')
output = pd.DataFrame(output)  # plain DataFrame rather than a GeoDataFrame
df_schools = pd.read_csv('data/good_data/schools.csv')
amenities = pd.read_csv('data/good_data/amenities.csv')

# Enrichment pipeline: each step takes the previous result and returns a new frame.
result_df = pp.process_housing(df_house_sigma=combined_df, output=output)
final_filled_df = pp.predict_missing_neighbourhoods(result_df)
final_filled_df = pp.add_school_details(final_filled_df, df_schools)
final_filled_df = pp.add_amenities_details(final_filled_df, amenities)

# Drop 'address' from the listing frame before joining it with the enriched
# frame on 'listing_id'; the inner join keeps only listings present in both.
df_house_sigma = combined_df.drop(columns=['address'])
uploaded_file = pd.merge(df_house_sigma, final_filled_df, on='listing_id', how='inner')
houses = btc.clean_data(uploaded_file)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_neighborhoods_x['neighbourhood'] = predicted_neighborhoods


In [4]:
# Preview the first rows of the cleaned `houses` table.
houses.head()

Unnamed: 0,listing,listing_url,city,type,for_sale_price,listed,sold,sold_conditional,expired,event_hist,...,uffi,portion_for_rent,laundries_on_level_third,shoreline,soil_type,waterfront,water_body_name,shoreline_road_allowance,other,golf
0,"201 Brandenburg Court , Waterloo",https://housesigma.com/on/waterloo-real-estate...,Waterloo,Detached,,"$ 1,049,000","$ 1,030,000",,,"[{""event_hist"":""Sold""},{""event_hist"":""Sold""},{...",...,,,,,,,,,,
1,"757 Wood Lily Street , Waterloo",https://housesigma.com/on/waterloo-real-estate...,Waterloo,Single Family Residence,,"$ 1,249,000","$ 1,228,000",,,"[{""event_hist"":""Sold""},{""event_hist"":""Terminat...",...,,,,,,,,,,
2,"460 Drake Circle , Waterloo",https://housesigma.com/on/waterloo-real-estate...,Waterloo,Single Family Residence,,"$ 1,629,000","$ 1,457,500",,,"[{""event_hist"":""Sold""},{""event_hist"":""Terminat...",...,,,,,,,,,,
3,"605 Sundew Drive , Waterloo",https://housesigma.com/on/waterloo-real-estate...,Waterloo,Single Family Residence,,"$ 1,079,900","$ 1,110,000",,,"[{""event_hist"":""Sold""},{""event_hist"":""Terminat...",...,,,,,,,,,,
4,"621 Wild Rye Street , Waterloo",https://housesigma.com/on/waterloo-real-estate...,Waterloo,Single Family Residence,,"$ 1,050,000","$ 1,045,000",,,"[{""event_hist"":""Sold""},{""event_hist"":""Expired""...",...,,,,,,,,,,


The following cell processes the table above into an ML-ready table.

In [None]:
# For the Random Forest branch, turn these text columns into integer category
# codes. 'neighbourhood' is kept and its codes are written to a new column;
# the other three columns are overwritten with their own codes.
if model_choice == "Random Forest":
    for target, source in (
        ('neighbourhood_impact', 'neighbourhood'),
        ('roof', 'roof'),
        ('architecture_style', 'architecture_style'),
        ('frontage_type', 'frontage_type'),
    ):
        houses[target] = pd.Categorical(houses[source]).codes

# Rows without a 'sold' value are dropped before feature refinement.
houses = houses.dropna(subset=['sold'])
ml_houses = fe.feature_refining(houses)

# Columns to one-hot encode.
# Currently disabled candidates: 'property_type', 'bathrooms_detail', 'sewer', 'topography'.
columns_to_encode = [
    'driveway_parking',
    'basement_type',
    'lot_features',
    'exterior_feature',
    'waterfront_features',
    'appliances_included',
    'laundry_features',
]
split_exceptions = ['bathrooms_detail']

# The Ridge Regression branch one-hot encodes the columns that the Random
# Forest branch converts to category codes.
if model_choice == "Ridge Regression":
    columns_to_encode.extend(['neighbourhood', 'architecture_style', 'roof', 'frontage_type'])

# TODO: Appliances Excluded has to be penalizing in giving value to the prices

# Append the encoded frame of every listed column that exists in `houses`.
for column in columns_to_encode:
    if column not in houses.columns:
        continue
    encoded_df = fe.one_hot_encode_column(houses, column, split_exceptions=split_exceptions)
    ml_houses = pd.concat([ml_houses, encoded_df], axis=1)

In [None]:
# Impute 'depth' and 'frontage_length' with their column means *before* the
# blanket zero fill. fillna returns a new object rather than modifying in
# place, so the result has to be assigned back; previously both results were
# discarded (and the first call targeted `houses` instead of `ml_houses`), so
# every gap in these columns ended up as 0.
ml_houses = ml_houses.fillna({
    'depth': ml_houses['depth'].mean(),
    'frontage_length': ml_houses['frontage_length'].mean(),
})
# Any remaining missing values become 0.
ml_houses = ml_houses.fillna(0)

# This is the final dataframe that will be used for ML
# features == X and price == y

# NOTE(review): 'price' is still a column of `features` at this point — confirm
# that fe.correlation_analysis (or a later step) removes it before training,
# otherwise the target leaks into X.
features = ml_houses.drop(columns=['listing_id', 'listing'])
price = ml_houses['price']

features = fe.correlation_analysis(features)