In [None]:
import os
from collections import namedtuple
from collections import OrderedDict

import pandas as pd
import numpy as np
import geopandas as gpd
from geopy.distance import geodesic
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Lift pandas' column-count limit so wide frames are displayed without elided columns.
pd.set_option('display.max_columns', None)

In [None]:
# Load the housing table and the three California reference CSVs
# (city coordinates, city populations, county populations) in one pass.
df, cal_cities, cal_pops_cities, cal_pops_counties = (
    pd.read_csv(csv_name)
    for csv_name in (
        'housing.csv',
        'cal_cities_lat_long.csv',
        'cal_populations_city.csv',
        'cal_populations_county.csv',
    )
)

In [None]:
# Target is the median house value; every other column is a candidate predictor.
y = df['median_house_value']
X = df.drop(columns=['median_house_value'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Work on an explicit copy: X_train comes out of train_test_split as a subset of X,
# and writing into it directly can trigger SettingWithCopyWarning.
X_train = X_train.copy()

# Relabel the '<1H OCEAN' category with a plain-text name.
X_train.loc[X_train['ocean_proximity'] == '<1H OCEAN', 'ocean_proximity'] = 'WITHIN HOUR TO OCEAN'

# Impute missing bedroom counts with the training median. The result is assigned back
# rather than using `Series.fillna(..., inplace=True)` on a column selection, which is a
# chained in-place operation: it warns on pandas 2.x and no longer updates the parent
# frame once Copy-on-Write is enabled.
X_train['total_bedrooms'] = X_train['total_bedrooms'].fillna(X_train['total_bedrooms'].median())

In [None]:
# Derive per-household and per-room ratios from the raw block-level counts.
X_train = X_train.assign(
    rooms_per_household=X_train['total_rooms'] / X_train['households'],
    bedrooms_per_room=X_train['total_bedrooms'] / X_train['total_rooms'],
    population_per_household=X_train['population'] / X_train['households'],
)

In [None]:
# The raw counts are now represented by the ratio features, so remove them in one call.
X_train = X_train.drop(columns=['total_rooms', 'total_bedrooms', 'households'])

In [None]:
# Read the California county boundaries shapefile (kept in its native CRS here).
file_path = os.path.join('CA_Counties', 'CA_Counties_TIGER2016.shp')
cali_shp = gpd.read_file(file_path)

# Build one point per training row from its longitude/latitude columns.
# These coordinates are geographic degrees, so the points are declared as EPSG:4326
# instead of being labelled with whatever CRS the shapefile happens to use; tagging
# them with a different CRS would silently misplace them. Passing `crs=` to the
# constructor already sets the CRS, so no separate `set_crs` call is needed.
gdf = gpd.GeoDataFrame(
    X_train,
    geometry=gpd.points_from_xy(X_train['longitude'], X_train['latitude']),
    crs="EPSG:4326",
)

In [None]:
# Reload the county boundaries and reproject them to EPSG:4326.
file_path = os.path.join('CA_Counties', 'CA_Counties_TIGER2016.shp')
cali_shp = gpd.read_file(file_path).to_crs("EPSG:4326")

# Point geometry for every training row, created in the (now EPSG:4326) county CRS
# and then passed through to_crs so the frame is explicitly in EPSG:4326.
gdf = gpd.GeoDataFrame(
    X_train,
    geometry=gpd.points_from_xy(X_train['longitude'], X_train['latitude']),
    crs=cali_shp.crs,
).to_crs("EPSG:4326")

In [None]:
# Record type for one city: name, coordinates and four census-year populations.
CityTuple = namedtuple(
    'City',
    [
        'Name',
        'Latitude',
        'Longitude',
        'pop_april_1980',
        'pop_april_1990',
        'pop_april_2000',
        'pop_april_2010',
    ],
)

# Index every city by name; populations start at 0 and are filled in by a later cell.
city_map = {
    city_row['Name']: CityTuple(city_row['Name'], city_row['Latitude'], city_row['Longitude'], 0, 0, 0, 0)
    for _, city_row in cal_cities.iterrows()
}

In [None]:
# Copy the census populations onto the cities already known to city_map;
# population rows for cities without coordinates are skipped.
population_fields = ('pop_april_1980', 'pop_april_1990', 'pop_april_2000', 'pop_april_2010')
for _, pop_row in cal_pops_cities.iterrows():
    city_name = pop_row['City']
    if city_name not in city_map:
        continue
    # namedtuples are immutable, so store the updated copy back under the same key.
    city_map[city_name] = city_map[city_name]._replace(
        **{field: pop_row[field] for field in population_fields}
    )

In [None]:
# Turn the city records into a GeoDataFrame with one point per city
# (x = Longitude, y = Latitude), then make sure it is in EPSG:4326.
tuple_list = list(city_map.values())
gdf_cities = gpd.GeoDataFrame(
    tuple_list,
    geometry=gpd.points_from_xy(
        [city.Longitude for city in tuple_list],
        [city.Latitude for city in tuple_list],
    ),
    crs=cali_shp.crs,
).to_crs("EPSG:4326")

In [None]:
# Cast each census-population column to float.
for pop_col in ('pop_april_1980', 'pop_april_1990', 'pop_april_2000', 'pop_april_2010'):
    gdf_cities[pop_col] = gdf_cities[pop_col].astype(float)

# A city counts as "large" when its April 2010 population exceeds 250,000.
gdf_cities['Large_City'] = gdf_cities['pop_april_2010'].gt(250000)
large_cities = gdf_cities[gdf_cities['Large_City']]

In [None]:
# Keep only the name and coordinates of each large city ...
large_cities_lat_lon = large_cities[
    ['Name', 'Latitude', 'Longitude']
]
# ... and export them as one OrderedDict per city for the distance loop.
large_cities_dictionaries = large_cities_lat_lon.to_dict(
    orient='records',
    into=OrderedDict,
)

In [None]:
# Add one '<city>_km_distance' column per large city holding the geodesic distance
# (in kilometres) from each training row to that city.
for large_city in large_cities_dictionaries:
    city_point = (large_city['Latitude'], large_city['Longitude'])
    # Column name: city name with spaces -> underscores, lower-cased, plus a fixed suffix.
    distance_column = large_city['Name'].replace(' ', '_').lower() + '_km_distance'
    gdf[distance_column] = [
        geodesic((row_lat, row_lon), city_point).kilometers
        for row_lat, row_lon in zip(gdf['latitude'], gdf['longitude'])
    ]

In [None]:
# Collect the per-city distance columns created by the distance loop.
distance_cols = [column for column in gdf.columns if column.endswith('_km_distance')]
distance_data = gdf[distance_cols]

# Summarise them per row: nearest/farthest distance and the column each came from,
# then discard the individual per-city columns.
gdf = gdf.assign(
    min_distance=distance_data.min(axis=1),
    max_distance=distance_data.max(axis=1),
    min_distance_km_col=distance_data.idxmin(axis=1),
    max_distance_km_col=distance_data.idxmax(axis=1),
).drop(columns=distance_cols)

In [None]:
# Recover the nearest city's name from its distance-column label: take the text before
# '_km_distance' and turn underscores back into spaces. Then drop the helper columns,
# the unused max-distance summary and the raw coordinates.
gdf = gdf.assign(
    min_distance_city_name=(
        gdf['min_distance_km_col']
        .str.partition('_km_distance')[0]
        .str.replace('_', ' ', regex=False)
    )
).drop(
    columns=[
        'min_distance_km_col',
        'max_distance_km_col',
        'max_distance',
        'latitude',
        'longitude',
    ]
)

In [None]:
# Build the join key on the city side: rename 'Name' and normalise it to
# lower case with underscores instead of spaces.
large_cities = large_cities.rename(columns={'Name': 'city_name_pops'})
large_cities['city_name_pops'] = (
    large_cities['city_name_pops']
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

In [None]:
# Put the nearest-city name into the same underscore form used by large_cities['city_name_pops'].
gdf['min_distance_city_name'] = gdf['min_distance_city_name'].str.replace(' ', '_', regex=False)

In [None]:
# Attach the nearest large city's attributes to every training row, then strip
# everything from the city side except the population columns, plus both geometry
# columns (suffixed _x/_y by the merge).
X_train = (
    gdf.merge(
        large_cities,
        how='left',
        left_on='min_distance_city_name',
        right_on='city_name_pops',
    )
    .drop(columns=['city_name_pops', 'Latitude', 'Longitude', 'geometry_y', 'Large_City'])
    .rename(columns={'geometry_x': 'geometry'})
    .drop(columns=['geometry'])
)

In [None]:
# Replace the nearest-city populations with their natural logarithm.
# NOTE(review): re-running this cell applies the log a second time; a population of 0
# would become -inf — confirm none of the large cities has a zero entry.
pop_columns = ['pop_april_1980', 'pop_april_1990', 'pop_april_2000', 'pop_april_2010']
X_train[pop_columns] = np.log(X_train[pop_columns])

In [None]:
# Numeric features are standardised (zero mean, unit variance).
num_processor = Pipeline([("std_scaler", StandardScaler())])

# Categorical features are one-hot encoded into a dense array; categories not seen
# during fit are encoded as all zeros instead of raising.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4, so try the new name first
# and fall back to the old one on releases that do not know it.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

cat_processor = Pipeline([("one_hot_encoder", one_hot_encoder)])

# Route each column to its processor by name; columns not listed here are dropped
# (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    [
        (
            "num",
            num_processor,
            [
                "housing_median_age",
                "population",
                "median_income",
                "rooms_per_household",
                "bedrooms_per_room",
                "population_per_household",
                "min_distance",
                "pop_april_1980",
                "pop_april_1990",
                "pop_april_2000",
                "pop_april_2010",
            ],
        ),
        ("cat", cat_processor, ["min_distance_city_name", "ocean_proximity"]),
    ]
)

In [None]:
# Preview the first rows of the engineered training features.
X_train.head()

In [None]:
def _engineer_holdout_features(raw_features, bedrooms_fill_value, reference_cities):
    """Apply the training-set feature engineering to a raw (untransformed) feature frame.

    Mirrors the steps the notebook performs on X_train so that the held-out rows
    carry the columns the preprocessor expects.

    Parameters
    ----------
    raw_features : DataFrame
        Raw split output containing 'ocean_proximity', 'total_rooms', 'total_bedrooms',
        'households', 'population', 'latitude' and 'longitude'.
    bedrooms_fill_value : float
        Value used for missing 'total_bedrooms' (the training median, so no
        statistic is learned from the held-out rows).
    reference_cities : DataFrame
        Large-city table with 'city_name_pops', 'Latitude', 'Longitude' and the four
        'pop_april_*' columns.

    Returns
    -------
    DataFrame
        A new frame, in the same row order as `raw_features`, with the ratio features,
        'min_distance', 'min_distance_city_name' and log populations added and the raw
        count and coordinate columns removed. The input is not modified.
    """
    features = raw_features.copy()

    # Same category relabel and imputation as the training rows.
    features.loc[features['ocean_proximity'] == '<1H OCEAN', 'ocean_proximity'] = 'WITHIN HOUR TO OCEAN'
    features['total_bedrooms'] = features['total_bedrooms'].fillna(bedrooms_fill_value)

    features['rooms_per_household'] = features['total_rooms'] / features['households']
    features['bedrooms_per_room'] = features['total_bedrooms'] / features['total_rooms']
    features['population_per_household'] = features['population'] / features['households']

    # Geodesic distance (km) from every row to every large city, one column per city,
    # labelled with the city's normalised name so idxmin yields the join key directly.
    distances = pd.DataFrame(
        {
            city_name: [
                geodesic((row_lat, row_lon), (city_lat, city_lon)).kilometers
                for row_lat, row_lon in zip(features['latitude'], features['longitude'])
            ]
            for city_name, city_lat, city_lon in zip(
                reference_cities['city_name_pops'],
                reference_cities['Latitude'],
                reference_cities['Longitude'],
            )
        },
        index=features.index,
    )
    features['min_distance'] = distances.min(axis=1)
    features['min_distance_city_name'] = distances.idxmin(axis=1)

    # Log populations of the nearest large city, looked up by the normalised city name.
    population_columns = ['pop_april_1980', 'pop_april_1990', 'pop_april_2000', 'pop_april_2010']
    log_populations = np.log(
        reference_cities[['city_name_pops', *population_columns]].set_index('city_name_pops')
    )
    features = features.join(log_populations, on='min_distance_city_name')

    return features.drop(
        columns=['total_rooms', 'total_bedrooms', 'households', 'latitude', 'longitude']
    )


# X_test is still the raw output of train_test_split: it has none of the engineered
# columns the preprocessor selects, so predicting on it directly fails. Build the
# matching feature frame first. The bedroom median is recomputed from the training rows
# of the original table (y_train still carries their original index labels).
train_bedrooms_median = df.loc[y_train.index, 'total_bedrooms'].median()
X_test_features = _engineer_holdout_features(X_test, train_bedrooms_median, large_cities)

# Fit each candidate model behind the shared preprocessor and report held-out RMSE.
# The tree-based models are seeded so repeated runs print the same numbers.
for model in [
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
]:
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test_features)
    # Take the square root explicitly: the `squared=False` shortcut of
    # mean_squared_error is deprecated and removed in newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE for {model.__class__.__name__}: {rmse}")