In [None]:
import geopandas as gpd
import fiona

# Fiona ships with its KML driver switched off, so enable it for read/write.
# Register it on the imported `fiona` module directly: GeoPandas may import
# Fiona lazily, in which case `gpd.io.file.fiona` is None (or missing) here
# and the attribute chain through GeoPandas fails.
fiona.drvsupport.supported_drivers['KML'] = 'rw'

# Load the geometries from the KML export and preview the first rows.
# NOTE(review): if GeoPandas is using the pyogrio engine, the Fiona driver
# switch above has no effect on this call — confirm which engine is active.
geodata = gpd.read_file('chapter 11 data.kml')
geodata.head()


In [None]:
import pandas as pd

# Read the apartment attributes from the Excel workbook and preview them.
apartment_data = pd.read_excel(io='house_data.xlsx')
apartment_data.head()


In [None]:
# Prefix every apartment id with 'Apt ' so it can be joined against the KML
# 'Name' column later on. Ids that already carry the prefix are left alone,
# which keeps this cell idempotent: an unconditional prepend would produce
# 'Apt Apt 1' on a second run and the join would then match nothing.
apt_ids = apartment_data['Apt ID'].astype(str)
apartment_data['Apt ID'] = apt_ids.where(apt_ids.str.startswith('Apt '), 'Apt ' + apt_ids)
apartment_data.head()


In [None]:
# Attach the apartment attributes to the geometries by matching the KML 'Name'
# against the Excel 'Apt ID'. Inner join: rows without a partner are dropped.
merged_data = geodata.merge(
    right=apartment_data,
    how='inner',
    left_on='Name',
    right_on='Apt ID',
)
merged_data.head()


In [None]:
import contextily as cx

# Draw every geometry as a black outline with no fill on a 15x15 figure.
outline_style = {'figsize': (15, 15), 'edgecolor': 'black', 'facecolor': 'none'}
ax = merged_data.plot(**outline_style)

# Add a contextily basemap underneath, telling it which CRS the plotted data uses.
data_crs = merged_data.crs
cx.add_basemap(ax, crs=data_crs)


In [None]:
import matplotlib.pyplot as plt

# Histogram of the 'Price' column using matplotlib's default binning.
price_values = merged_data['Price']
plt.hist(price_values)


In [None]:
# With no information to segment on at all, our best guess for any row
# is simply the overall mean price.
merged_data.loc[:, 'Price'].mean()


In [None]:
# Additional information may make the estimate more fitting:
# plot 'Price' against 'MaxGuests'.
plt.scatter(
    x=merged_data['MaxGuests'],
    y=merged_data['Price'],
)


In [None]:
import numpy as np

# Correlation matrix of 'MaxGuests' and 'Price'; the off-diagonal entries
# hold the correlation between the two columns.
np.corrcoef(x=merged_data['MaxGuests'], y=merged_data['Price'])


In [None]:
import seaborn as sns

# One box of 'Price' per distinct 'IncludesBreakfast' value.
sns.boxplot(data=merged_data, x='IncludesBreakfast', y='Price')


In [None]:
# Predictors and target for the first, non-spatial model.
base_feature_columns = ['IncludesBreakfast', 'MaxGuests']
X = merged_data[base_feature_columns]
y = merged_data['Price']


In [None]:
# First version: let's just do a quick and dirty non-geo model.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construct and fit in one expression,
# then display the fitted model as the cell output.
lin_reg_1 = LinearRegression().fit(X, y)
lin_reg_1


In [None]:
# Read the fitted parameters back. coef_ follows the column order of X:
# index 0 is 'IncludesBreakfast', index 1 is 'MaxGuests'.
breakfast_effect, guest_effect = lin_reg_1.coef_

report_lines = (
    ('When no breakfast and 0 Max Guests then price is estimated at: ', lin_reg_1.intercept_),
    ('Adding breakfast adds to the price: ', breakfast_effect),
    ('Each additional Max Guests adds to the price: ', guest_effect),
)
for label, value in report_lines:
    print(label, value)


In [None]:
# Evaluate the model more honestly: hold out 33% of the rows for testing.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,  # fixed seed so the split is the same on every run
)


In [None]:
# Refit the same linear regression, this time on the training rows only,
# and display the fitted estimator.
lin_reg_2 = LinearRegression().fit(X_train, y_train)
lin_reg_2


In [None]:
# Predict prices for the held-out rows.
pred_reg_2 = lin_reg_2.predict(X=X_test)

In [None]:
from sklearn.metrics import r2_score

# R^2 of the second linear model on the held-out rows.
r2_score(y_true=y_test, y_pred=pred_reg_2)


In [None]:
# Add the geo data as plain numeric columns and see whether it improves things.
# GeoSeries exposes vectorized .x / .y accessors for point geometries, so no
# per-row Python lambda is needed; missing geometries become NaN rather than
# raising an AttributeError.
point_geometry = merged_data['geometry']
merged_data['long'] = point_geometry.x
merged_data['lat'] = point_geometry.y
merged_data.head()


In [None]:
# 'Price' plotted against the 'lat' column.
plt.scatter(
    x=merged_data['lat'],
    y=merged_data['Price'],
)

In [None]:
# 'Price' plotted against the 'long' column.
plt.scatter(
    x=merged_data['long'],
    y=merged_data['Price'],
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale 'Price' linearly into the range 16-512 and store it as 'MarkerSize'.
scaler = MinMaxScaler(feature_range=(16, 512))
scaled_prices = scaler.fit_transform(merged_data[['Price']])

# fit_transform returns an (n_rows, 1) array; take its single column.
merged_data['MarkerSize'] = scaled_prices[:, 0]


In [None]:
# Position each marker at (long, lat); size comes from 'MarkerSize',
# drawn hollow with a black edge.
plt.scatter(
    x=merged_data['long'],
    y=merged_data['lat'],
    s=merged_data['MarkerSize'],
    c='none',
    edgecolors='black',
)

In [None]:
# Extend the predictors with the coordinate columns.
geo_feature_columns = ['IncludesBreakfast', 'MaxGuests', 'lat', 'long']
X2 = merged_data[geo_feature_columns]
y = merged_data['Price']

# Train/test split with 33% held out and a fixed seed.
X2_train, X2_test, y_train, y_test = train_test_split(
    X2, y, test_size=0.33, random_state=42
)

# Fit the linear model on the training rows.
lin_reg_3 = LinearRegression().fit(X2_train, y_train)

# Score it on the held-out rows.
pred_reg_3 = lin_reg_3.predict(X2_test)
geo_linear_r2 = r2_score(y_test, pred_reg_3)
print(geo_linear_r2)


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Build an unconstrained regression tree on the training rows.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a seed, ties between equally good splits can be
# broken differently and the printed score may change from run to run.
dt_reg_4 = DecisionTreeRegressor(random_state=42)
dt_reg_4.fit(X2_train, y_train)

# Evaluate the model on the held-out rows.
pred_reg_4 = dt_reg_4.predict(X2_test)
print(r2_score(y_test, pred_reg_4))


In [None]:
# Tune the tree a little: try max_depth 1..10 and print the held-out R^2 for each.
# NOTE(review): the depth is being chosen on the test rows, so the best score
# printed here is optimistic; consider cross-validating on the training rows.

for max_depth in range(1, 11):

    # build the model (seeded so the depth comparison is repeatable across runs)
    dt_reg_5 = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
    dt_reg_5.fit(X2_train, y_train)

    # evaluate the model
    pred_reg_5 = dt_reg_5.predict(X2_test)
    print(max_depth, r2_score(y_test, pred_reg_5))


In [None]:
from sklearn import tree

# Refit the depth-3 tree, seeded so the drawn tree is the same on every run.
dt_reg_5 = DecisionTreeRegressor(max_depth=3, random_state=42)
dt_reg_5.fit(X2_train, y_train)

# Draw the fitted tree. feature_names is passed as a plain list because some
# scikit-learn releases validate this argument and reject a pandas Index.
plt.figure(figsize=(15, 15))
tree.plot_tree(dt_reg_5, feature_names=list(X2_train.columns))
plt.show()
