In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Clean and engineer data

In [None]:
# Load the raw listings table from a CSV file in the working directory.
df = pd.read_csv('Berlin.csv')

# Report (rows, columns), then preview the first rows.
print(df.shape)
df.head()

In [None]:
# Collect the index labels of listings whose raw price string is exactly
# "$0.00", then remove those rows.
is_free = df['price'] == "$0.00"
free_rentals = df.loc[is_free].index.tolist()

df = df.drop(index=free_rentals)

print(df.shape)

In [None]:
def _price_to_float(text):
    """Convert a price string such as "$1,250.00" to the float 1250.0."""
    without_dollar = text.strip('$')
    return float(without_dollar.replace(",", ''))


# Replace the textual price column with its numeric value.
df['price'] = df['price'].map(_price_to_float)

df['price'].describe()

In [None]:
def am_to_list(amenities):
    """Split a raw amenities string into a list of cleaned amenity names.

    The string is split on commas. Every piece then has all double and single
    quote characters removed, followed by any leading/trailing ``{`` and then
    any leading/trailing ``}`` characters. Whitespace is left untouched.
    """
    cleaned = []

    for piece in amenities.split(","):
        unquoted = piece.replace('"', '').replace("'", '')
        # The two strips are applied one after the other, not as a single
        # strip of both characters.
        cleaned.append(unquoted.strip("{").strip("}"))

    return cleaned

# Parse every listing's raw amenities string into a Python list.
df['am_list'] = df['amenities'].map(am_to_list)

df.head()

In [None]:
# Names to look up inside each listing's parsed amenities list.
potential_features = ['neighbourhood', 'neighbourhood_cleansed',
                      'security_deposit', 'room_type', 'accommodates',
                      'bathrooms', 'bedrooms']

# NOTE(review): each assignment below writes a boolean membership flag into
# df[<name>], replacing any existing column of that name. Later cells compare
# df['room_type'] and df['neighbourhood'] against string labels, so this
# overwrite looks unintended -- confirm before relying on those columns.
for feature in potential_features:
    df[feature] = df['am_list'].map(lambda amenity_names: feature in amenity_names)

df.head()

In [None]:
# One boolean indicator column per room type label.
room_type_flags = {
    'entire': 'Entire home/apt',
    'private': 'Private room',
    'shared': 'Shared room',
    'hotel': 'Hotel room',
}

for flag_name, label in room_type_flags.items():
    df[flag_name] = df['room_type'] == label

In [None]:
# Keep the first `cutoff` entries of the neighbourhood frequency table
# (missing values excluded).
cutoff = 10
hood_counts = df['neighbourhood'].value_counts(dropna=True)
top_hoods = hood_counts.index[:cutoff]

# Add one boolean indicator column per retained neighbourhood value.
for hood in top_hoods:
    is_this_hood = df['neighbourhood'] == hood
    df[hood] = is_this_hood

df.head()

In [None]:
# Candidate input columns for the price model.
features = ['bedrooms', 'bathrooms', 'neighbourhood_cleansed',
            'latitude', 'longitude',
            'room_type', 'cleaning_fee', 'guests_included']

# Append the indicator columns created for the most common neighbourhoods.
features.extend(top_hoods)

# Copy the selection so that the column-wise fills applied to dfX later operate
# on an independent frame rather than on a selection of df. This avoids
# pandas' SettingWithCopyWarning and any ambiguity about writing back into df.
dfX = df[features].copy()
dfy = df['price']

In [None]:
# List the columns that ended up in the feature frame.
dfX.columns

In [None]:
# Fill the gaps in every feature column with that column's own median.
for name in dfX.columns:
    column = dfX[name]
    dfX[name] = column.fillna(value=column.median())

# Count the missing values left in each column.
dfX.isnull().sum()

In [None]:
# Preview the first rows of the feature frame after filling missing values.
dfX.head()

In [None]:
# Convert the feature frame and the target series to NumPy arrays.
X, y = np.array(dfX), np.array(dfy)

In [None]:
# Show the shapes of the feature array and the target array side by side.
X.shape, y.shape

# Iterating Models

## First model architecture

In [None]:
# First architecture: a 10-unit ReLU hidden layer followed by a single-unit
# output layer with no activation specified.
model = Sequential([
    Dense(10, input_dim=X.shape[1], activation='relu'),
    Dense(1),
])

model.compile(optimizer='adam', loss='MSE', metrics=['mean_squared_error'])

model.summary()

In [None]:
# Train for 100 epochs with 20% of the samples held out for validation.
model.fit(X, y, epochs=100, verbose=1, validation_split=.2)

In [None]:
# Build a one-row batch from the first sample and show it next to its prediction.
first_row = np.array([list(X[0])])
first_row, model.predict(first_row)

In [None]:
def check_predictions(model, y=None, count=10, X=None):
    """Print predicted versus actual targets for the first ``count`` rows.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(batch)`` that returns one prediction row
        per input row (e.g. a fitted Keras model).
    y : sequence, optional
        Actual target values. When omitted, the notebook-level ``y`` is looked
        up at call time, so rebuilding ``y`` in a later cell is picked up.
    count : int, optional
        Maximum number of rows to report. Clamped to the number of rows
        available so a small data set does not raise ``IndexError``.
    X : sequence of rows, optional
        Feature rows. When omitted, the notebook-level ``X`` is looked up at
        call time.

    Returns
    -------
    None
        The comparison is printed, one line per row.
    """
    # Resolve the defaults when the function is called, not when it is defined,
    # so features and targets always come from the same generation of data.
    inputs = globals()["X"] if X is None else X
    targets = globals()["y"] if y is None else y

    n_rows = min(count, len(inputs), len(targets))
    if n_rows <= 0:
        return

    # Rebuild the rows from Python lists (as the per-row version did) and run a
    # single batched prediction instead of one predict call per row.
    batch = np.array([list(row) for row in inputs[:n_rows]])
    predictions = model.predict(batch)

    for i in range(n_rows):
        # Slice rather than index so each prediction keeps its 2-D (1, k) form.
        print(f'Predicted: {predictions[i:i + 1]}, actual: {targets[i]}')

In [None]:
# Spot-check the first model's predictions against actual prices.
check_predictions(model)

In [None]:
# First fix: fit the same `model` object again, this time for 1000 epochs in
# mini-batches of 20 samples, still holding out 20% for validation.

model.fit(X, y, epochs=1000, batch_size=20, verbose=1, validation_split=.2)

In [None]:
# Spot-check predictions again after the additional training run.
check_predictions(model)

## Second model architecture

In [None]:
# Second architecture: a narrower 5-unit ReLU hidden layer followed by a
# single-unit output layer.
model = Sequential([
    Dense(5, input_dim=X.shape[1], activation='relu'),
    Dense(1),
])

model.compile(optimizer='adam', loss='MSE', metrics=['mean_squared_error'])

model.summary()

In [None]:
# Train the second architecture for 100 epochs with a 20% validation hold-out.
model.fit(X, y, epochs=100, verbose=1, validation_split=.2)

In [None]:
# Spot-check the second model's predictions against actual prices.
check_predictions(model)

## Third model architecture 

In [None]:
# Third architecture: two ReLU hidden layers (15 then 7 units) followed by a
# single-unit output layer.
model = Sequential([
    Dense(15, input_dim=X.shape[1], activation='relu'),
    Dense(7, activation='relu'),
    Dense(1),
])

model.compile(optimizer='adam', loss='MSE', metrics=['mean_squared_error'])

model.summary()

In [None]:
# Train the third architecture for 100 epochs with a 20% validation hold-out.
model.fit(X, y, epochs=100, verbose=1, validation_split=.2)

In [None]:
# Spot-check the third model's predictions against actual prices.
check_predictions(model)

# Rejigger data to try again

In [None]:
# Look at the current feature frame again before reworking the inputs.
dfX.head()

In [None]:
# Frequency of each property type, with missing values counted as their own entry.
df['property_type'].value_counts(dropna=False)

In [None]:
# Boolean indicator columns for three property type labels.
property_type_flags = {
    'house': 'House',
    'apartment': 'Apartment',
    'condo': 'Condominium',
}

for flag_name, label in property_type_flags.items():
    df[flag_name] = df['property_type'] == label

df.head()

In [None]:
# Input columns for the second round of models (no neighbourhood indicators).
# NOTE(review): the 'house', 'apartment' and 'condo' indicator columns are
# created on df but are not listed here -- confirm whether they were meant to
# be part of this feature set.
features = ['bedrooms', 'bathrooms', 'neighbourhood_cleansed',
            'latitude', 'longitude',
            'room_type', 'cleaning_fee', 'guests_included']

# Copy the selection so the column-wise fills below modify an independent
# frame rather than a selection of df (avoids pandas' SettingWithCopyWarning).
dfX = df[features].copy()
dfy = df['price']

# Fill the gaps in every feature column with that column's own median.
for feature in dfX.columns:
    dfX[feature] = dfX[feature].fillna(value=dfX[feature].median())

# Count the missing values left in each column.
dfX.isnull().sum()

In [None]:
# Rebuild the NumPy feature and target arrays from the reworked frame.
X, y = np.array(dfX), np.array(dfy)

# Show both shapes.
X.shape, y.shape

# Iterating models again

## First model architecture redux

In [None]:
# First architecture again, built for the reworked feature array: a 10-unit
# ReLU hidden layer followed by a single-unit output layer.
model = Sequential([
    Dense(10, input_dim=X.shape[1], activation='relu'),
    Dense(1),
])

model.compile(optimizer='adam', loss='MSE', metrics=['mean_squared_error'])

model.summary()

In [None]:
# Train on the reworked features for 100 epochs with a 20% validation hold-out.
model.fit(X, y, epochs=100, verbose=1, validation_split=.2)

In [None]:
# Spot-check predictions of the model trained on the reworked features.
check_predictions(model)

## Fourth model architecture

In [None]:
# Fourth architecture: three ReLU hidden layers (15, 10 and 5 units) followed
# by a single-unit output layer.
model = Sequential([
    Dense(15, input_dim=X.shape[1], activation='relu'),
    Dense(10, activation='relu'),
    Dense(5, activation='relu'),
    Dense(1),
])

model.compile(optimizer='adam', loss='MSE', metrics=['mean_squared_error'])

# Train for 1000 epochs with a 20% validation hold-out.
model.fit(X, y, validation_split=.2, epochs=1000, verbose=1)

In [None]:
# Spot-check the fourth model's predictions against actual prices.
check_predictions(model)

In [None]:
# Predict on the full feature array in a single batch.
predictions = model.predict_on_batch(X)

# Plot actual price (x) against predicted price (y) on an explicit Axes and
# label both axes so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(y, predictions)
ax.set(xlabel='Actual price', ylabel='Predicted price',
       title='Predicted vs. actual price');

In [None]:
# Summary statistics of the target (price) series.
dfy.describe()

In [None]:
# Show the listings whose price exceeds 10000.
df[df['price'] > 10000]

In [None]:
# Look up the price of the row at integer position 10906.
# NOTE(review): .iloc is positional, and rows were dropped from df earlier
# without resetting the index, so position 10906 may not be the row labelled
# 10906 -- confirm which of the two is intended.
df.iloc[10906]['price']