## Importing libraries

In [None]:
# import sqlalchemy
from google.cloud import bigquery
from sqlalchemy.engine import create_engine
from google.oauth2 import service_account

import numpy as np
import pandas as pd
import re
import geopy.distance

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Scratch check: positive infinity is never smaller than a finite number,
# so this branch cannot run and nothing is printed.
if float('inf') < 5:
    print('d')

## Data reading 

In [None]:
# Path of the service-account key file used for authentication.
credentials_json_path = "credentials_view.json"

# Load the service-account credentials, requesting the cloud-platform scope.
credentials = service_account.Credentials.from_service_account_file(
    credentials_json_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# BigQuery client that uses those credentials and the project id they carry.
bqclient = bigquery.Client(
    project=credentials.project_id,
    credentials=credentials,
)

In [None]:
# PROJECT_ID = credentials.project_id
# DB_NAME = "house_data"
# DATASET = "london_house_prices"

In [None]:
# Download the whole table into a pandas DataFrame.
table = bigquery.TableReference.from_string(
    "candidate-testing.house_data.london_house_prices"
)
rows = bqclient.list_rows(table)
df = rows.to_dataframe()

In [None]:
# Show the first rows of the downloaded table.
df.head()

In [None]:
# Print a summary of the columns (dtypes and non-null counts).
df.info()

In [None]:
# df['address'].value_counts()

In [None]:
# df[df.address=='66, Nansen Road, London, Greater London SW11 5NT']

## Data cleaning and preprocessing

In [None]:
# Remove rows that are identical in every column, then renumber the index.
df = df.drop_duplicates().reset_index(drop=True)
df.shape

In [None]:
# List the column names.
df.columns

In [None]:
# Drop rows whose latitude AND longitude are both exactly 0.
# NOTE(review): presumably (0, 0) marks a missing location — confirm with the data source.
df = df[~((df.latitude==0) & ((df.longitude==0)))]
df.shape

In [None]:
# Rows that agree on every column except the one(s) named in a rule are
# treated as conflicting records. With keep=False every row of such a group
# is removed; the latitude/longitude rule uses keep='first' (the pandas
# default) and therefore retains one row per group.
# The rules run in the order listed, each on the result of the previous one.
# NOTE(review): the latitude/longitude rule is the only one that keeps a
# row — confirm that this asymmetry is intended.
_near_duplicate_rules = (
    (('type',), False),
    (('bedrooms',), False),
    (('latitude', 'longitude'), 'first'),
    (('area',), False),
    (('price',), False),
    (('tenure',), False),
    (('is_newbuild',), False),
)

for _ignored_cols, _keep in _near_duplicate_rules:
    _compare_cols = [col for col in df.columns if col not in _ignored_cols]
    df = df.drop_duplicates(_compare_cols, keep=_keep)

In [None]:
# Size of the frame after the near-duplicate removal.
df.shape

In [None]:
# Pick the row at position 5 as a single example record.
df_row = df.iloc[5]

df_row

In [None]:
# torch is only used by the scratch cells below.
import torch

In [None]:
df_row

In [None]:
# Values of the example row without the 'price' entry.
df_row.drop('price').values

In [None]:
# The example row's price converted to a float and wrapped in a tensor.
torch.tensor(float(df_row['price']))

In [None]:
# Look at one randomly chosen row.
df.sample(1)

## Feature engineering

### Address column

In [None]:
# # df[df['address'].str.contains("'")]
# # df.loc[1383]['address']

# s = "Flat 14, 1, King's Arms Court, London, Greater London E1 1AQ"
# s.lower().replace("'","")

In [None]:
def _normalise_address(address):
    """Lower-case an address and remove every apostrophe from it."""
    return address.lower().replace("'", "")


df['address'] = df['address'].apply(_normalise_address)

In [None]:
# Two hand-written, lower-cased sample addresses: one where the part before
# ", london," is a single word and one where it is two words.
sample_sentence_1 = '6, Polworth, london, Greater London SW16 2EU'.lower()
sample_sentence_2 = '6, Polworth Street, London, Greater London SW16 2EU'.lower()

In [None]:
# Matches the comma-delimited address part directly before ", london,":
# one or more words separated by single whitespace characters. The previous
# pattern captured at most two words, so "kings arms court" was cut down to
# "arms court".
_ROAD_PATTERN = re.compile(r'((?:\w+\s)*\w+),\s?london,')


def extract_road(x):
    """Extract the road/street name that precedes ", london," in an address.

    The address must already be lower-cased, because the pattern matches the
    literal text "london".

    Args:
        x: Address string, e.g.
            "6, polworth street, london, greater london sw16 2eu".

    Returns:
        The full road name (e.g. "polworth street"), or None when the
        address contains no "<road>, london," part.
    """
    match = _ROAD_PATTERN.search(x)
    # search() returns the left-most match, i.e. the entry findall()[0] gave.
    return match.group(1) if match else None

# Try the extractor on the two sample addresses.
print(extract_road(sample_sentence_1))
print(extract_road(sample_sentence_2))

In [None]:
# Derive a 'road' column from every address (None where nothing matched).
df['road'] = df['address'].apply(extract_road)

In [None]:
# Number of rows per extracted road.
df['road'].value_counts()

In [None]:
# Discard rows without an extracted road and renumber the index from 0.
df = df[~df['road'].isna()].reset_index(drop=True)
df.shape

In [None]:
# uniuqe_ll = df[['latitude','longitude']].drop_duplicates().values
# uniuqe_ll

In [None]:
# k = 0
# for each in uniuqe_ll:
#     res = df[(df.latitude==each[0]) & (df.longitude==each[1])]
#     if res['road'].nunique()>1:
# #         print(res['road'].unique())
# #         print('------')
#         k+=1
        
# #         if k==40:
# #             break

In [None]:
# Check the frame after adding the 'road' column.
df.head(2)

### Date column

In [None]:
# Look at the raw 'date' column.
df.date

In [None]:
# Latest date in the data.
max(df.date)

In [None]:
# Earliest date in the data.
min(df.date)

In [None]:
# Reference timestamp built from today's date, without timezone information.
today = pd.to_datetime(np.datetime64('today')).tz_localize(None)

In [None]:
# Remove timezone information from 'date' so it can be subtracted from `today`.
df['date'] = df['date'].dt.tz_localize(None)

In [None]:
# Store the reference timestamp on every row.
df['date_now'] = today

In [None]:
# Whole days between each row's date and the reference timestamp.
# NOTE(review): the value depends on the day the notebook is run.
df['days_from_now'] = (df.date_now - df.date).dt.days

In [None]:
# Spot-check a few random rows.
df.sample(4)

### Latitude Longitude to kilometers from center

In [None]:
# Centre point and an example point, each given as [latitude, longitude].
center = [51.509865, -0.118092]
example = [51.51092, 0.06250]
# Distance between the two points in kilometres.
geopy.distance.distance(center, example).km

In [None]:
# Reference point (latitude, longitude) that distances are measured from.
_DEFAULT_CENTER = (51.509865, -0.118092)


def distance_from_center(coordinate, center=_DEFAULT_CENTER):
    """Return the distance in kilometres between a coordinate and a centre.

    Args:
        coordinate: String of the form "<latitude>,<longitude>".
        center: (latitude, longitude) pair to measure from. Defaults to the
            point that used to be hard-coded inside this function.

    Returns:
        The distance in kilometres as computed by geopy.distance.distance.

    Raises:
        ValueError: If latitude or longitude cannot be parsed as a float.
        IndexError: If the string contains no comma.
    """
    # Split once instead of once per component; any fields after the second
    # one are ignored, exactly as before.
    parts = coordinate.split(',')
    latitude = float(parts[0])
    longitude = float(parts[1])
    return geopy.distance.distance(center, (latitude, longitude)).km

In [None]:
# Build a "<latitude>,<longitude>" string per row, the input format of
# distance_from_center.
df['coordinate'] = df['latitude'].astype(str) + ',' + df['longitude'].astype(str)

In [None]:
df['coordinate']

In [None]:
# Try the function on the row labelled 0.
distance_from_center(df.coordinate[0])

In [None]:
# Distance in kilometres from the centre point for every row.
df['distance_from_center'] = df.coordinate.apply(distance_from_center)

In [None]:
# Largest distance found, as a quick plausibility check.
df.distance_from_center.max()

## Train test split and one_hot_encode

In [None]:
# Columns available after feature engineering.
df.columns

In [None]:
# Drop the raw and helper columns that were only needed to derive features.
data = df.drop(['address','date','date_now','coordinate'],axis=1)

In [None]:
# Features are everything except 'price'; 'price' is the target.
X = data.drop(['price'], axis=1)
y = data['price']

In [None]:
X.shape, y.shape

# Known mistake here: one-hot encode the training set first, then apply the same encoding to the test set

In [None]:
# Preview the feature frame before encoding.
X.head(3)

In [None]:
def one_hot_encode(df, column):
    """Replace a categorical column with one indicator column per category.

    The indicator columns are named "<column>_<category>". Without that
    prefix, two encoded columns sharing a category value (or a category
    equal to an existing column name) produced overlapping column names
    and the join raised a ValueError.

    Args:
        df: DataFrame containing ``column``; it is not modified.
        column: Name of the column to encode.

    Returns:
        A new DataFrame without ``column`` and with the indicator columns
        appended after the remaining columns.
    """
    return pd.get_dummies(df, columns=[column])

In [None]:
# One-hot encode the categorical features one after another; each call
# returns a new frame that replaces X.
for _categorical_column in ('type', 'area', 'tenure', 'road'):
    X = one_hot_encode(X, _categorical_column)

X.shape

In [None]:
# Preview the feature frame after one-hot encoding.
X.head(2)

In [None]:
def normalize(df, col):
    """Standardise a column to zero mean and unit sample standard deviation.

    Args:
        df: DataFrame holding the column.
        col: Name of the numeric column to standardise.

    Returns:
        A new Series ``(df[col] - mean) / std``. When the standard deviation
        is zero or undefined (constant column, fewer than two values), the
        centred values are returned unscaled instead of NaN.
    """
    values = df[col]
    centered = values - values.mean()
    spread = values.std()
    # Dividing by a zero/undefined spread would turn the whole feature into
    # NaN, so such a column is only centred.
    if pd.isna(spread) or spread == 0:
        return centered
    return centered / spread

In [None]:
# Standardise each numeric feature column of X.
# NOTE(review): mean and std are computed on the full data set, before the
# train/validation/test split further down, so held-out rows influence the
# scaling — consider computing them on the training split only.
for col in ['bedrooms', 'latitude', 'longitude', 'days_from_now', 'distance_from_center']:
    X[col] = normalize(X, col)

In [None]:
# Preview the standardised features.
X.head(2)

In [None]:
# batch generator
def _take_rows(data, positions):
    """Select rows by integer position from pandas objects or arrays."""
    # Plain `data[positions]` on a DataFrame selects columns and on a Series
    # looks up index labels, so pandas objects must go through .iloc.
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return data[positions]


def get_batches(X, Y, batch_size):
    """Yield shuffled mini-batches that cover every sample exactly once.

    Args:
        X: Features with samples along the first axis; a NumPy array (or any
            object indexable by an integer array) or a pandas
            DataFrame/Series.
        Y: Targets aligned row by row with X; same kinds of objects as X.
        batch_size: Positive maximum number of samples per batch. The last
            batch is smaller when the sample count is not a multiple of it.

    Yields:
        Tuples ``(X_batch, Y_batch)`` holding the same rows of X and Y.

    Raises:
        ValueError: When iteration starts, if batch_size is not positive or
            X and Y contain a different number of samples.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    n_samples = X.shape[0]
    if len(Y) != n_samples:
        raise ValueError(
            f"X and Y must have the same number of samples, "
            f"got {n_samples} and {len(Y)}"
        )

    # Shuffle at the start of epoch
    indices = np.arange(n_samples)
    np.random.shuffle(indices)

    for start in range(0, n_samples, batch_size):
        # Slicing clamps at the end of the array, so the last batch is
        # simply shorter.
        batch_idx = indices[start:start + batch_size]
        yield _take_rows(X, batch_idx), _take_rows(Y, batch_idx)

In [None]:
# Save the prepared features to disk (without the index column).
X.to_csv('X.csv', encoding='utf-8', index=False)

In [None]:
# Save the target to disk (without the index column).
y.to_csv('y.csv', encoding='utf-8', index=False)

In [None]:
# Hold out 30% of the rows, then split that hold-out in half:
# 70% train, 15% validation, 15% test. Fixed seeds keep the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [None]:
# Shapes of the three splits.
(X_train.shape,y_train.shape), (X_val.shape,y_val.shape), (X_test.shape,y_test.shape)

In [None]:
X_train.head(3)

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=42)

In [None]:
# Size of the training matrix that is passed to the model.
X_train.shape

In [None]:
# Fit the model on the training split.
clf.fit(X_train, y_train)

In [None]:
# joblib is used to write the fitted model to disk.
import joblib

In [None]:
# Persist the fitted model to a file.
filename = 'finalized_model.sav'
joblib.dump(clf, filename)