In [1]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt

In [2]:
# Load the sale-price listings. The path is relative to the working directory;
# the recorded output of this cell shows a FileNotFoundError, so the CSV must be
# placed next to the notebook (or the path adjusted) before running.
df_price = pd.read_csv("zillow_price.csv")
df_price.head()

FileNotFoundError: [Errno 2] File zillow_price.csv does not exist: 'zillow_price.csv'

In [None]:
# Load the rent listings (path relative to the working directory).
# In the visible cells df_rent is only inspected (head / shape), not modelled.
df_rent = pd.read_csv("zillow_rent.csv")
df_rent.head()

In [None]:
# (rows, columns) of the price data as loaded, before any cleaning.
df_price.shape

In [None]:
# (rows, columns) of the rent data as loaded.
df_rent.shape

In [None]:
# Drop the 'image' column, which is not used anywhere else in this notebook.
# errors='ignore' keeps the cell idempotent: re-running it (or loading a CSV that
# has no 'image' column) no longer raises KeyError the way `del` did.
df_price = df_price.drop(columns='image', errors='ignore')

In [None]:
# Pull the ZIP code (5 digits, optionally followed by "-1234") from the END of the
# address. Anchoring with `$` prevents a 5-digit street number earlier in the string
# from being picked up as the ZIP, and the optional group only matches a complete
# "-dddd" suffix, so a dangling hyphen or partial suffix is never captured.
# Addresses that do not end in a ZIP code get NaN.
# NOTE(review): assumes the ZIP is the last token of `address` - confirm against the data.
# expand=False returns a Series (one capture group) rather than a one-column DataFrame.
df_price['zip'] = df_price['address'].str.extract(
    r'(?<!\d)(\d{5}(?:-\d{4})?)\s*$', expand=False
)
df_price.head()

In [None]:
# Collect every row that has at least one missing value, for manual inspection.
nan_df_price = df_price.loc[df_price.isna().any(axis='columns')]

nan_df_price.head(15)

In [None]:
# Strip currency / thousands formatting so the columns can be parsed as numbers in
# the next step.
# regex=False forces literal matching. Without it the result depends on the pandas
# version (the default of `regex` is not the same across releases), and as a regular
# expression '$' is the end-of-string anchor rather than a dollar sign.
df_price['price'] = df_price['price'].str.replace('$', '', regex=False)

# Remove thousands separators from every column that may contain them.
for formatted_col in ('price', 'bds', 'sqft'):
    df_price[formatted_col] = df_price[formatted_col].str.replace(',', '', regex=False)

df_price.head()

### Convert columns to numeric

In [None]:
# Convert the cleaned-up text columns to numbers.
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
for numeric_col in ('price', 'bds', 'sqft', 'ba'):
    df_price[numeric_col] = pd.to_numeric(df_price[numeric_col], errors='coerce')

### Create price per square foot (`ppsqft`)

In [None]:
# Price per square foot: price divided element-wise by sqft.
df_price['ppsqft'] = df_price['price'].div(df_price['sqft'])
df_price.head()

In [None]:
# Number of listings per ZIP code, most frequent first.
zip_stats = (
    df_price
    .groupby('zip')['zip']
    .count()
    .sort_values(ascending=False)
)
zip_stats.head(20)

### Extract the neighborhood (`area`) from the address

In [None]:
# The area is the second comma-separated field of the address.
# The vectorized .str accessor yields NaN for a missing address or an address with
# no comma, where the previous per-row lambda raised AttributeError / IndexError.
# The leading space of the field is kept on purpose: later cells look the area up
# with the space included (' New York', ' Brooklyn').
df_price['area'] = df_price['address'].str.split(',').str.get(1)
df_price.head()

In [None]:
# Number of listings per area, most frequent first.
# NOTE: this reuses the name `zip_stats`, replacing the per-ZIP counts computed earlier.
zip_stats = (
    df_price
    .groupby('area')['area']
    .count()
    .sort_values(ascending=False)
)
zip_stats.head(20)

### Outliers and errors

In [None]:
# Keep only rows whose sqft lies between the 5th and 95th percentile of sqft.
# The result is assigned back to df_price, so re-running this cell trims the
# already-trimmed frame again.
df_price = df_price.loc[
    lambda frame: frame['sqft'].between(
        frame['sqft'].quantile(.05),
        frame['sqft'].quantile(.95),
    )
]

In [None]:
# (rows, columns) after the sqft percentile filter.
df_price.shape

In [None]:
# Preview the filtered frame.
df_price.head()

In [None]:
import seaborn as sns

# Scatter plot with a fitted regression line: price on the x-axis, sqft on the y-axis.
ax = sns.regplot(x="price", y="sqft", data=df_price)

### Model

In [None]:
# The model needs complete rows: discard any listing that is missing the target
# (price) or one of the numeric features (bds, ba, sqft).
df_price = df_price.dropna(subset=['bds', 'price', 'ba', 'sqft'])

In [None]:
# Distinct values of `ba` remaining after numeric conversion and NaN removal.
df_price.ba.unique()

In [None]:
# Distinct values of `bds` remaining after numeric conversion and NaN removal.
df_price.bds.unique()

In [None]:
# One-hot encode the area: one indicator column per distinct area value.
dum = pd.get_dummies(df_price['area'])
dum.head()

In [None]:
# Attach the area indicator columns to the right of the listing columns.
df_mod = pd.concat([df_price, dum], axis=1)
df_mod.head()

In [None]:
# Remove the columns that must not be fed to the model: the raw text columns
# (address, area, zip) and ppsqft, which is computed from the target (price).
# A single drop with errors='ignore' keeps the cell idempotent: re-running it no
# longer raises KeyError because the columns are already gone.
# The former `df_mod.head()` line was removed: its result was discarded, since only
# the last expression of a cell is displayed.
df_mod = df_mod.drop(columns=['address', 'ppsqft', 'area', 'zip'], errors='ignore')
df_mod.shape

In [None]:
# Feature matrix: every column of df_mod except the target.
X = df_mod.drop(columns='price')
X.head()

In [None]:
# Target vector: the listing price.
y = df_mod['price']
y.head()



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=10,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split (fit returns the
# estimator itself) and report its score on the held-out split.
lr_clf = LinearRegression().fit(X_train, y_train)
lr_clf.score(X_test, y_test)

In [None]:
# Position of the ' New York' indicator column (note the leading space) within X.
np.flatnonzero(X.columns == ' New York')[0]

In [None]:
def predict_price(area,bds,ba,sqft):
    """Predict a listing price with the fitted notebook-level model ``lr_clf``.

    Builds a single feature row laid out like the columns of the notebook-level
    training frame ``X`` and passes it to ``lr_clf.predict``.

    Parameters
    ----------
    area : str
        Name of the one-hot area column exactly as it appears in ``X.columns``
        (the sample call in this notebook passes ' Brooklyn', leading space
        included).
    bds, ba, sqft : number
        Values for the ``bds``, ``ba`` and ``sqft`` feature columns.

    Returns
    -------
    The first element of ``lr_clf.predict`` for the single row.

    Notes
    -----
    * The numeric features are placed by column name rather than at fixed
      positions 0/1/2, so the row stays correct whatever the column order of
      ``X`` is.
    * If ``area`` is not one of the columns of ``X`` no area indicator is set.
      This is what the ``loc_index >= 0`` guard was meant to allow; previously
      the lookup raised IndexError before that guard was reached.
    """
    feature_names = list(X.columns)
    numeric_values = {'bds': bds, 'ba': ba, 'sqft': sqft}

    x = np.zeros(len(feature_names))
    for feature, value in numeric_values.items():
        x[feature_names.index(feature)] = value

    # Only a genuine area indicator column may be switched on; never let an
    # `area` argument that happens to equal a numeric column name clobber it.
    if area in feature_names and area not in numeric_values:
        x[feature_names.index(area)] = 1

    return lr_clf.predict([x])[0]

### Test

In [None]:
# Sample prediction: area ' Brooklyn' (leading space included, as in the column
# name), 3 bds, 2 ba, 1000 sqft.
predict_price(' Brooklyn', 3, 2, 1000)

In [None]:
import pickle
# Serialize the fitted model to the working directory.
# Only unpickle this file from a trusted source: loading a pickle can execute
# arbitrary code.
with open('New_York_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

In [None]:
import json

# Save the lower-cased feature column names, in the same order as X's columns,
# to columns.json in the working directory.
columns = {'data_columns': [name.lower() for name in X.columns]}
with open("columns.json", "w") as f:
    json.dump(columns, f)