In [40]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Print the path of every file found under the current working directory
# (recursively), one path per line.
for root, _subdirs, names in os.walk("./"):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./homes_third1.csv
./homes_third2.csv
./.gitignore
./homes.csv
./Project_2.ipynb
./homes_third3.csv
./data_scraping.py
./.ipynb_checkpoints/homes5-checkpoint.csv
./.ipynb_checkpoints/homes2-checkpoint.csv
./.ipynb_checkpoints/homes-checkpoint.csv
./.ipynb_checkpoints/Project_2-checkpoint.ipynb
./.ipynb_checkpoints/homes1-checkpoint.csv
./.ipynb_checkpoints/homes_third2-checkpoint.csv
./.ipynb_checkpoints/homes4-checkpoint.csv
./.ipynb_checkpoints/data_scraping-checkpoint.py
./.ipynb_checkpoints/homes_third3-checkpoint.csv
./.ipynb_checkpoints/homes6-checkpoint.csv
./.ipynb_checkpoints/homes_third1-checkpoint.csv
./.ipynb_checkpoints/untitled-checkpoint.py


# 1. Frame the problem
Using the customer description, define the problem you're trying to solve in your own words (remember, this is not technical but must be specific so the customer understands the project).

We wish to use past housing data to predict the price of a home based on its attributes, limiting the analysis to a particular city. The city we are using for this is Phoenix, Arizona. We will use attributes from houses listed or sold up to at most two years ago, such as bedrooms, bathrooms, and square footage, as well as their prices, to construct a model that takes in those attributes and returns a predicted price.

# 2. Get the Data 
Define how you received the data (provided, gathered, ...)

In [None]:
# Reference only: the triple-quoted string below is a plain string literal, so none
# of the code inside it is executed. It records the one-off snippet that split
# homes.csv into three interleaved files (every third row each).
# NOTE(review): to_csv is called without index=False, so pandas also writes the row
# index as an extra first column — account for that when re-reading the parts.
'''
The following code was used to divide the data into 3 separate files so that it could be pushed to GitHub:

df = pd.read_csv('homes.csv')
zero_rows = df.iloc[::3]
one_rows = df.iloc[1::3]
two_rows = df.iloc[2::3]
zero_rows.to_csv('homes_third1.csv')
one_rows.to_csv('homes_third2.csv')
two_rows.to_csv('homes_third3.csv')
    # For now, 'homes.csv' is too large to upload to git, so it is split into
    # homes_third1.csv, homes_third2.csv, and homes_third3.csv so it can be pushed
'''

We used the HomeHarvest package to scrape housing data from realtor.com. Our parameters searched for houses sold and listed in the past 2 years, looping over every zip code in Phoenix; the code used to do this is located in data_scraping.py. Before any preprocessing, this means that some entries also include houses not in Phoenix that will likely need to be removed.

# 3. Explore the Data
Gain insights into the data you have from step 2, making sure to identify any bias

In [41]:
# First-look exploration: fit one single-feature linear regression of sold_price
# against each of year_built, sqft and beds (sold listings only), then print the
# slope, intercept and R^2 of each fit.

# Columns that must be numeric and non-missing for the regressions below.
EXPLORE_NUMERIC_COLUMNS = ['sold_price', 'sqft', 'year_built', 'beds']

df_explore = pd.read_csv('homes.csv')
# .copy() detaches the filtered rows from the raw frame, so the column assignment
# below operates on a real frame rather than on a slice (no SettingWithCopyWarning).
df_explore = df_explore[df_explore['status'] == 'SOLD'].copy()

# Unparseable entries become NaN; any row missing one of the columns is dropped.
df_explore[EXPLORE_NUMERIC_COLUMNS] = df_explore[EXPLORE_NUMERIC_COLUMNS].apply(
    pd.to_numeric, errors='coerce'
)
df_explore = df_explore.dropna(subset=EXPLORE_NUMERIC_COLUMNS)

Y = df_explore['sold_price']
X_year = df_explore[['year_built']]
X_sqft = df_explore[['sqft']]
X_beds = df_explore[['beds']]

# fit() returns the estimator itself, so construction and fitting can be chained.
model_year = LinearRegression().fit(X_year, Y)
model_sqft = LinearRegression().fit(X_sqft, Y)
model_beds = LinearRegression().fit(X_beds, Y)

# R^2 is scored on the same rows each model was fitted on (no hold-out split here).
print('Attribute', 'Slope', 'Intercept', 'R squared')
for label, fitted, features in (
    ('Year', model_year, X_year),
    ('Sqft', model_sqft, X_sqft),
    ('Beds', model_beds, X_beds),
):
    print(label, fitted.coef_[0], fitted.intercept_, fitted.score(features, Y))

Attribute Slope Intercept R squared
Year 3383.292404166018 -6148245.921673668 0.02510728031112608
Sqft 374.8354103231598 -119325.64802457788 0.5066223652122979
Beds 215508.2931265003 -113583.0735900132 0.17671220844156732


We first note that we expect square footage, number of bedrooms, and number of bathrooms to all have a strong positive correlation with price, and we are biased towards considering these attributes in particular. To get a first look at the data, we picked a few attributes and performed a linear regression of each attribute against sale price for sold houses. We picked two of the attributes we suspected would have a strong positive correlation, square footage and bedrooms, as well as the year built, for which we were unsure if there would be a strong correlation in either direction. The r^2 value for predicting sale price from year built was 0.025, which is fairly weak, but strong enough that we will likely consider using it for our model. The r^2 values for square footage and bedrooms are 0.507 and 0.177 respectively, both of which indicate correlations of notable significance. The slopes of both of those regressions were also positive, as predicted.

We should also note that these are all crude estimates, seeing as we only used sold houses, dropped NaNs from all attributes concurrently instead of individually, and have not performed the necessary preprocessing to restrict to houses in Phoenix.

# 4. Prepare the Data


Apply any data transformations and explain what and why


In [None]:
# Build the modelling table: sold Phoenix listings only, restricted to the target
# (sold_price) plus the attributes we think are both useful and usable.
# For example, property_id is probably not useful, while text might be useful but
# is hard to use.
# NOTE(review): price_per_sqft is presumably derived from a price and sqft; if it
# is based on the sold price it leaks the target — confirm before modelling.
PREPARED_COLUMNS = ['sold_price', 'style', 'beds', 'full_baths', 'half_baths', 'sqft',
                    'year_built', 'price_per_sqft', 'stories', 'hoa_fee', 'parking_garage']

df = pd.read_csv('homes.csv')

# The data includes some houses not in Phoenix; we wish to ignore those.
# To make it simpler, we also only look at houses that have been sold.
is_sold_in_phoenix = (df['city'] == 'Phoenix') & (df['status'] == 'SOLD')
# .copy() gives an independent frame, so later edits never act on a slice of the raw data.
df = df.loc[is_sold_in_phoenix, PREPARED_COLUMNS].copy()

# One-hot encode the style of home so the categorical data can be used.
df = pd.get_dummies(df, columns=['style'])

# Convert the remaining string data to numbers; unparseable entries become NaN.
df = df.apply(pd.to_numeric, errors='coerce')

# Drop rows without a usable target *after* coercion, so a sold_price that was
# present but not numeric is removed too instead of surviving as NaN.
df = df.dropna(subset=['sold_price'])

# Assume that a listing including no half-bathrooms implies that it has none.
df['half_baths'] = df['half_baths'].fillna(0)
print(df)

# 5. Model the data
Using selected ML models, experiment with your choices and describe your findings. Finish by selecting a model to continue with.


# 6. Fine Tune the Model

With the selected model, describe the steps taken to achieve the best results possible.


# 7. Present
In a customer-facing document, provide a summary of findings and detail the approach taken.


# 8. Launch the Model System
Define your production run code. This should be self-sufficient and require only your model parameters.


In [None]:
def infrence(prams):
    """Run the model on ``prams`` and return whatever ``m.run`` returns.

    NOTE(review): ``m`` is looked up as a global at call time and is not defined
    in the visible part of this notebook — it must exist before this is called.
    """
    return m.run(prams)