**Question for Royce**: How do I create a Python package (e.g., `ames`) that others can import and use in Jupyter notebooks?

# Import libraries 

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

## Set up data path

In [5]:
# Data directory, located two levels above the notebook's working directory.
dataPath = Path('../..', 'data')

## Import data into dataframes

In [6]:
# Load both Ames CSV files from the data directory.
prices = pd.read_csv(dataPath.joinpath('Ames_Housing_Price_Data.csv'))
real_estate = pd.read_csv(
    dataPath.joinpath('Ames_Real_Estate_Data.csv'),
    low_memory=False,
)

## Initial cleaning

In [7]:
# Remove the 'Unnamed: 0' column. Rebinding with errors='ignore' (instead of an
# in-place drop) keeps this cell safe to re-run: a second execution is a no-op
# rather than a KeyError.
prices = prices.drop(columns='Unnamed: 0', errors='ignore')

In [8]:
# Keep only the first occurrence of each fully duplicated row.
prices = prices.drop_duplicates()

## Exploratory Data Analysis

In [50]:
# List the columns whose name ends in 'Area'.
ends_with_area = prices.columns.str.endswith('Area')
prices.columns[ends_with_area]

Index(['GrLivArea', 'LotArea', 'MasVnrArea', 'GarageArea', 'PoolArea'], dtype='object')

In [54]:
# List the columns whose name ends in 'AbvGr'.
ends_with_abvgr = prices.columns.str.endswith('AbvGr')
prices.columns[ends_with_abvgr]

Index(['BedroomAbvGr', 'KitchenAbvGr'], dtype='object')

In [70]:
# Collect the columns whose name starts with 'Overall'.
starts_with_overall = prices.columns.str.startswith('Overall')
cols = prices.columns[starts_with_overall]

In [71]:
# Summary statistics for the columns selected in `cols`.
prices.loc[:, cols].describe()

Unnamed: 0,OverallQual,OverallCond
count,2579.0,2579.0
mean,6.04653,5.618457
std,1.3677,1.1222
min,1.0,1.0
25%,5.0,5.0
50%,6.0,5.0
75%,7.0,6.0
max,10.0,9.0


In [67]:
# `bath_cols` is not defined in any visible cell above, so this cell only worked
# from leftover kernel state. Define it here, using the four columns that appear
# in this cell's recorded output.
bath_cols = ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath']
prices[bath_cols].describe()

Unnamed: 0,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath
count,2577.0,2577.0,2579.0,2579.0
mean,0.43539,0.062088,1.550989,0.378054
std,0.518857,0.244557,0.545823,0.499184
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0
50%,0.0,0.0,2.0,0.0
75%,1.0,0.0,2.0,1.0
max,3.0,2.0,4.0,2.0


In [79]:
# Frequency of each PavedDrive value (counted on a one-column frame).
prices.loc[:, ['PavedDrive']].value_counts()

PavedDrive
Y             2335
N              184
P               60
dtype: int64

# Feature Creation

Background reading on price per square foot: https://www.realtor.com/advice/buy/average-price-per-square-foot-for-a-home/

In [19]:
# Replace missing basement areas with 0 so the sum below is never NaN.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection is chained assignment, which pandas warns about and
# which leaves `prices` unchanged once copy-on-write is enabled.
prices['TotalBsmtSF'] = prices['TotalBsmtSF'].fillna(0)
prices['BsmtUnfSF'] = prices['BsmtUnfSF'].fillna(0)

# Above-grade living area plus total basement area minus unfinished basement area.
prices['TotalLivingArea'] = prices['GrLivArea'] + prices['TotalBsmtSF'] - prices['BsmtUnfSF']

In [20]:
# TotalLivingArea is already computed by the preceding cell, so the identical
# recomputation is not repeated here.
# Lot area minus first-floor area.
prices['UnusedLotSize'] = prices['LotArea'] - prices['1stFlrSF']

In [21]:
# Linear regression estimator; this single `ols` instance is refit by each of the
# modelling cells below.
# NOTE(review): consider moving this import up to the import cell at the top of
# the notebook so all dependencies are visible in one place.
from sklearn.linear_model import LinearRegression
ols = LinearRegression()

In [22]:
# Baseline design matrix (two engineered size features) and the target.
baseline_features = ['TotalLivingArea', 'UnusedLotSize']
X = prices.loc[:, baseline_features]
y = prices.loc[:, 'SalePrice']

In [23]:
# Fit on the full data and report the score on that same data.
# fit() returns the estimator itself, so the two calls can be chained.
ols.fit(X, y).score(X, y)

0.6022128029692129

In [33]:
# Work on a copy so the columns added and filled here do not alter `prices`.
housing = prices.copy()

# Boolean flag: True where the recorded pool area is positive.
housing['HasPool'] = housing['PoolArea'] > 0

# Replace missing garage areas with 0. The result is assigned back explicitly:
# fillna(inplace=True) on a column selection is chained assignment and does not
# update `housing` under copy-on-write, which would leave NaNs in the feature
# matrix used by the later fit.
housing['GarageArea'] = housing['GarageArea'].fillna(0)

In [34]:
# How many rows have / do not have a pool.
housing.loc[:, 'HasPool'].value_counts()

False    2570
True        9
Name: HasPool, dtype: int64

In [35]:
# Refit with three additional predictors; `y` is the target defined earlier.
extended_features = [
    'TotalLivingArea',
    'UnusedLotSize',
    'HasPool',
    'OverallQual',
    'GarageArea',
]
X = housing.loc[:, extended_features]
ols.fit(X, y).score(X, y)

0.8179437437358524

In [46]:
# One indicator column per Neighborhood value, each prefixed with 'Nbhd'.
neighborhood = housing.loc[:, 'Neighborhood']
nbhd_df = pd.get_dummies(neighborhood, prefix='Nbhd')

In [47]:
# Append the neighborhood indicator columns to the existing feature matrix.
X2 = pd.concat((X, nbhd_df), axis='columns')

In [48]:
# Preview the first five rows of the combined feature matrix.
X2.head(n=5)

Unnamed: 0,TotalLivingArea,UnusedLotSize,HasPool,OverallQual,GarageArea,Nbhd_Blmngtn,Nbhd_Blueste,Nbhd_BrDale,Nbhd_BrkSide,Nbhd_ClearCr,...,Nbhd_NoRidge,Nbhd_NridgHt,Nbhd_OldTown,Nbhd_SWISU,Nbhd_Sawyer,Nbhd_SawyerW,Nbhd_Somerst,Nbhd_StoneBr,Nbhd_Timber,Nbhd_Veenker
0,1094.0,7034,False,6,399.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1994.0,3186,False,5,266.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1738.0,5059,False,5,216.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1039.0,7429,False,4,281.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,2308.0,7590,False,8,528.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Drop one neighborhood indicator so the remaining dummies are measured against
# it as the baseline level. Rebinding with errors='ignore' (instead of an
# in-place drop) keeps this cell safe to re-run: a second execution is a no-op
# rather than a KeyError.
X2 = X2.drop(columns='Nbhd_Blmngtn', errors='ignore')

# Refit on the widened feature matrix and report the score on the same data.
ols.fit(X2, y)
ols.score(X2, y)

0.8560762542745832

In [14]:
import sys

# Make the parent directory importable. The membership check stops repeated
# runs of this cell from appending duplicate ".." entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")  # path to the directory

# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'ames'" - presumably `config` or
# `model_prep` imports `ames`, which is neither installed nor on sys.path.
# Confirm where that package lives before relying on this cell.
import config
import model_prep

ModuleNotFoundError: No module named 'ames'