# Feature Engineering

#### Import the required libraries

In [4]:
# NumPy for numerical computing
import numpy as np

# Pandas for DataFrames
import pandas as pd
pd.set_option('display.max_columns', 100)

# Matplotlib for visualization
from matplotlib import pyplot as plt

# display plots in the notebook
%matplotlib inline 

# Seaborn for visualization
import seaborn as sns

#### Import the cleaned dataset

In [5]:
# Load cleaned dataset from the previous lecture
df = pd.read_csv('cleaned_df.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,price,year_sold,property_tax,insurance,beds,baths,sqft,year_built,lot_size,basement,property_type
0,0,295850,2013,234,81,1,1,584,2013,0,,Condo
1,1,216500,2006,169,51,1,1,612,1965,0,1.0,Condo


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1863 entries, 0 to 1881
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     1863 non-null   int64  
 1   price          1863 non-null   int64  
 2   property_tax   1863 non-null   int64  
 3   insurance      1863 non-null   int64  
 4   beds           1863 non-null   int64  
 5   baths          1863 non-null   int64  
 6   sqft           1863 non-null   int64  
 7   lot_size       1863 non-null   int64  
 8   basement       1638 non-null   float64
 9   property_type  1863 non-null   object 
 10  popular        1863 non-null   int32  
 11  recession      1863 non-null   int32  
 12  property_age   1863 non-null   int64  
dtypes: float64(1), int32(2), int64(9), object(1)
memory usage: 189.2+ KB


## I. Domain Knowledge

#### A. Popular Properties

2 bedroom and 2 bathroom properties are especially popular for investors. Let's create an indicator variable just for properties with 2 beds and 2 baths.

In [6]:
# Build your code step by step
#((df.beds == 2) & (df.baths == 2))
(df.beds == 2) & (df.baths == 2)

# ((df.beds == 2) & (df.baths == 2)).astype(int)
((df.beds == 2) & (df.baths == 2)).astype(int)


0       0
1       0
2       0
3       0
4       0
       ..
1877    0
1878    0
1879    0
1880    0
1881    0
Length: 1882, dtype: int32

In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,property_tax,insurance,beds,baths,sqft,lot_size,basement,property_type,popular,recession,property_age
0,0,295850,234,81,1,1,584,0,,Condo,0,1,0
1,1,216500,169,51,1,1,612,0,1.0,Condo,0,0,41
2,2,279900,216,74,1,1,615,0,,Condo,0,1,49
3,3,379900,265,92,1,1,618,33541,,Condo,0,0,5
4,4,340000,88,30,1,1,634,0,,Condo,0,0,10


In [7]:
# Create indicator variable for properties with 2 beds and 2 baths
df['popular'] = ((df.beds == 2) & (df.baths == 2)).astype(int)

In [33]:
# Check how many propoerties have 2 baths and 2 beds 
df.popular.value_counts()

popular
0    1685
1     178
Name: count, dtype: int64

In [34]:
df[df['popular']==1]

Unnamed: 0.1,Unnamed: 0,price,property_tax,insurance,beds,baths,sqft,lot_size,basement,property_type,popular,recession,property_age
163,164,330000,372,123,2,2,832,1136,1.0,Condo,1,0,66
164,165,250000,375,124,2,2,864,1633,1.0,Condo,1,0,66
165,166,200000,378,131,2,2,877,0,,Condo,1,1,12
166,167,286499,195,59,2,2,886,0,1.0,Condo,1,0,24
167,168,237500,198,60,2,2,886,0,1.0,Condo,1,0,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,337,289000,308,87,2,2,1870,5601,1.0,Bunglow,1,1,10
337,338,740000,720,249,2,2,2100,0,,Condo,1,0,28
338,339,540000,477,158,2,2,2310,3062,1.0,Bunglow,1,1,56
339,340,225000,442,135,2,2,2512,2783,1.0,Condo,1,0,4


#### B. Housing Market Recession

We are modeling housing prices in the United States, it's important to consider the housing market recession around 2008. According to data from Zillow, the lowest housing prices were from 2010 to end of 2013.

<br>
Create an indicator feature **recession**

Here's how:
* Your first condition `year_sold >= 2010`
* Your second condition `year_sold <= 2013`
* Combine the two conditions with an `&` operator
* Convert the resulting data to `int` type.

In [42]:
# Create a new variable recession
df['recession']=((df.year_sold >= 2010) & (df.year_sold <= 2013)).astype(int)

AttributeError: 'DataFrame' object has no attribute 'year_sold'

In [10]:
# Check how many propoerties were sold during recession period 
df.recession.value_counts()

recession
0    1386
1     496
Name: count, dtype: int64

In [11]:
fd = df[['year_sold','recession']]
fd.head(8)

Unnamed: 0,year_sold,recession
0,2013,1
1,2006,0
2,2012,1
3,2005,0
4,2002,0
5,2004,0
6,2011,1
7,2005,0


## II. Interaction Features

In the first step, you engineered features from domain knowledge. interaction features can be products, sums, or differences between two features.

#### A. Property Age

We have the features `year_sold` and the `year_built`. let's create a new feature `property_age`

In [18]:
df['property_age'] = df['year_sold'] - df['year_built']


In [19]:
type(df.property_age)

pandas.core.series.Series

In [20]:
# Create a 'property_age' feature
# Calculate property age
df['property_age'] = df['year_sold'] - df['year_built']


Do a quick sanity check on that feature. Run `df.describe()` and check the stats for the feature `property_age`

In [21]:
# Do you see any error?
df.describe()

Unnamed: 0.1,Unnamed: 0,price,year_sold,property_tax,insurance,beds,baths,sqft,year_built,lot_size,basement,popular,recession,property_age
count,1882.0,1882.0,1882.0,1882.0,1882.0,1882.0,1882.0,1882.0,1882.0,1882.0,1657.0,1882.0,1882.0,1882.0
mean,941.445802,422908.798618,2007.107864,466.903294,140.486716,3.42136,2.580765,2330.171626,1982.981403,12751.197131,1.0,0.09458,0.263549,24.126461
std,543.519771,151473.251553,5.195851,231.653858,72.935379,1.068335,0.945125,1336.926475,20.287099,35304.268897,0.0,0.292712,0.440675,21.153271
min,0.0,200000.0,1993.0,88.0,30.0,1.0,1.0,500.0,1880.0,0.0,1.0,0.0,0.0,-8.0
25%,471.25,300000.0,2004.0,320.0,94.0,3.0,2.0,1346.0,1970.0,1542.0,1.0,0.0,0.0,6.0
50%,941.5,392000.0,2007.0,426.0,125.0,3.0,3.0,1907.5,1986.0,6074.0,1.0,0.0,0.0,20.0
75%,1411.75,525000.0,2011.0,569.0,169.0,4.0,3.0,3005.0,2000.0,11761.0,1.0,0.0,1.0,38.0
max,1882.0,800000.0,2016.0,4508.0,1374.0,5.0,6.0,8450.0,2015.0,436471.0,1.0,1.0,1.0,114.0


In [22]:
# Check number of observations with 'property_age' < 0
(df.property_age < 0).sum()

19

On second thought, this could be an error or that some homeowners buy houses before the construction company builts them. But for the purpose of this project we will remove these observations.

We'll do a quick ad-hoc data cleaning and remove these observations from our dataset.

#### Remove observations where `property_age` is less than 0.
* Keep only observations where `property_age` is 0 and above.

In [23]:
# Print df shape before
print(df.shape)

# Remove rows where property_age is less than 0
df = df[df.property_age >= 0]

# Print number of rows in remaining dataframe
print(df.shape)

(1882, 15)
(1863, 15)


### III. Drop Redundant Features

Beacuse we created a new feature `property_age` using features `year_built` and `year_sold`. We can drop those two features.

**Remove features 'year_built' and 'year_sold'** 
* Use Pandas's `.drop()` function.
* Remember to set `axis=1` because you are dropping columns.
* Remember to do 'inplace=True'.


In [24]:
# Drop 'year_built' and 'year_sold' from the dataset
df.drop(['year_built', 'year_sold'], axis=1, inplace=True)
df.head()

Unnamed: 0.1,Unnamed: 0,price,property_tax,insurance,beds,baths,sqft,lot_size,basement,property_type,popular,recession,property_age
0,0,295850,234,81,1,1,584,0,,Condo,0,1,0
1,1,216500,169,51,1,1,612,0,1.0,Condo,0,0,41
2,2,279900,216,74,1,1,615,0,,Condo,0,1,49
3,3,379900,265,92,1,1,618,33541,,Condo,0,0,5
4,4,340000,88,30,1,1,634,0,,Condo,0,0,10


### Save the final dataset

We will save this dataset and train our model on it.

In [30]:
# Save the data as 'final.csv'
df.to_csv('final.csv', index=None)