In [1]:
import pandas as pd
import numpy as np
from prepare import prepare_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

## Preprocessing
- split into test/train (and validate if you desire)

- scale numeric data

- feature engineering: create a new feature out of multiple variables, select/reduce variables for features (manually and/or through feature selection methods), bin variables for new features, etc. You will not necessarily use all of these methods.

- (Bonus) preprocessing.py: functions to do all of the above, and run the functions in the notebook.

In [2]:
# Load the prepared dataset through the project-local `prepare` module.
df = prepare_data()
# Bare last expression: renders the frame as this cell's output.
df

Unnamed: 0,bathroomcnt,bedroomcnt,house_size,latitude,longitude,lotsize,regionidcity,regionidzip,actual_value,logerror,age,tax_rate,estimate,LA,Orange,Ventura
52440,2.0,3.0,1762.0,33937685.0,-117996709.0,6347.0,14634,96171,522000.0,0.007204,62.0,1.210182,5.307310e+05,1,0,0
52439,1.0,3.0,1032.0,34040895.0,-118038169.0,5074.0,36502,96480,49546.0,0.037129,63.0,1.768922,5.396813e+04,1,0,0
52438,2.0,4.0,1612.0,34300140.0,-118706327.0,12105.0,27110,97116,67205.0,0.013209,53.0,1.647913,6.928037e+04,0,0,1
52311,4.0,4.0,2440.0,34009367.0,-118430958.0,5553.0,12447,96047,1550000.0,-0.056152,80.0,1.203548,1.362007e+06,1,0,0
52320,1.0,3.0,1468.0,33794936.0,-118318068.0,8921.0,53027,96217,523019.0,0.043225,68.0,1.242527,5.777532e+05,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,1.0,2.0,1465.0,34449266.0,-119281531.0,12647.0,13091,97099,464000.0,0.055619,50.0,1.222517,5.273963e+05,0,0,1
5,1.0,2.0,738.0,34149214.0,-118239357.0,4214.0,45457,96325,218552.0,0.101723,95.0,1.082616,2.762346e+05,1,0,0
3,3.0,4.0,2376.0,34245180.0,-118240722.0,13038.0,396551,96330,145143.0,-0.103410,47.0,1.224661,1.143896e+05,1,0,0
2,2.0,3.0,1243.0,33886168.0,-117823170.0,8432.0,21412,97078,564778.0,0.005383,55.0,1.148823,5.718217e+05,0,1,0


In [3]:
def split_data(df, size = .8, random_state = 123):
    """Split a DataFrame into train, validate and test subsets.

    The split happens in two stages that both use ``size``:

    1. ``df`` is split into (train + validate) and test.
    2. The (train + validate) portion is split again into train and validate.

    With the default ``size = .8`` the resulting shares of the original
    rows are roughly 64% train, 16% validate and 20% test.

    Parameters
    ----------
    df : pandas.DataFrame
        The full dataset to split.
    size : float, default .8
        ``train_size`` passed to ``train_test_split`` at *each* stage.
    random_state : int, default 123
        Seed passed to both ``train_test_split`` calls so the split is
        reproducible. The default matches the previously hard-coded seed.

    Returns
    -------
    tuple
        ``(train, validate, test)`` in that order.
    """
    # Stage 1: carve the held-out test set off the full data.
    train_validate, test = train_test_split(
        df, train_size = size, random_state = random_state
    )
    # Stage 2: split what remains into the final train and validate sets.
    train, validate = train_test_split(
        train_validate, train_size = size, random_state = random_state
    )
    return train, validate, test

In [4]:
# Split with the default size (.8 applied at each of the two stages).
train, validate, test = split_data(df)
# Row counts, shown in (train, test, validate) order.
train.shape[0],test.shape[0],validate.shape[0]

(26713, 8349, 6679)

In [5]:
def scale_data(train, validate, test, X):
    """Min-max scale the columns ``X`` of the three splits.

    The scaler is fit on ``train`` only and then applied to ``train``,
    ``validate`` and ``test``, so no information from the validate/test
    splits influences the scaling parameters.

    The inputs are not modified: scaling is applied to copies, which are
    returned. (Assigning into the frames directly would mutate the caller's
    data and, because the splits are slices of a parent frame, can raise
    pandas' SettingWithCopyWarning.)

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        The data splits; each must contain every column in ``X``.
    X : list-like of column labels
        The columns to scale.

    Returns
    -------
    tuple
        ``(scaler, train, validate, test)`` where ``scaler`` is the fitted
        ``MinMaxScaler`` and the frames are scaled copies of the inputs.
    """
    scaler = MinMaxScaler()
    # Fit on the training split only.
    scaler.fit(train[X])

    # Work on copies so the caller's frames are left untouched.
    train = train.copy()
    validate = validate.copy()
    test = test.copy()

    train[X] = scaler.transform(train[X])
    validate[X] = scaler.transform(validate[X])
    test[X] = scaler.transform(test[X])
    return scaler, train, validate, test

In [6]:
# Columns to scale: every float-dtype column of train.
# NOTE(review): the scaled output below shows this includes logerror and
# actual_value — confirm that scaling those columns is intended.
X = train.select_dtypes(include = ['float']).columns

In [7]:
# Scale the selected columns in all three splits and keep the fitted scaler.
# NOTE: train/validate/test are rebound here, so re-running this cell would
# scale data that has already been scaled.
scaler, train, validate, test = scale_data(train, validate, test, X)

In [8]:
# Preview the first rows of the scaled training split.
train.head()

Unnamed: 0,bathroomcnt,bedroomcnt,house_size,latitude,longitude,lotsize,regionidcity,regionidzip,actual_value,logerror,age,tax_rate,estimate,LA,Orange,Ventura
39903,0.363636,0.6,0.239456,0.471304,0.864144,0.422322,24384,96505,0.160012,0.431476,0.321168,0.463051,0.144894,1,0,0
25501,0.454545,0.4,0.285629,0.176322,0.961431,0.273957,12773,96996,0.250003,0.520373,0.167883,0.312492,0.248026,0,1,0
22151,0.545455,0.6,0.504497,0.667007,0.46951,0.956426,12447,96346,0.289395,0.54376,0.343066,0.487864,0.29399,1,0,0
21858,0.363636,0.6,0.23266,0.672082,0.409679,0.273957,27110,97116,0.094642,0.547977,0.321168,0.497586,0.098415,0,0,1
8498,0.363636,0.4,0.269438,0.274316,0.869943,0.307779,27491,97016,0.139095,0.47389,0.343066,0.477787,0.132195,0,1,0
