In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

%matplotlib inline

In [7]:
# Ames housing data set (tab-separated). Column descriptions:
# https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt
data = pd.read_csv('AmesHousing.tsv', sep='\t')
data.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [9]:
def transform_features(data, cutoff = 0.1):
    """Return a cleaned copy of ``data`` with sparse columns removed,
    numeric gaps filled and text columns one-hot encoded.

    Steps:

    1. Keep only columns whose count of missing values is strictly below
       ``cutoff * len(data)``.
    2. Fill missing values in the remaining numeric columns with the
       column mean.
    3. Replace every remaining text (``object`` dtype) column with dummy
       columns named ``<column>_<value>``. A row whose text value is
       missing gets zeros in all dummy columns of that feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    cutoff : float, default 0.1
        Maximum tolerated fraction of missing values per column
        (exclusive upper bound).

    Returns
    -------
    pandas.DataFrame
        Retained non-text columns in their original order, followed by the
        dummy columns of each encoded text column.
    """
    # Drop columns with too many missing values. The explicit copy keeps the
    # later assignments from writing into a slice of the caller's frame.
    missing_values = data.isnull().sum()
    keep_cols = missing_values[missing_values < cutoff * len(data)].index
    data = data.loc[:, keep_cols].copy()

    numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    text_cols = data.select_dtypes(include=['object']).columns.tolist()

    # Every retained numeric column is mean-imputed. The guard avoids
    # assigning through an empty column selection.
    if numeric_cols:
        data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

    # Encode all text columns in one call. get_dummies prefixes each dummy
    # with its source column name, so two features that share a label
    # (e.g. both containing 'Gd') no longer produce duplicate column names.
    if text_cols:
        data = pd.get_dummies(data, columns=text_cols)

    return data

In [None]:
# https://machinelearningmastery.com/feature-selection-machine-learning-python/
def select_features(data):
    """Feature-selection step of the pipeline; currently a pass-through.

    TODO: no selection logic is implemented yet, so the very same object
    that was passed in is handed back.
    """
    selected = data
    return selected

In [None]:
def train_and_test(data, target = 'SalePrice', n_folds = 5):
    """Estimate the RMSE of a linear regression with k-fold cross validation.

    Every column of ``data`` other than ``target`` is used as a feature, so
    the frame is expected to be fully numeric with no missing values
    (e.g. the output of ``transform_features``).

    Parameters
    ----------
    data : pandas.DataFrame
        Features plus the target column.
    target : str, default 'SalePrice'
        Name of the column to predict.
    n_folds : int, default 5
        Number of cross-validation folds, passed to ``KFold`` as
        ``n_splits``.

    Returns
    -------
    float
        Mean of the per-fold root mean squared errors.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    y = data[target]
    X = data.drop(target, axis=1)

    # Shuffle with a fixed seed so repeated runs give the same folds.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=1)

    fold_rmses = []
    for train_idx, test_idx in kf.split(X):
        model = LinearRegression()
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        predictions = model.predict(X.iloc[test_idx])
        mse = mean_squared_error(y.iloc[test_idx], predictions)
        fold_rmses.append(np.sqrt(mse))

    rmse = float(np.mean(fold_rmses))
    return rmse

In [12]:
# Collapse the two year columns into one derived feature: the number of
# years between construction and remodel. After this cell runs `data` no
# longer has the source columns, so running it a second time raises KeyError.
data = (
    data
    .assign(years_until_remod=data['Year Remod/Add'] - data['Year Built'])
    .drop(['Year Remod/Add', 'Year Built'], axis=1)
)

In [15]:
# Pearson correlation of each numeric (and boolean) column with SalePrice,
# sorted ascending. Uses the public select_dtypes API instead of the
# private DataFrame._get_numeric_data helper.
data.select_dtypes(include=['number', 'bool']).corr()['SalePrice'].sort_values()

PID                 -0.246521
years_until_remod   -0.240168
Enclosed Porch      -0.128787
Kitchen AbvGr       -0.119814
Overall Cond        -0.101697
MS SubClass         -0.085092
Low Qual Fin SF     -0.037660
Bsmt Half Bath      -0.035835
Order               -0.031408
Yr Sold             -0.030569
Misc Val            -0.015691
BsmtFin SF 2         0.005891
3Ssn Porch           0.032225
Mo Sold              0.035259
Pool Area            0.068403
Screen Porch         0.112151
Bedroom AbvGr        0.143913
Bsmt Unf SF          0.182855
Lot Area             0.266549
2nd Flr SF           0.269373
Bsmt Full Bath       0.276050
Half Bath            0.285056
Open Porch SF        0.312951
Wood Deck SF         0.327143
Lot Frontage         0.357318
BsmtFin SF 1         0.432914
Fireplaces           0.474558
TotRms AbvGrd        0.495474
Mas Vnr Area         0.508285
Garage Yr Blt        0.526965
Full Bath            0.545604
1st Flr SF           0.621676
Total Bsmt SF        0.632280
Garage Are

* Generate a correlation heatmap matrix of the numerical features in the training data set.
    * Which features correlate strongly with our target column, SalePrice?
    * Calculate the correlation coefficients for the columns that seem to correlate well with SalePrice. Because we have a pipeline in place, it's easy to try different features and see which features result in a better cross validation score.
* Which columns in the data frame should be converted to the categorical data type? All of the columns marked as nominal from the documentation are candidates for being converted to categorical. Here are some other things you should think about:
    * If a categorical column has hundreds of unique values (or categories), should you keep it? When you dummy code this column, hundreds of columns will need to be added back to the data frame.
    * Which categorical columns have a few unique values but more than 95% of the values in the column belong to a specific category? This would be similar to a low variance numerical feature (no variability in the data for the model to capture).
* Which columns are currently numerical but need to be encoded as categorical instead (because the numbers don't have any semantic meaning)?
* What are some ways we can explore which categorical columns "correlate" well with SalePrice?
    * Read this post for some potential strategies.
* Update the logic for the select_features() function. This function should take in the new, modified train and test data frames that were returned from transform_features().

* The optional k parameter should accept integer values, with a default value of 0.
* When k equals 0, perform holdout validation (what we already implemented):
    * Select the first 1460 rows and assign to train.
    * Select the remaining rows and assign to test.
    * Train on train and test on test.
    * Compute the RMSE and return.
* When k equals 1, perform simple cross validation:
    * Shuffle the ordering of the rows in the data frame.
    * Select the first 1460 rows and assign to fold_one.
    * Select the remaining rows and assign to fold_two.
    * Train on fold_one and test on fold_two.
    * Train on fold_two and test on fold_one.
    * Compute the average RMSE and return.
* When k is greater than 1, implement k-fold cross validation using k folds:
    * Perform k-fold cross validation using k folds.
    * Calculate the average RMSE value and return this value.

Example on [Kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/kernels)