In [6]:
import pandas as pd
import numpy as np

# Import the random forest package
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn import cross_validation
from sklearn.cross_validation import KFold
import sklearn.metrics as metric
from scipy.stats import chisquare




In [7]:
def RF(X_train,Y_train,N):
    '''
    Randon Forest via regressor function
    Input:
    x_train = predictor data
    y_train = attribute data (customer value)
    N = number of estimators
    Output:
    alg = the trained RF model (to be applied to the test set)
    Y_hat = predictions based on the train data
    '''
    #Train the algorithm on the train data
    alg = RandomForestRegressor(n_estimators=N, min_samples_split=2, min_samples_leaf=1)
    alg.fit(X_train, Y_train)
    #Use trained algorithm to create prediction of the trian
    Y_hat = alg.predict(X_train)
    
    #Returns the training parameters.
    params = alg.get_params(True)
    print params
    return alg,Y_hat

In [None]:

def run_RF_alg(df,df_test,N):
    '''
    Wrapper for the RF. 
    Input: 
    - df (train)
    - df_test (Test)
    - N (number of trees)
    Output:
    - Y_train (train target values)
    - X_train (train features)
    - X_test (test features)
    - alg (trained values)
    - Y_hat (test target predictions)
    - score (R^2 value)
    - RMSE (RMSE score)
    - chi_score (Chi^2 score)
    - pval (Chi^2 P value)
    '''
    
    #Generate attribute and predictors feature lists
    columns = df.columns
    attribute = columns[-1]
    predictors = columns[:-1]
    
    print predictors

    #Select data by features
    Y_train = df[attribute]
    X_train = df[predictors]
    X_test = df_test[predictors]
    
    #Run Ramdom Forest Model
    alg,Y_hat = RF(X_train,Y_train,N)

    #Score Train prediction
    score = metric.r2_score(Y_train.values,Y_hat)
    RMSE = metric.mean_squared_error(Y_train.values,Y_hat)
    chi_score, pval = chisquare(Y_train.values, Y_hat)
    
    return Y_train,X_train,X_test,alg,Y_hat,score,RMSE,chi_score,pval

In [65]:
def read_data(file_ext):
    '''
    Read in data from a given set based on a string
    input:
    - file_ext (common file extension string)
    Output:
    - df (processed training data)
    - df_test (processed testing data)
    - Id (Id numbers to be used later)
    '''
    
    #Read the pre-prepa
    df = pd.read_csv('data/processed/train_{}.csv'.format(file_ext))
    df = df.drop('Unnamed: 0', 1)
    df = df.fillna(value=0)
    
    #Read the test data set (extract and save the ID column for later)
    df_test = pd.read_csv('data/processed/test_{}.csv'.format(file_ext))
    df_test_ID.rename(columns={'Unnamed: 0':'Id'},inplace=True)
    Id = df_test_ID['Id']
    df_test = df_test.drop('Unnamed: 0', 1)
    df_test = df_test.fillna(value=0)

    return df,df_test,Id

In [66]:
def submission_prep(Id,df_test,sub_title):
    '''
    Prepare the submission file.
    Input: 
    - Id (data frame)
    - df_test (updated test data with target predictions)
    - sub_title (submission file string of the format 'houseprice_{}')
    Output:
    - df_submission (df with submission)
    '''
    #read in test data to 
    SP = df_test['SalePrice']

    df_submission = pd.DataFrame(data=SP)
    df_submission = df_submission.set_index(Id)
    df_submission.to_csv('submissions/houseprice_{}.csv'.format(sub_title))
    
    return df_submission

In [71]:
#Run the Random Forest

#target data file to run
file_ext = 'numeric'

#Read data
df,df_test,Id = read_data(file_ext)

#Run RF
Y_train,X_train,X_test,alg,Y_hat,score,RMSE,chi_score,pval = run_RF_alg(df,df_test,1000)

#Print scores
print 'R-squ:',round(score,3)
print 'Chi-squ:',round(chi_score,-2)
print 'RMSE:',round(np.log(RMSE),0)

#Run predictions
Y_hat_test = alg.predict(X_test)

#Add the prediction to the test data
df_test['SalePrice'] = pd.Series(Y_hat_test, index=df_test.index)

#Prepare submission file
df_submission = submission_prep(Id,df_test,'RF_basic_submission_4')

df_submission.head()

Index([u'LotFrontage', u'LotArea', u'OverallQual', u'OverallCond',
       u'YearBuilt', u'YearRemodAdd', u'MasVnrArea', u'BsmtFinSF1',
       u'BsmtFinSF2', u'BsmtUnfSF', u'TotalBsmtSF', u'1stFlrSF', u'2ndFlrSF',
       u'LowQualFinSF', u'GrLivArea', u'BsmtFullBath', u'BsmtHalfBath',
       u'FullBath', u'HalfBath', u'BedroomAbvGr', u'KitchenAbvGr',
       u'TotRmsAbvGrd', u'Fireplaces', u'GarageYrBlt', u'GarageCars',
       u'GarageArea', u'WoodDeckSF', u'OpenPorchSF', u'EnclosedPorch',
       u'3SsnPorch', u'ScreenPorch', u'PoolArea', u'MiscVal', u'MoSold',
       u'YrSold'],
      dtype='object')
{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 1, 'n_estimators': 1000, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'mse', 'random_state': None, 'max_features': 'auto', 'max_depth': None}
R-squ: 0.981
Chi-squ: 743600.0
RMSE: 19.0


Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
0,127550.032
1,155330.255
2,182137.897
3,184119.316
4,199027.312


In [None]:
#Run the Random Forest

#target data file to run
file_ext = 'hi_corr'

#Read data
df,df_test,Id = read_data(file_ext)

#Run RF
Y_train,X_train,X_test,alg,Y_hat,score,RMSE,chi_score,pval = run_RF_alg(df,df_test,100000)

#Print scores
print 'R-squ:',round(score,3)
print 'Chi-squ:',round(chi_score,-2)
print 'RMSE:',round(np.log(RMSE),0)

#Run predictions
Y_hat_test = alg.predict(X_test)

#Add the prediction to the test data
df_test['SalePrice'] = pd.Series(Y_hat_test, index=df_test.index)

#Prepare submission file
df_submission = submission_prep(Id,df_test,'RF_Reduced_submission_lrg')

df_submission.head()