In [2]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# machine learning
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import  cross_val_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [33]:
def KagleScore(y_real, y_pred):
    '''Kaggle usa RMS entre el log del valor
    predicho y el real'''
    yr = np.log(y_real)
    yp = np.log(y_pred)
    return np.sqrt( np.sum( (yr - yp) ** 2) / yr.shape[0] )


In [23]:
#acquire data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
combine = [train_df, test_df]


train_df.info()

#Data description in percentiles
train_df.describe()
#Distribution of categorical features
train_df.describe(include=['O'])

In [6]:
contnum=list(train_df.select_dtypes(include=['int64','float64']))
catfeat=list(train_df.select_dtypes(exclude=['int64','float64']))
contnum.remove('SalePrice')

#Define columns to be used DEPRECATED
contnum=['LotArea','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','WoodDeckSF','OpenPorchSF','GarageArea']
catfeat=['MSSubClass','MSZoning','Neighborhood','HouseStyle','ExterQual','ExterCond','HeatingQC','CentralAir','SaleCondition']


#Check for Nan
train_df[contnum].isnull().sum()
train_df[catfeat].isnull().sum()


In [7]:
#Elimina de las listas de feats las que tengan mas de 5% de NaN
join = pd.concat([train_df,test_df],axis=0)


for tipo in [contnum,catfeat]:
    lista=[]
    for column in tipo:
        if (join.isnull().sum()[column]>len(join[column].index)*.05):
                lista.append(column)
    
    for item in lista:        tipo.remove(item)
        



In [8]:
#Complete Nan for continuous feats with mean
for dataset in combine:
    for feat in contnum:
        dataset[feat].fillna(value=dataset[feat].mean(),inplace=True)

In [9]:
#Complete Nan for cat features with most common ocurrence
for column in catfeat:
    for dataset in combine:
        freq_port=dataset[column].dropna().mode()[0]
        dataset[column] = dataset[column].fillna(freq_port)


#Encoding
#encode catfeat labels
#pd.concat([train_df[catfeat],test_df[catfeat]])


for feat in catfeat:
    le = preprocessing.LabelEncoder()
    le.fit(pd.concat([train_df[feat],test_df[feat]]))
    for dataset in combine:
            dataset[feat]=le.transform(dataset[feat])

        

In [10]:
#Fix para onehot encoding, encontrar manera de analizar train y test juntos

train_df['HouseStyle'].replace(to_replace='2.5Fin',value='2.5Unf',inplace=True)
test_df['MSSubClass'].replace(to_replace=150,value=40,inplace=True)

#one hot encode Cat features
train_df_dum = pd.get_dummies( train_df[catfeat] )
test_df_dum = pd.get_dummies( test_df[catfeat] )


# get the columns in train that are not in test
col_to_add = np.setdiff1d(train_df_dum.columns, test_df_dum.columns)

# add these columns to test, setting them equal to zero
for c in col_to_add:
    test_df_dum[c] = 0
    

Training

Deje solo RF, DT y SVM ya que son los que dan los score mas altos

In [20]:
X_train = pd.concat([train_df[contnum],train_df_dum],axis=1) #
Y_train = train_df['SalePrice']
X_test  = pd.concat([test_df[contnum],test_df_dum],axis=1) #
X_train.shape,X_test.shape

((1460, 249), (1459, 249))

In [34]:
# Get kaggle score:
KagleScore(Y_train, grid_search.predict(X_train))

0.059140436460601832

In [12]:
# Random Forest
rf_old=RandomForestRegressor()
rf_old.fit(X_train,Y_train)

print '-Old accuracy-'
acc_rf_old = rf_old.score(X_train, Y_train)
acc_rf_old


print '-Old CV score-'
score_old = cross_val_score(rf_old,X_train,Y_train,cv=5)
score_old.mean()

# set of parameters to test
parameters = {"criterion": ["mse", "mae"],
              "n_estimators": [5, 10, 20, 50], #Si pongo mas de 50 me cuelga la pc
              #"max_depth": [None, 50], #Aumenta muchisimo el uso la memoria
              #"min_samples_split": [1, 2, 3],
              #"min_samples_leaf": [1, 2, 3]    
             }

dt=RandomForestRegressor(n_jobs=4)
grid_search = GridSearchCV(dt, parameters, cv=5)
grid_search.fit(X_train, Y_train)
   
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

print '-new accuracy-'
acc_dt = grid_search.score(X_train, Y_train)
acc_dt 


pred_RF = grid_search.predict(X_test)
pred_RF

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

-Old accuracy-


0.96912292995869498

-Old CV score-


0.84006752725012246

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=4, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 20, 50], 'criterion': ['mse', 'mae']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

Best score: 0.855
Best parameters set:
	criterion: 'mae'
	n_estimators: 20
-new accuracy-


0.97831545736906633

In [13]:
#Tuned RF
rf=RandomForestRegressor(n_jobs=4,criterion='mse',n_estimators=50)
rf.fit(X_train, Y_train)
pred_RF = grid_search.predict(X_test)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=4, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [16]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred_svc = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 5)
acc_svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

100.0

In [21]:
#Model evaluation
models = pd.DataFrame({
    'Model': ['Support Vector Machines',  
              'Random Forest', 
              'Decision Tree'],
    'Score': [acc_svc, 
              acc_random_forest, 
              acc_decision_tree]})
print models.sort_values(by='Score', ascending=False)

NameError: name 'acc_random_forest' is not defined

In [19]:
submission = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": pred_RF
    })
submission.to_csv('submissionrf284.csv', index=False)