In [1]:
import import_ipynb
from Helpers import *
import pandas as pd
import numpy as np
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.model_selection import cross_val_score, GridSearchCV,cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

importing Jupyter notebook from Helpers.ipynb


In [2]:
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Keep the test-set 'Id' column; it is needed later to build the submission file.
test_ids = test['Id']

In [3]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a chosen set of DataFrame columns.

    Parameters
    ----------
    columns : list
        Labels of the columns to keep, in the order they should appear.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``.

        Raises
        ------
        AssertionError
            If ``X`` is not a pandas DataFrame.
        KeyError
            If any requested column is absent from ``X``; the message lists
            the missing labels.
        """
        assert isinstance(X, pd.DataFrame)

        try:
            selected = X[self.columns]
        except KeyError:
            # Re-raise with a message naming exactly which labels are absent.
            absent = list(set(self.columns) - set(X.columns))
            raise KeyError("The DataFrame does not include the columns: %s" % absent)
        return selected

class SumColumns(BaseEstimator, TransformerMixin):
    """Pipeline step that adds a column holding the row-wise sum of others.

    Parameters
    ----------
    columns : list
        Labels of the columns to add together.
    name : str
        Label of the column that receives the sum.
    remove : bool, default True
        When true, the summed source columns are dropped from the output.
    """

    def __init__(self, columns, name, remove=True):
        # Every constructor argument is stored under its own name so that
        # BaseEstimator.get_params() / sklearn.base.clone() can round-trip it.
        self.columns = columns
        self.name = name
        self.remove = remove

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the summed column added.

        The input frame is left unmodified.

        Raises
        ------
        AssertionError
            If ``X`` is not a pandas DataFrame.
        KeyError
            If any of ``columns`` is absent from ``X``; the message lists
            the missing labels.
        """
        assert isinstance(X, pd.DataFrame)

        missing = [col for col in self.columns if col not in X.columns]
        if missing:
            raise KeyError("The DataFrame does not include the columns: %s" % missing)

        # Work on a copy so the caller's frame is not mutated by the pipeline.
        result = X.copy()
        result[self.name] = X[self.columns].sum(axis=1)

        if self.remove:
            # Never drop the freshly written sum, even if `name` reuses one
            # of the source labels.
            to_drop = [col for col in self.columns if col != self.name]
            result = result.drop(columns=to_drop)
        return result





# class HomePreprocessor:
#     def __init__(self, drop_low_correlated=False):
#         self.drop_low_correlated = drop_low_correlated

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X, *_):

#         if (drop_low_correlated):
#             X=X.pipe(drop_non_correlated_columns, min_corr=ignoreUncorrelatedRate)
        
#         df = df.pipe(drop_columns, columns=high_nan_columns) # high number of NA values 
#         .pipe(drop_columns, columns=no_variety_columns) # no variety

#         .pipe(fill_NA, columns=columnsToFillNone, value='None') # 
#         .pipe(fill_NA, columns=columnsToFillZero, value=0) # 
#         .pipe(get_dummies_for_category_columns, columns=X.columns[X.dtypes=="object"])
#         .pipe(get_dummies_for_category_columns, columns=categoryColumns2)
#         .pipe(set_type, 'MasVnrArea','int64')
#         .pipe(year_to_age, 'YearBuilt')
#         .pipe(year_to_age, 'YearRemodAdd')
#         .pipe(year_to_age, 'YrSold')
#         .pipe(drop_columns, columns=["Id","BedroomAbvGr"])## drop Garbage
#         .pipe(log_tranform, columns=areaColumns)       
#         return X

# train = preprocess(train).pipe(drop_non_correlated_columns, min_corr=ignoreUncorrelatedRate) 
# test_ids=test['Id']
# test = preprocess(test)


In [4]:

# Seed shared by every random forest in this cell so a re-run reproduces the
# same search result and scores. 0 matches the seed the final model already
# received via `random_state=False`.
RANDOM_STATE = 0

# The model is trained on log(SalePrice); predictions must be passed through
# np.exp to get back to the original price scale.
y_train = np.log(train['SalePrice'])
#y_test = np.log(test['SalePrice'])

cs = ColumnSelector(columns=['GrLivArea', 'YearBuilt', 'OverallQual'])

pp = Pipeline(
    [('SelectColumns', cs),])
X_train = pp.fit_transform(train)
# Only transform the test split: the pipeline must be fitted on training data
# alone, otherwise any stateful step added later would learn from test data.
X_test = pp.transform(test)

# Perform Grid-Search
gsc = GridSearchCV(
    estimator=RandomForestRegressor(random_state=RANDOM_STATE),
    param_grid={
        'max_depth': range(4, 10),
        'n_estimators': (10, 50, 100, 1000, 2000),
    },
    cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

grid_result = gsc.fit(X_train, y_train)
best_params = grid_result.best_params_
print(best_params)

# Fresh, unfitted estimator configured with the winning hyper-parameters; it
# is cross-validated here and fitted on the full training set in a later cell.
rfr = RandomForestRegressor(**best_params, random_state=RANDOM_STATE, verbose=0)

scores = cross_val_score(rfr, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
print(scores)

{'max_depth': 8, 'n_estimators': 100}
[-0.03355289 -0.0266471  -0.0353897  -0.03702295 -0.04165031 -0.02817824
 -0.0292269  -0.02705969 -0.03190166 -0.03226198]


In [5]:
#sklearn.metrics.SCORERS.keys()
# Fit on the full training set, then predict for the test set
# (fit() returns the estimator itself, so the calls can be chained).
predictions = rfr.fit(X_train, y_train).predict(X_test)

In [6]:
# Undo the log transform that was applied to SalePrice when building y_train.
result = np.exp(predictions)

In [7]:
# Display the back-transformed predictions as this cell's output.
result

array([124153.06923398, 154273.09436089, 155596.32087912, ...,
       138471.38963038, 127944.93178205, 226642.11562243])

In [8]:
# Two-column submission table: the saved test ids next to the predicted prices.
dfR = test_ids.to_frame(name='Id').assign(SalePrice=result)

# Write it without the row index, keeping the header line.
dfR.to_csv('predictions.csv', index=None, header=True)
print(dfR.head())

     Id      SalePrice
0  1461  124153.069234
1  1462  154273.094361
2  1463  155596.320879
3  1464  182177.563836
4  1465  202291.613167


In [9]:
# Number of missing values in each column of the test set.
test.isna().sum()

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       227
LotArea             0
Street              0
Alley            1352
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         16
MasVnrArea         15
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         1
TotRmsAbvGrd        0
Functional          2
Fireplaces          0
FireplaceQu       730
GarageType         76
GarageYrBlt        78
GarageFinish       78
GarageCars          1
GarageArea          1
GarageQual         78
GarageCond

In [11]:
# Distinct values (NaN included) found in the test set's MiscFeature column.
test.loc[:, 'MiscFeature'].unique()

array([nan, 'Gar2', 'Shed', 'Othr'], dtype=object)