In [106]:
import pandas as pd
import numpy as np

### Helper Functions

In [107]:
def preprocess_df(df):
    for col in df.columns:
        if df[col].unique().dtype == "int64" or df[col].unique().dtype == "float64":
            df[col] = df[col].fillna(df[col].median())
            if df[col].unique().dtype == "float64":
                df[col] = df[col].astype('int64')
        else:
            df[col] = df[col].fillna("NA")
            df[col] = df[col].astype('category')
            
    # Hardcoding numeric to categorical
    df["MSSubClass"] = df["MSSubClass"].astype('category')
    df["OverallQual"] = df["OverallQual"].astype('category')
    df["OverallCond"] = df["OverallCond"].astype('category')
    return df
    
def df_to_numerical(df):
    numerical_df = pd.DataFrame()
    for col in df.columns:
        if df[col].dtype == "int64":
            numerical_df[col] = df[col]
            continue
        else:
            df[col] = df[col].astype("category")
            new_col = '{val}_Cat'.format(val=col)
            numerical_df[new_col] = df[col].cat.codes
    return numerical_df


# Random Forest

In [108]:
class RandomForest():
    def __init__(self):
        self.data = None  # training data set (loaded into memory)
        self.trees = []  # decision trees
    
    def __load_data(self, df):
        self.data = df
        
    def __subsampling(self, train_set, sample_size_ratio):
        sample_number = round(len(self.data) * sample_size_ratio)
        subsample = train_set.sample(sample_number, replace=True)
        return subsample

    def build_model(self, train_set, sample_size_ratio, number_of_trees):
        self.__load_data(train_set)
        for i in range(number_of_trees):
            sample = self.__subsampling(train_set, sample_size_ratio)
            X = sample.iloc[:,:-1]
            y = sample.iloc[:,-1]
            tree = DecisionTree(max_depth=3) # build a tree with sample data and split conditions
            tree.fit(X, y)
            self.trees.append(tree)

    def predict(self, test_set):
        rf_predictions = []
        rf_predictions_mean = []
        for tree in self.trees:
            rf_predictions.append(tree.predict(test_set))
        for i in range(len(rf_predictions)):
            rf_predictions_mean.append(np.mean(rf_predictions[i]))
        return rf_predictions[0]

# Decision Tree

In [109]:
class DecisionNode():
    def __init__(self, feature_idx=None, threshold=None, value=None, true_branch=None, false_branch=None):
        self.feature_idx = feature_idx # index of the feature that is used
        self.threshold = threshold # threshold value for feature when making the decision
        self.value = value # value if the node is a leaf in the tree
        self.true_branch = true_branch # the node we go to if decision returns True
        self.false_branch = false_branch # the node we go to if decision returns False

In [110]:
class DecisionTree():
    def __init__(self, min_mse=1e3, max_depth=10):
        self.root = None # root of this tree
        self.min_mse = min_mse # minimum information gain to allow splitting
        self.max_depth = max_depth # maximum depth the tree grows to

    def fit(self, X, y):
        self.root = self.build_tree(X, y)
        
    def build_tree(self, X, y, current_depth=0):
        decision = None
        subtrees = None
        max_mse = float('inf')
        df = pd.concat((X, y), axis=1)
        n_rows, n_features = X.shape
        if current_depth <= self.max_depth:
            for feature_idx in range(n_features):
                feature_values = X.iloc[:, feature_idx]
                threshold = feature_values.median()
                #print('feature_idx {}\nthreshold_median {}'.format(feature_idx, threshold))
                X_true, X_false = self.split_by_feature(df, feature_idx, threshold)
                if len(X_true) > 0 and len(X_false) > 0:
                    y_true = X_true.iloc[:,-1]
                    y_false = X_false.iloc[:,-1]
                    mse = self.calculate_MSE(y, y_true) + self.calculate_MSE(y, y_false)
                    if max_mse > mse:
                        max_mse = mse
                        decision = {"feature_idx":feature_idx, "threshold":threshold}
                        subtrees = {"X_true":X_true.iloc[:,:-1],
                                    "y_true":y_true,
                                    "X_false":X_false.iloc[:,:-1],
                                    "y_false":y_false}
        if max_mse < self.min_mse:
            true_branch = self.build_tree(subtrees["X_true"], subtrees["y_true"], current_depth+1)
            false_branch = self.build_tree(subtrees["X_false"], subtrees["y_false"], current_depth+1)
            return DecisionNode(feature_idx=decision["feature_idx"], threshold=decision["threshold"], true_branch=true_branch, false_branch=false_branch)

        return DecisionNode(value=np.mean(y))

    def calculate_MSE(self, y, y_pred):
        y = np.array(y)
        y_pred = np.mean(y_pred)
        n = len(y)
        return (1/n)*sum((y - y_pred)**2)
        
    def split_by_feature(self, X, feature_idx, threshold):
        # if the feature is numerical
        #assert (X.iloc[:,feature_idx].dtype == "int64") or (X.iloc[:,feature_idx].dtype == "float64"), "only taking numerical values!"
        X_true = X[X.iloc[:,feature_idx] >= threshold]
        X_false = X[X.iloc[:,feature_idx] < threshold]
        return X_true, X_false


    def predict_value(self, x, tree=None):
        # recursive method to find the leaf node that corresponds to prediction
        if tree is None:
            tree = self.root
        if tree.value is not None:
            return tree.value
        feature_value = x[tree.feature_idx]
        branch = tree.false_branch
        #if isinstance(feature_value, int) or isinstance(feature_value, float):
        if feature_value >= tree.threshold:
            branch = tree.true_branch
        # elif feature_value == tree.threshold:
        #     branch = tree.true_branch
        return self.predict_value(x, branch)
    
    def predict(self, X):
        y_pred = []
        for x in X.iterrows():
            #print("x:",x[0])
            y_pred.append(self.predict_value(x, self.root))
        return y_pred

In [111]:
df = pd.read_csv('housing_price_train.csv')
df = preprocess_df(df)
X_test = pd.read_csv('housing_price_test.csv')
X_test = preprocess_df(X_test)

X_train = df.iloc[:,:-1]
y_train = df.iloc[:,-1]

In [112]:
X_train = df_to_numerical(X_train)
#X_train.dtypes

In [113]:
# rf = RandomForest()
# rf.build_model(train_set = X_train, sample_size_ratio=0.3, number_of_trees=2)
# rf.predict(X_test)
# pd.Series(rf.predict(X_test)).to_csv('samples.csv')

In [114]:
tree = DecisionTree(max_depth=10)
tree.fit(X_train, y_train)
#tree.print_tree()
#tree.predict(X_test)

In [115]:
tree.predict(X_test)

x: 0
x: 1
x: 2
x: 3
x: 4
x: 5
x: 6
x: 7
x: 8
x: 9
x: 10
x: 11
x: 12
x: 13
x: 14
x: 15
x: 16
x: 17
x: 18
x: 19
x: 20
x: 21
x: 22
x: 23
x: 24
x: 25
x: 26
x: 27
x: 28
x: 29
x: 30
x: 31
x: 32
x: 33
x: 34
x: 35
x: 36
x: 37
x: 38
x: 39
x: 40
x: 41
x: 42
x: 43
x: 44
x: 45
x: 46
x: 47
x: 48
x: 49
x: 50
x: 51
x: 52
x: 53
x: 54
x: 55
x: 56
x: 57
x: 58
x: 59
x: 60
x: 61
x: 62
x: 63
x: 64
x: 65
x: 66
x: 67
x: 68
x: 69
x: 70
x: 71
x: 72
x: 73
x: 74
x: 75
x: 76
x: 77
x: 78
x: 79
x: 80
x: 81
x: 82
x: 83
x: 84
x: 85
x: 86
x: 87
x: 88
x: 89
x: 90
x: 91
x: 92
x: 93
x: 94
x: 95
x: 96
x: 97
x: 98
x: 99
x: 100
x: 101
x: 102
x: 103
x: 104
x: 105
x: 106
x: 107
x: 108
x: 109
x: 110
x: 111
x: 112
x: 113
x: 114
x: 115
x: 116
x: 117
x: 118
x: 119
x: 120
x: 121
x: 122
x: 123
x: 124
x: 125
x: 126
x: 127
x: 128
x: 129
x: 130
x: 131
x: 132
x: 133
x: 134
x: 135
x: 136
x: 137
x: 138
x: 139
x: 140
x: 141
x: 142
x: 143
x: 144
x: 145
x: 146
x: 147
x: 148
x: 149
x: 150
x: 151
x: 152
x: 153
x: 154
x: 155
x: 156
x: 157
x: 1

[180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589041095,
 180921.19589