In [124]:
import pandas as pd
import numpy as np

### Helper Functions

In [125]:
def preprocess_df(df, categorical_cols=("MSSubClass", "OverallQual", "OverallCond")):
    """Fill missing values and normalise column dtypes, returning a new frame.

    * ``int64`` / ``float64`` columns: missing values are replaced by the
      column median; ``float64`` columns are then cast to ``int64`` (which
      truncates any fractional part).
    * every other column: missing values become the string ``"NA"`` and the
      column is converted to ``category``.
    * columns named in ``categorical_cols`` are converted to ``category`` even
      if they are numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input. It is not modified; all work happens on a copy.
    categorical_cols : iterable of str, optional
        Numeric-looking columns to force to ``category``. Names that are not
        present in ``df`` are skipped instead of raising ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    out = df.copy()  # keep the caller's frame intact so the cell can be re-run safely

    for col in out.columns:
        dtype = out[col].dtype
        if dtype == "int64" or dtype == "float64":
            filled = out[col].fillna(out[col].median())
            # Downstream encoding only treats int64 as numeric, so floats are cast.
            out[col] = filled.astype("int64") if dtype == "float64" else filled
        else:
            out[col] = out[col].fillna("NA").astype("category")

    # Numeric codes that are really labels, not quantities.
    for col in categorical_cols:
        if col in out.columns:
            out[col] = out[col].astype("category")
    return out
    
def df_to_numerical(df):
    """Return an all-numeric copy of ``df``.

    ``int64`` columns are copied under their original name. Every other
    column is converted to ``category`` and replaced by its integer category
    codes under the name ``"<col>_Cat"``. Column order and the row index are
    preserved.

    The input frame is not modified. Codes are assigned per call from the
    values present in ``df``, so two frames encoded separately are not
    guaranteed to share the same label-to-code mapping.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
    """
    encoded = {}
    for col in df.columns:
        series = df[col]
        if series.dtype == "int64":
            encoded[col] = series.to_numpy()
        else:
            encoded['{val}_Cat'.format(val=col)] = series.astype("category").cat.codes.to_numpy()
    # Build the frame in one go (no per-column insertion) and reuse the
    # original index positionally, so duplicate index labels are harmless.
    return pd.DataFrame(encoded, index=df.index)


# Random Forest

# Decision Tree

In [126]:
class DecisionNode:
    """A single node of a binary decision tree.

    An internal node stores the feature index and threshold of its test plus
    the two child nodes; a leaf stores only ``value``. All fields default to
    ``None``.
    """

    def __init__(self, feature_idx=None, threshold=None, value=None, true_branch=None, false_branch=None):
        # Children: where to go when the node's test is True / False.
        self.true_branch, self.false_branch = true_branch, false_branch
        # Split rule: which feature is tested and against which threshold.
        self.feature_idx, self.threshold = feature_idx, threshold
        # Prediction held by the node when it is a leaf.
        self.value = value

In [127]:
class DecisionTree():
    """Regression tree that splits every node on the median of one feature.

    At each node every feature column is tried with its median as the
    threshold. The feature whose split yields the lowest mean squared error
    (targets measured against their own branch mean) is kept, provided it is
    strictly better than not splitting. Leaves predict the mean target of the
    rows that reached them.
    """

    def __init__(self, min_mse=float('inf'), max_depth=10):
        self.root = None  # root DecisionNode; set by fit()
        # Accepted and stored for backward compatibility only. The split
        # criterion compares against each node's own MSE, so this value is
        # never consulted.
        self.min_mse = min_mse
        self.max_depth = max_depth  # maximum number of splits on any root-to-leaf path

    def fit(self, X, y):
        """Grow the tree from feature frame ``X`` and target Series ``y``."""
        self.root = self.build_tree(X, y)

    def build_tree(self, X, y, current_depth=0):
        """Recursively build the subtree for the rows in ``X`` / ``y``.

        ``X`` is a DataFrame and ``y`` a Series with the same row order; rows
        are matched by position, so duplicate index labels (as produced by
        sampling with replacement) are fine.

        Returns the DecisionNode at the root of the subtree.
        """
        # Error of predicting the node mean: the baseline any split must beat.
        # Kept local so recursion does not clobber instance state.
        node_mse = self.calculate_MSE(y, np.mean(y))
        best_mse = node_mse
        best_split = None

        # Strict '<': a node sitting at max_depth must be a leaf.
        if current_depth < self.max_depth:
            for feature_idx in range(X.shape[1]):
                feature_values = X.iloc[:, feature_idx]
                threshold = feature_values.median()
                goes_true = (feature_values >= threshold).to_numpy(dtype=bool)
                n_true = int(goes_true.sum())
                if n_true == 0 or n_true == len(goes_true):
                    continue  # degenerate split: one side would be empty

                y_true = y.iloc[goes_true]
                # Rows failing the test (including NaN features) go to the
                # false branch, which is also where predict_value sends them.
                y_false = y.iloc[~goes_true]
                sq_err = np.concatenate((
                    (y_true - y_true.mean()).to_numpy() ** 2,
                    (y_false - y_false.mean()).to_numpy() ** 2,
                ))
                mse = sq_err.mean()
                if mse < best_mse:  # strict: ties keep the earlier feature
                    best_mse = mse
                    best_split = (feature_idx, threshold, goes_true)

        if best_split is not None:
            feature_idx, threshold, goes_true = best_split
            true_branch = self.build_tree(X.iloc[goes_true], y.iloc[goes_true], current_depth + 1)
            false_branch = self.build_tree(X.iloc[~goes_true], y.iloc[~goes_true], current_depth + 1)
            return DecisionNode(feature_idx=feature_idx, threshold=threshold,
                                true_branch=true_branch, false_branch=false_branch)

        return DecisionNode(value=np.mean(y))

    def calculate_MSE(self, y, y_pred):
        """Mean squared error of ``y`` against the mean of ``y_pred``.

        ``y_pred`` may be a scalar or array-like; it is reduced to its mean
        before comparison.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        y = np.array(y)
        if len(y) == 0:
            raise ValueError("calculate_MSE requires at least one target value")
        return np.mean((y - np.mean(y_pred)) ** 2)

    def split_by_feature(self, X, feature_idx, threshold):
        """Split ``X`` by comparing the column at ``feature_idx`` with ``threshold``.

        Returns ``(X_true, X_false)``: rows with value ``>= threshold`` and
        rows with value ``< threshold``.
        """
        column = X.iloc[:, feature_idx]
        return X[column >= threshold], X[column < threshold]

    def predict_value(self, x, tree=None):
        """Predict for one sample by walking from ``tree`` (default: root) to a leaf.

        ``x`` may be a pandas Series or any positionally indexable sequence.
        Features are always looked up by position, never by label.
        """
        node = self.root if tree is None else tree
        # Series.__getitem__ with an int on a label index is a deprecated
        # positional fallback; .iloc is unambiguous.
        features = x.iloc if hasattr(x, "iloc") else x
        while node.value is None:
            if features[node.feature_idx] >= node.threshold:
                node = node.true_branch
            else:
                node = node.false_branch
        return node.value

    def predict(self, X):
        """Predict for every row of DataFrame ``X``.

        Returns ``(ids, y_pred)``: the index labels of ``X`` and the matching
        predictions, both as lists in row order.
        """
        ids = list(X.index)
        y_pred = [self.predict_value(row, self.root) for row in X.to_numpy()]
        return ids, y_pred

In [128]:
class RandomForest():
    """Bagged ensemble of DecisionTree regressors.

    Each tree is fitted on a bootstrap sample (drawn with replacement) of the
    training frame, whose last column is taken as the target. Predictions are
    the per-row mean over all trees, truncated to ``int``.
    """

    def __init__(self):
        self.data = None  # training data set (loaded into memory)
        self.trees = []  # fitted decision trees

    def __load_data(self, df):
        """Remember the training frame."""
        self.data = df

    def __subsampling(self, train_set, sample_size_ratio, random_state=None):
        """Draw ``round(len(train_set) * sample_size_ratio)`` rows with replacement."""
        sample_number = round(len(train_set) * sample_size_ratio)
        if sample_number < 1:
            raise ValueError(
                "sample_size_ratio={} yields an empty sample for {} rows".format(
                    sample_size_ratio, len(train_set)))
        return train_set.sample(sample_number, replace=True, random_state=random_state)

    def build_model(self, train_set, sample_size_ratio, number_of_trees, max_depth=5, random_state=None):
        """Fit ``number_of_trees`` trees, replacing any previously built ones.

        Parameters
        ----------
        train_set : pandas.DataFrame
            Features in all columns but the last; target in the last column.
        sample_size_ratio : float
            Bootstrap sample size as a fraction of ``len(train_set)``.
        number_of_trees : int
        max_depth : int
            Forwarded to every DecisionTree.
        random_state : int or None
            Seed for the bootstrap draws. ``None`` keeps the previous
            behaviour of drawing from NumPy's global random state.
        """
        self.__load_data(train_set)
        self.trees = []  # a rebuild must not mix in trees from an earlier fit
        # One generator shared across trees: reproducible, yet each tree still
        # sees a different sample.
        rng = None if random_state is None else np.random.RandomState(random_state)
        for _ in range(number_of_trees):
            sample = self.__subsampling(train_set, sample_size_ratio, rng)
            X = sample.iloc[:, :-1]
            y = sample.iloc[:, -1]
            # Keyword argument: DecisionTree's first positional parameter is
            # min_mse, not max_depth.
            tree = DecisionTree(max_depth=max_depth)
            tree.fit(X, y)
            self.trees.append(tree)

    def predict(self, test_set):
        """Average the predictions of all trees for every row of ``test_set``.

        Returns ``(ids, mean_values)``: the row ids reported by the trees and
        the per-row ensemble mean truncated to ``int``.

        Raises
        ------
        ValueError
            If no trees have been built yet.
        """
        if not self.trees:
            raise ValueError("no trees available; call build_model() first")
        ids = None
        per_tree = []
        for tree in self.trees:
            ids, tree_prediction = tree.predict(test_set)
            per_tree.append(tree_prediction)
        # Rows = trees, columns = samples; average down the tree axis.
        ensemble_mean = np.mean(np.asarray(per_tree, dtype=float), axis=0)
        mean_values = [int(value) for value in ensemble_mean]
        return ids, mean_values

In [129]:
# Read the train and test CSVs (row ids come from the 'Id' column) and run
# both through the same cleaning step.
df = preprocess_df(pd.read_csv('housing_price_train.csv', index_col='Id'))
X_test = preprocess_df(pd.read_csv('housing_price_test.csv', index_col='Id'))


In [130]:
# Encode train and test features together. Category codes are assigned from
# the values present in the frame being encoded, so encoding the two frames
# separately can map the same label to different integers in train vs. test.
# Assumes the target is the last column of `df` and is already numeric after
# preprocessing -- TODO confirm for other datasets.
assert list(df.columns[:-1]) == list(X_test.columns), "train/test feature columns differ"
n_train = len(df)
combined_features = pd.concat([df.iloc[:, :-1], X_test])
combined_numeric = df_to_numerical(combined_features)

train_df = combined_numeric.iloc[:n_train].copy()
train_df[df.columns[-1]] = df.iloc[:, -1].to_numpy()  # target stays the last column
test_df = combined_numeric.iloc[n_train:]

In [131]:
# The bootstrap samples are drawn from NumPy's global random state; seed it so
# a Restart & Run All reproduces the same forest and predictions.
np.random.seed(42)
rf = RandomForest()
rf.build_model(train_set=train_df, sample_size_ratio=0.3, number_of_trees=2, max_depth=2)
ids, mean_values = rf.predict(test_df)

In [132]:
# Assemble the predictions into an Id-indexed frame and write it to disk.
data = dict(Id=ids, SalePrice=mean_values)
output = pd.DataFrame(data=data).set_index('Id')
output.to_csv('samples.csv')