In [158]:
import pandas as pd
import numpy as np
from math import log
import random

### Helper Functions

In [159]:
def preprocess_df(df, categorical_cols=("MSSubClass", "OverallQual", "OverallCond")):
    """Fill missing values and normalise column dtypes on a copy of ``df``.

    * ``int64`` / ``float64`` columns: missing values are replaced by the
      column median; ``float64`` columns are then cast to ``int64`` (the cast
      truncates any fractional part).
    * every other column: missing values become the string ``"NA"`` and the
      column is converted to ``category``.
    * columns named in ``categorical_cols`` are finally forced to ``category``
      even though they hold numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is NOT modified; a processed copy is returned.
    categorical_cols : iterable of str, optional
        Numeric-looking columns that should be treated as categorical.
        Names that are not present in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        The processed copy.
    """
    # Work on a copy so re-running the calling cell never sees half-processed data.
    df = df.copy()

    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype == "int64" or col_dtype == "float64":
            df[col] = df[col].fillna(df[col].median())
            if col_dtype == "float64":
                # NOTE: raises if the column was entirely missing (median is NaN).
                df[col] = df[col].astype('int64')
        else:
            df[col] = df[col].fillna("NA").astype('category')

    # Numeric codes that are really labels, not quantities.
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].astype('category')
    return df
    
def df_to_numerical(df):
    """Return an all-numeric version of ``df`` without modifying ``df``.

    ``int64`` columns are passed through under their original name. Every
    other column is converted to ``category`` and replaced by its integer
    category codes in a column named ``"<original>_Cat"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; left untouched (the category conversion happens on a
        temporary Series, so slices of another frame are safe to pass).

    Returns
    -------
    pandas.DataFrame
        New frame with the same row index and one output column per input
        column, in the original column order.
    """
    # A dict keeps "later assignment overwrites an earlier column of the same
    # name" semantics while letting us assemble the frame in one go.
    encoded = {}
    for col in df.columns:
        column = df[col]
        if column.dtype == "int64":
            encoded[col] = column
        else:
            new_col = '{val}_Cat'.format(val=col)
            encoded[new_col] = column.astype("category").cat.codes

    if not encoded:
        return pd.DataFrame(index=df.index)
    return pd.concat(list(encoded.values()), axis=1, keys=list(encoded.keys()))


# Random Forest

In [160]:
class RandomForest():
    """Bagging ensemble of ``DecisionTree`` regressors.

    Each tree is fitted on a bootstrap sample (drawn with replacement) of the
    training set; the forest prediction is the per-row mean of the individual
    tree predictions.
    """

    def __init__(self):
        self.data = None  # training data set (loaded into memory)
        self.trees = []  # fitted decision trees

    def __load_data(self, df):
        """Remember the training set used for the most recent fit."""
        self.data = df

    def __subsampling(self, train_set, sample_size_ratio):
        """Draw a bootstrap sample of ``round(len(train_set) * sample_size_ratio)`` rows."""
        # Size the sample from the frame actually being sampled, not from self.data.
        sample_number = round(len(train_set) * sample_size_ratio)
        return train_set.sample(sample_number, replace=True)

    def build_model(self, train_set, sample_size_ratio, number_of_trees, max_depth=3):
        """Fit ``number_of_trees`` trees on bootstrap samples of ``train_set``.

        Parameters
        ----------
        train_set : pandas.DataFrame
            Training data. The LAST column is used as the target, all
            preceding columns are used as features.
        sample_size_ratio : float
            Size of each bootstrap sample relative to ``len(train_set)``.
        number_of_trees : int
            Number of trees to grow.
        max_depth : int or float, optional
            ``max_depth`` handed to every ``DecisionTree`` (default 3).
        """
        self.__load_data(train_set)
        # Start from scratch so that refitting does not mix in trees from a previous fit.
        self.trees = []
        for _ in range(number_of_trees):
            sample = self.__subsampling(train_set, sample_size_ratio)
            X = sample.iloc[:, :-1]
            y = sample.iloc[:, -1]
            tree = DecisionTree(max_depth=max_depth)
            tree.fit(X, y)
            self.trees.append(tree)

    def predict(self, test_set):
        """Return the per-row mean of all tree predictions as a list.

        Raises
        ------
        ValueError
            If the forest has no trees (``build_model`` was not called).
        """
        if not self.trees:
            raise ValueError("RandomForest has no trees; call build_model() first")
        # Shape (n_trees, n_rows); average over the tree axis to get one value per row.
        per_tree = [tree.predict(test_set) for tree in self.trees]
        return np.mean(per_tree, axis=0).tolist()

# Decision Tree

In [161]:
class DecisionNode():
    """One node of a decision tree.

    An internal node carries a split (``feature_idx`` and ``threshold``) plus
    the two child nodes; a leaf carries only ``value``.
    """

    def __init__(self, feature_idx=None, threshold=None, value=None, true_branch=None, false_branch=None):
        # Split description: which feature is tested and against which threshold.
        self.feature_idx, self.threshold = feature_idx, threshold
        # Prediction stored at the node when it is a leaf.
        self.value = value
        # Children followed when the decision evaluates to True / False.
        self.true_branch, self.false_branch = true_branch, false_branch

In [162]:
class DecisionTree():
    """Regression tree grown greedily by variance reduction.

    ``int64`` feature columns are split with ``value >= threshold``; every
    other column (for example small-integer category codes) is split with
    ``value == threshold``. Leaves store the mean target of their rows.
    """

    def __init__(self, min_info_gain=1e-7, max_depth=float("inf")):
        self.root = None  # root of this tree
        self.min_info_gain = min_info_gain  # minimum variance reduction required to split
        # Splits are attempted while current_depth <= max_depth, so leaves can
        # sit at depth max_depth + 1.
        self.max_depth = max_depth

    def fit(self, X, y):
        """Grow the tree from feature frame ``X`` and target Series ``y``."""
        self.root = self.build_tree(X, y)

    def build_tree(self, X, y, current_depth=0):
        """Recursively build the subtree for the rows in ``X`` / ``y``.

        Returns a ``DecisionNode``: an internal node when some split reduces
        the target variance by more than ``min_info_gain``, otherwise a leaf
        holding ``np.mean(y)``.
        """
        decision = None
        subtrees = None
        max_variance_reduction = -float('inf')
        # Target travels as the last column so one boolean mask splits X and y together.
        df = pd.concat((X, y), axis=1)
        n_features = X.shape[1]

        if current_depth <= self.max_depth:
            for feature_idx in range(n_features):
                for threshold in X.iloc[:, feature_idx].unique():
                    X_true, X_false = self.split_by_feature(df, feature_idx, threshold)
                    # A split that leaves one side empty separates nothing.
                    if len(X_true) == 0 or len(X_false) == 0:
                        continue
                    y_true = X_true.iloc[:, -1]
                    y_false = X_false.iloc[:, -1]
                    variance_reduction = self.reduce_variance(y, y_true, y_false)
                    if variance_reduction > max_variance_reduction:
                        max_variance_reduction = variance_reduction
                        decision = {"feature_idx": feature_idx, "threshold": threshold}
                        subtrees = {"X_true": X_true.iloc[:, :-1],
                                    "y_true": y_true,
                                    "X_false": X_false.iloc[:, :-1],
                                    "y_false": y_false}

        # Branch only when a valid split exists AND it is worth more than the
        # configured minimum; otherwise this node becomes a leaf.
        if subtrees is not None and max_variance_reduction > self.min_info_gain:
            true_branch = self.build_tree(subtrees["X_true"], subtrees["y_true"], current_depth + 1)
            false_branch = self.build_tree(subtrees["X_false"], subtrees["y_false"], current_depth + 1)
            return DecisionNode(feature_idx=decision["feature_idx"], threshold=decision["threshold"],
                                true_branch=true_branch, false_branch=false_branch)

        return DecisionNode(value=np.mean(y))

    def split_by_feature(self, X, feature_idx, threshold):
        """Partition the rows of ``X`` on column ``feature_idx``.

        Returns ``(X_true, X_false)``: rows that satisfy the decision and rows
        that do not.
        """
        column = X.iloc[:, feature_idx]
        # if the feature is numerical
        if column.dtype == "int64":
            X_true = X[column >= threshold]
            X_false = X[column < threshold]
        # if the feature is categorical
        else:
            X_true = X[column == threshold]
            X_false = X[column != threshold]
        return X_true, X_false

    def reduce_variance(self, y, y_true, y_false):
        """Variance of ``y`` minus the size-weighted variances of the two parts."""
        p_true = len(y_true) / len(y)
        p_false = len(y_false) / len(y)
        return np.var(y) - (p_true * np.var(y_true)) - (p_false * np.var(y_false))

    @staticmethod
    def _is_numeric_split(threshold):
        """True when ``threshold`` came from an ``int64`` (ordered) feature.

        Thresholds are taken from ``Series.unique()`` during training, so an
        ``int64`` column yields ``np.int64`` thresholds. Using the threshold
        type keeps prediction consistent with ``split_by_feature``.
        """
        if isinstance(threshold, (bool, np.bool_)):
            return False
        return isinstance(threshold, (int, np.int64))

    def predict_value(self, x, tree=None):
        """Walk the tree for a single row ``x`` and return the leaf value.

        ``x`` is any positionally indexable row (tuple, list, 1-D array or
        pandas Series).
        """
        if tree is None:
            tree = self.root
        if tree.value is not None:
            return tree.value

        # Series rows must be read by position, not by label.
        if isinstance(x, pd.Series):
            feature_value = x.iloc[tree.feature_idx]
        else:
            feature_value = x[tree.feature_idx]

        if self._is_numeric_split(tree.threshold):
            go_true = feature_value >= tree.threshold
        else:
            go_true = feature_value == tree.threshold
        branch = tree.true_branch if go_true else tree.false_branch
        return self.predict_value(x, branch)

    def predict(self, X):
        """Return a list with one prediction per ROW of ``X``.

        ``X`` may be a DataFrame or any iterable of rows.
        """
        if isinstance(X, pd.DataFrame):
            # Iterating a DataFrame directly yields column labels, not rows.
            rows = X.itertuples(index=False, name=None)
        else:
            rows = X
        return [self.predict_value(row, self.root) for row in rows]

In [163]:
# Read each CSV and push it straight through the shared preprocessing step.
df = preprocess_df(pd.read_csv('housing_price_train.csv'))
X_test = preprocess_df(pd.read_csv('housing_price_test.csv'))

# All columns but the last become X_train; the last column becomes y_train.
X_train, y_train = df.iloc[:, :-1], df.iloc[:, -1]

In [164]:
# Positional feature indices are only meaningful if both frames share one layout.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

# Encode train and test in ONE pass so a given category maps to the same
# integer code in both frames; encoding them separately could assign
# different codes to the same label.
features_all = pd.concat([X_train, X_test], keys=["train", "test"])
features_all_num = df_to_numerical(features_all)
X_train = features_all_num.loc["train"]
X_test = features_all_num.loc["test"]

In [165]:
# DataFrame.sample draws from NumPy's global RNG when no random_state is
# given, so seed it to make the bootstrap samples reproducible.
np.random.seed(42)

# build_model uses the LAST column of train_set as the target, so the target
# has to be appended to the feature frame before training.
train_set = pd.concat([X_train, y_train], axis=1)

rf = RandomForest()
rf.build_model(train_set=train_set, sample_size_ratio=0.3, number_of_trees=2)

# Predict once and reuse the result for the export.
predictions = rf.predict(X_test)
pd.Series(predictions).to_csv('samples.csv')

In [166]:
# tree = DecisionTree(max_depth=3)
# tree.fit(X_train, y_train)
# tree.predict(X_test)