In [None]:
import random
import pandas as pd
import numpy as np
import torchvision
from google.colab import drive
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import *
from sklearn.metrics import confusion_matrix, zero_one_loss, mean_squared_error, accuracy_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from torchvision import transforms
from collections import defaultdict
import math

# Section 1: Weighted Random Forest implementation

In the first part, you are requested to implement a variation of Random Forest, which we will call "weighted" random forest (WRF vs. RF).


In [None]:
# TODO 1: implement WRF following the provided class API. You should support both, classification as
# well as regression (the type argument can be either "cat" or "reg"). You should use DecisionTreeClassifier
# and DecisionTreeRegressor as the underlying trees.

class WRF(BaseEstimator):
  '''
    Weighted Random Forest (WRF).

    A bagging ensemble of sklearn decision trees in which every tree's vote is weighted by
    its performance on its own out-of-bag (OOB) instances. Supports classification
    (type="cat", weighted voting) and regression (type="reg", weighted mean).
  '''

  def __init__(self, n_trees=100, max_depth=5, n_features=None, type="cat", weight_type="div"):

    '''
      init a WRF estimator with the following parameters:

      n_trees: the number of trees to use.
      max_depth: the depth of each tree (will be passed along to DecisionTreeClassifier/DecisionTreeRegressor).
      n_features: the number of features to use for every split. The number is given to DecisionTreeClassifier/Regressor as max_features.
      type: "cat" for categorization; any other value (e.g. "reg") selects regression.
      weight_type: the tree weighting technique. 'div' for 1/error; any other value (e.g. 'sub') for 1-error.
    '''

    self.n_trees = n_trees
    self.max_depth = max_depth
    self.n_features = n_features
    self.type = type
    self.weight_type = weight_type

  def fit(self, X, y):

    '''
      Fit the forest on the data X with response y.

      Every tree is trained on its own bootstrap sample and weighted according to its error
      on the instances left out of that sample. After all trees are trained, the weights are
      normalized once so that they sum to 1.

      Returns self, following the sklearn estimator convention.
    '''

    self.trees = []
    self.weights = []

    for _ in range(self.n_trees):
      tree = self.build_tree()
      X_tree, y_tree, X_oob, y_oob = self.bootstrap(X, y)
      tree.fit(X_tree, y_tree)
      self.trees.append(tree)
      self.weights.append(self.calculate_weight(tree, X_oob, y_oob))

    # Normalize only after ALL raw weights are known. Normalizing inside the loop would mix
    # already-normalized weights with the newest raw weight and distort their proportions.
    weights_sum = sum(self.weights)
    if weights_sum != 0:
      self.weights = [weight / weights_sum for weight in self.weights]
    elif self.weights:
      # All raw weights cancel out (e.g. every tree has weight 0): fall back to a plain average.
      self.weights = [1 / len(self.weights)] * len(self.weights)

    return self

  def build_tree(self):

    '''
      Create an unfitted decision tree matching self.type:
      DecisionTreeClassifier for "cat", DecisionTreeRegressor otherwise.
    '''

    if self.type == "cat":
      return DecisionTreeClassifier(max_depth=self.max_depth, max_features=self.n_features)

    return DecisionTreeRegressor(max_depth=self.max_depth, max_features=self.n_features)

  def bootstrap(self, X, y):

    '''
      This method creates a bootstrap of the dataset (uniformly sample len(X) samples from X with repetitions).
      It returns X_tree, y_tree, X_oob, y_oob (each one a list).
      X_tree, y_tree are the bootstrap collections for the given X and y.
      X_oob, y_oob are the out of bag remaining instances (the ones that were not sampled as part of the bootstrap)
    '''

    n_samples = len(X)
    samples = random.choices(range(n_samples), k=n_samples)

    # Set membership keeps the OOB scan linear; testing against the list is quadratic.
    in_bag = set(samples)
    oob = [i for i in range(n_samples) if i not in in_bag]

    X_tree, y_tree = [X[i] for i in samples], [y[i] for i in samples]
    X_oob, y_oob = [X[i] for i in oob], [y[i] for i in oob]

    return X_tree, y_tree, X_oob, y_oob

  def calculate_weight(self, tree, X_oob, y_oob):

    '''
      This method calculates a weight for the given tree, based on its performance on
      the OOB instances. We support two different types:
      if self.weight_type == 'div', we return 1/error and otherwise ('sub') we
      return 1-error. The error is the error of the tree on the OOB instances.
      For classification we use 0/1 loss error (i.e., count 1 for every wrong OOB instance and divide by the number of OOB instances),
      and for regression we use mean square error of the OOB instances.

      Note: the regression MSE is not bounded by 1, so 1-error can be negative there.
    '''

    y_hat = tree.predict(X_oob)

    if self.type == "cat":
      error = zero_one_loss(y_oob, y_hat)
    else:
      error = mean_squared_error(y_oob, y_hat)

    if self.weight_type == "div":
      # A tree that is perfect on its OOB instances has error 0; clamp the error so the
      # weight stays finite (an infinite weight would turn the normalized weights into NaN).
      return 1 / max(error, np.finfo(float).eps)

    return 1 - error

  def predict(self, X):

    '''
      Predict the label/value of the given instances in the X matrix.
      For classification we use weighted voting, and for regression a weighted mean.
      Return a list of predictions for the given instances.
    '''

    # After the transpose, every row holds the predictions of all trees for one instance.
    trees_preds = np.array([tree.predict(X) for tree in self.trees]).transpose()

    wrf_preds = []

    for preds in trees_preds:

      if self.type == "cat":
        # Accumulate the weight of every tree behind the label it voted for.
        scores = defaultdict(float)
        for tree_pred, weight in zip(preds, self.weights):
          scores[tree_pred] += weight
        wrf_preds.append(max(scores, key=scores.get))

      else:
        # The weights sum to 1, so the weighted mean is simply the weighted sum.
        wrf_preds.append(sum(tree_pred * weight for tree_pred, weight in zip(preds, self.weights)))

    return wrf_preds
      

# Section 2: Evaluation

In this section you are requested to evaluate your implementation, and compare it with RandomForestClassifier and RandomForestRegressor.

In [None]:
# TODO 3: Implement a tuning method for your classifier. 
# Note: you could potentially implement WRF as a sklearn classifier and then 
# simply use GridSearchCV inside. For those of you who want to take this route, 
# you are welcome to modify the implementation of WRF accordingly. Check out: https://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator

def tune(classifier, X, y, arguments, performance, cv=5):

  '''
    Grid-search the hyper-parameters of an estimator by delegating to sklearn's GridSearchCV.

    Every combination of the values in "arguments" is evaluated with cv-fold cross validation
    and the best combination is returned as a dictionary.

    classifier: the estimator to tune (e.g. WRF or an sklearn random forest)
    X, y: the dataset to tune over
    arguments: a dictionary whose keys are argument names of the estimator (for WRF: n_trees,
    max_depth, n_features, weight_type) and whose values are lists of values to test
    performance: passed to GridSearchCV as "scoring"; this notebook uses "accuracy" for
    classification and "neg_mean_squared_error" for regression
    cv: the number of cross validation folds
  '''

  search = GridSearchCV(estimator=classifier, param_grid=arguments, scoring=performance, cv=cv, n_jobs=3)

  return search.fit(X, y).best_params_

In [None]:
# TODO 4: Evaluate your implementation and compare it to RandomForestClassifier/Regressor provided by sklearn.

# For classification use the Fashion MNIST dataset, but subsample it to contain only 7K instances (out of the 60K available; you may simply select the first 7K instances from the data).
# - Tune both classifiers (WRF and RandomForestClassifier) before evaluation.
# - Evaluate both classifiers on the first 5K instances from the provided test data.
# - Report accuracy, and provide a full confusion matrix for each classifier.

# For regression use the California housing dataset from Kaggle that we used in class:
# https://www.kaggle.com/harrywang/housing#housing.csv

# - Split the dataset to train and test (test_size=0.1, random_state=0)
# - Tune both regressors (WRF and RandomForestRegressor) before evaluation on the training set.
# - Evaluate both regressors on the test set.
# - Report mean square error.


## **Classification - Fashion MNIST**

In [None]:
# Download the Fashion MNIST train and test splits into ./data.
train_set = torchvision.datasets.FashionMNIST("./data", download=True, transform=transforms.ToTensor())
test_set = torchvision.datasets.FashionMNIST("./data", download=True, train=False, transform=transforms.ToTensor())

# Keep the first 7K training and 5K test instances, flattening every image into a single row.
n_train, n_test = 7000, 5000

X_train = np.array(train_set.data[:n_train].view(n_train, -1))
y_train = np.array(train_set.targets[:n_train])

X_test = np.array(test_set.data[:n_test].view(n_test, -1))
y_test = np.array(test_set.targets[:n_test])

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw
Processing...
Done!




### Random Forest (RF) Classifier

In [None]:
# Hyper-parameter grid searched for sklearn's RandomForestClassifier.
rf_clf_params = {
    "n_estimators": [100, 200, 500, 1000],
    "max_depth": [5, 10, 15, 20],
    "max_features": [None, 1, 2],
}

rf_clf_best_params = tune(
    classifier=RandomForestClassifier(random_state=0),
    X=X_train,
    y=y_train,
    arguments=rf_clf_params,
    performance="accuracy",
)
print(rf_clf_best_params)



{'max_depth': 20, 'max_features': None, 'n_estimators': 200}


In [None]:
# The tuned dictionary holds exactly n_estimators, max_depth and max_features (the keys of
# the searched grid), so it can be unpacked straight into the constructor.
rf_clf = RandomForestClassifier(random_state=0, **rf_clf_best_params)

rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features=None,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
# Predict the test-set labels with the tuned RF classifier.
rf_clf_preds = rf_clf.predict(X_test)
print(rf_clf_preds)

[9 2 1 ... 9 6 7]


In [None]:
# Full confusion matrix of the tuned RF classifier on the test set.
confusion_matrix(y_test, rf_clf_preds)

array([[387,   2,  14,  25,   2,   3,  67,   0,   7,   0],
       [  2, 455,   1,  19,   1,   1,   2,   0,   0,   0],
       [  6,   0, 384,   9,  79,   2,  37,   0,   4,   0],
       [ 17,   2,  10, 428,  20,   1,  17,   0,   5,   0],
       [  3,   0,  68,  27, 396,   2,  24,   0,   1,   0],
       [  0,   0,   0,   0,   0, 449,   0,  27,   2,   7],
       [ 68,   2,  68,  22,  55,   0, 256,   0,  11,   0],
       [  0,   0,   0,   0,   0,  15,   0, 453,   3,  29],
       [  0,   2,   3,   3,   3,   6,   6,   4, 497,   2],
       [  0,   0,   1,   0,   0,   9,   2,  17,   1, 447]])

In [None]:
# Test accuracy of the tuned RF classifier, reported as a percentage.
rf_clf_accuracy = accuracy_score(y_test, rf_clf_preds) * 100
print('Random Forest Classifier Accuracy: %.2f%%' % rf_clf_accuracy)

Random Forest Classifier Accuracy: 83.04%


### Weighted Random Forest (WRF) Classifier

In [None]:
# Hyper-parameter grid searched for the WRF classifier.
wrf_clf_params = {
    "n_trees": [100, 200, 500, 1000],
    "max_depth": [5, 10, 15, 20],
    "n_features": [None, 1, 2],
    "weight_type": ["div", "sub"],
}

wrf_clf_best_params = tune(
    classifier=WRF(type="cat"),
    X=X_train,
    y=y_train,
    arguments=wrf_clf_params,
    performance="accuracy",
)

In [None]:
# Best hyper-parameter combination found for the WRF classifier.
print(wrf_clf_best_params)

{'max_depth': 20, 'n_features': None, 'n_trees': 1000, 'weight_type': 'sub'}


In [None]:
# The tuned dictionary holds exactly n_trees, max_depth, n_features and weight_type (the
# keys of the searched grid), so it can be unpacked straight into the constructor.
wrf_clf = WRF(type="cat", **wrf_clf_best_params)

wrf_clf.fit(X_train, y_train)

In [None]:
# WRF.predict returns a plain Python list. Wrapping it in an array makes the print below
# show a truncated summary instead of all 5,000 labels, matching the RF cell's output.
wrf_clf_preds = np.array(wrf_clf.predict(X_test))
print(wrf_clf_preds)

[9, 2, 1, 1, 6, 1, 2, 6, 5, 7, 4, 5, 5, 3, 4, 1, 2, 4, 8, 0, 2, 7, 7, 5, 1, 2, 4, 6, 9, 4, 8, 8, 3, 0, 8, 0, 5, 5, 7, 9, 6, 1, 6, 7, 6, 5, 2, 3, 2, 2, 4, 2, 5, 6, 2, 2, 8, 4, 8, 0, 7, 7, 8, 5, 1, 1, 3, 3, 7, 8, 7, 0, 0, 6, 4, 3, 1, 2, 8, 4, 1, 8, 5, 9, 5, 0, 3, 2, 0, 6, 5, 3, 6, 7, 1, 8, 0, 3, 2, 2, 3, 6, 7, 2, 7, 8, 5, 9, 9, 4, 2, 5, 7, 3, 5, 4, 8, 6, 7, 2, 0, 0, 9, 9, 3, 0, 6, 4, 1, 5, 4, 1, 9, 1, 8, 4, 2, 1, 2, 5, 1, 3, 0, 0, 1, 6, 1, 3, 6, 6, 4, 4, 1, 3, 5, 6, 4, 7, 9, 3, 7, 2, 3, 9, 0, 9, 6, 7, 4, 2, 6, 5, 0, 1, 2, 1, 3, 0, 9, 1, 0, 9, 3, 6, 7, 9, 9, 4, 4, 7, 1, 2, 1, 2, 3, 2, 8, 8, 6, 1, 1, 0, 2, 9, 2, 4, 0, 7, 9, 8, 4, 1, 8, 4, 1, 3, 0, 3, 7, 4, 8, 8, 2, 0, 7, 7, 6, 2, 7, 2, 7, 8, 9, 2, 9, 0, 5, 1, 4, 2, 5, 2, 9, 2, 2, 8, 6, 8, 2, 4, 9, 7, 0, 5, 5, 2, 8, 5, 6, 3, 0, 4, 8, 0, 0, 6, 3, 8, 9, 6, 1, 3, 0, 2, 3, 0, 8, 3, 7, 2, 0, 1, 2, 3, 0, 4, 3, 7, 5, 3, 7, 9, 5, 3, 5, 5, 1, 9, 8, 8, 3, 3, 4, 8, 0, 0, 4, 9, 5, 9, 1, 6, 4, 2, 5, 6, 7, 1, 6, 4, 5, 2, 6, 5, 4, 2, 7, 7, 7, 3, 3, 5, 6, 

In [None]:
# Full confusion matrix of the tuned WRF classifier on the test set.
confusion_matrix(y_test, wrf_clf_preds)

array([[338,   2,  12,  40,   8,   2,  93,   0,  12,   0],
       [  2, 450,   2,  21,   1,   0,   3,   0,   1,   1],
       [ 15,   0, 334,  16,  81,   3,  61,   0,  11,   0],
       [ 18,  13,  11, 400,  23,   3,  28,   0,   4,   0],
       [  5,   8,  98,  27, 327,   2,  48,   0,   6,   0],
       [  1,   3,   0,   0,   0, 430,   0,  33,   4,  14],
       [ 69,   1,  62,  30,  68,   2, 239,   1,  10,   0],
       [  0,   0,   0,   0,   0,  30,   0, 431,   2,  37],
       [  7,   4,  11,   2,   3,   9,  13,   6, 466,   5],
       [  0,   0,   1,   0,   0,  11,   1,  22,   5, 437]])

In [None]:
# Test accuracy of the tuned WRF classifier, reported as a percentage.
wrf_clf_accuracy = accuracy_score(y_test, wrf_clf_preds) * 100
print('Weighted Random Forest Classifier Accuracy: %.2f%%' % wrf_clf_accuracy)

Weighted Random Forest Classifier Accuracy: 77.04%


## **Regression - California Housing dataset**

In [None]:
# Mount Google Drive (Colab only) and move into the exercise folder; the next cell reads
# housing.csv relative to this directory.
# NOTE(review): the path below is specific to the author's Drive layout - adjust before re-running.
drive.mount('/content/drive')
%cd '/content/drive/My Drive/Practical topics in Machine Learning/EX3'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Practical topics in Machine Learning/EX3


In [None]:
# Load the California housing dataset from the current working directory.
data = pd.read_csv('./housing.csv')

In [None]:
# Column dtypes and non-null counts; total_bedrooms is the only column with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [None]:
# Preview the first rows of the dataset.
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Data preparation
y = data["median_house_value"].to_numpy()
X = data.drop("median_house_value", axis=1)

# Although categorical attributes are allowed in DTs, sklearn version supports only float values.
# Therefore, we encode the categorical attribute as one hot. dtype=float keeps every column
# numeric regardless of the default dummy dtype of the installed pandas version.
X_oh = pd.concat([X.drop("ocean_proximity", axis=1),
                  pd.get_dummies(X["ocean_proximity"], dtype=float)], axis=1)

# Split BEFORE imputing, so that no statistic of the test rows leaks into the training data.
X_train_df, X_test_df, y_train, y_test = train_test_split(X_oh, y, test_size=0.1, random_state=0)

# Impute missing data in both splits with the column means of the training split only.
train_means = X_train_df.mean()

X_train = X_train_df.fillna(train_means).to_numpy()
X_test = X_test_df.fillna(train_means).to_numpy()

### Random Forest (RF) Regressor

In [None]:
# Hyper-parameter grid searched for sklearn's RandomForestRegressor.
rf_reg_params = {
    "n_estimators": [100, 200, 500, 1000],
    "max_depth": [5, 10, 15, 20],
    "max_features": [None, 1, 2],
}

rf_reg_best_params = tune(
    classifier=RandomForestRegressor(random_state=0),
    X=X_train,
    y=y_train,
    arguments=rf_reg_params,
    performance="neg_mean_squared_error",
)

print(rf_reg_best_params)



{'max_depth': 20, 'max_features': None, 'n_estimators': 1000}


In [None]:
# The tuned dictionary holds exactly n_estimators, max_depth and max_features (the keys of
# the searched grid), so it can be unpacked straight into the constructor.
rf_reg = RandomForestRegressor(random_state=0, **rf_reg_best_params)

rf_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=20, max_features=None, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [None]:
# Predict the test-set house values with the tuned RF regressor.
rf_reg_preds = rf_reg.predict(X_test)
print(rf_reg_preds)

[162478.93291667 246179.56214239 143254.46170765 ... 239798.51807956
 292349.37635389 162972.32155594]


In [None]:
# Test mean squared error of the tuned RF regressor.
rf_reg_mse = mean_squared_error(y_test, rf_reg_preds)
print("Random Forest Regressor MSE: {}".format('%.3f' % rf_reg_mse))

Random Forest Regressor MSE: 2074099431.577


### Weighted Random Forest (WRF) Regressor

In [None]:
# Hyper-parameter grid searched for the WRF regressor.
wrf_reg_params = {
    "n_trees": [100, 200, 500, 1000],
    "max_depth": [5, 10, 15, 20],
    "n_features": [None, 1, 2],
    "weight_type": ["div", "sub"],
}

wrf_reg_best_params = tune(
    classifier=WRF(type="reg"),
    X=X_train,
    y=y_train,
    arguments=wrf_reg_params,
    performance="neg_mean_squared_error",
)

In [None]:
# Best hyper-parameter combination found for the WRF regressor.
print(wrf_reg_best_params)

{'max_depth': 20, 'n_features': None, 'n_trees': 1000, 'weight_type': 'sub'}


In [None]:
# The tuned dictionary holds exactly n_trees, max_depth, n_features and weight_type (the
# keys of the searched grid), so it can be unpacked straight into the constructor.
wrf_reg = WRF(type="reg", **wrf_reg_best_params)

wrf_reg.fit(X_train, y_train)

In [None]:
# WRF.predict returns a plain Python list. Wrapping it in an array makes the print below
# show a truncated summary instead of every prediction, matching the RF cell's output.
wrf_reg_preds = np.array(wrf_reg.predict(X_test))
print(wrf_reg_preds)

[134799.9999655601, 267700.0000011405, 135118.1818163118, 67499.99999413443, 199999.99997503054, 101299.9999906721, 315499.999974888, 315299.99999545823, 262099.999981575, 255899.99998364563, 332766.66668301425, 59599.99999672098, 82557.89473923678, 101299.99998753564, 161299.99999745417, 359899.99997146614, 392200.00000759674, 97499.99999894093, 135392.85713858795, 361599.99998661916, 131099.9999991446, 57082.50000047477, 231999.99999463905, 259952.27273223246, 183174.99999691956, 169200.00000460283, 177899.99999794297, 166699.9999827617, 90599.99999775094, 57082.50000003038, 500000.99999999994, 500001.00001097773, 500001.00003395806, 167599.9999992668, 92700.0000056008, 110087.49999339867, 177500.00000525458, 124502.38095079284, 366700.00002863025, 116999.99999010183, 166300.0000155397, 74599.99999507128, 72649.9999985336, 270717.64708096563, 160599.99999396876, 82557.89473809625, 248200.00003103865, 135999.999994277, 165333.33333096586, 179499.99994490837, 294399.99999063136, 110300

In [None]:
# Test mean squared error of the tuned WRF regressor.
wrf_reg_mse = mean_squared_error(y_test, wrf_reg_preds)
print("Weighted Random Forest Regressor MSE: {}".format('%.3f' % wrf_reg_mse))

Weighted Random Forest Regressor MSE: 4240806657.520


> In conclusion, the Random Forest model provided by sklearn outperformed our Weighted Random Forest model on both tasks: 83.04% vs. 77.04% test accuracy for classification, and a test MSE of about 2.07e9 vs. 4.24e9 for regression.


