# Random forest

1. Notebook:
   - [RF from scratch](https://www.kaggle.com/code/fareselmenshawii/random-forest-from-scratch?scriptVersionId=138025147)

## Data

In [1]:
import math
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
import pandas as pd
import seaborn as sns

### Exploratory Data Analysis (EDA)

In [2]:
# Read the Iris CSV and discard its 'Id' column in a single chained expression.
iris = pd.read_csv('./data/Iris.csv').drop(columns='Id')

In [4]:
# Preview the first rows of the frame, styled with a sea-green background gradient.
(
    iris
    .head()
    .style
    .background_gradient(cmap=sns.light_palette('seagreen', as_cmap=True))
)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Features are every column except the trailing 'Species' label column;
# the target is that label column.
x = iris.drop(columns='Species')
y = iris['Species']

In [6]:
# Pie chart of how many rows belong to each species.
fig = px.pie(
    iris,
    names='Species',
    color_discrete_sequence=['#3dec84 ', '#009688 ', '#2E8B57 '],
    title='Data Distribution',
    template='plotly',
)
fig.show()

In [7]:
# Vertical box plot of petal length, one box (and colour) per species.
fig = px.box(
    data_frame=iris,
    x='Species',
    y='PetalLengthCm',
    color='Species',
    color_discrete_sequence=['#3dec84 ', '#009688 ', '#2E8B57 '],
    orientation='v',
)
fig.show()

In [8]:
# Sepal length vs. sepal width; marker colour encodes the species and
# marker size encodes the petal length.
fig = px.scatter(
    data_frame=iris,
    x='SepalLengthCm',
    y='SepalWidthCm',
    color='Species',
    size='PetalLengthCm',
    template='seaborn',
    color_discrete_sequence=['#3dec84 ', '#009688 ', '#2E8B57 '],
)

# Fix the figure size and colour both axes with the same green.
axis_style = {'color': "#36FF00"}
fig.update_layout(width=800, height=600, xaxis=axis_style, yaxis=axis_style)
fig.show()


### Data Preprocessing

In [10]:
# Convert the 'Species' label column to pandas' categorical dtype (in place on `iris`).
iris['Species'] = iris['Species'].astype('category')
# Integer category code for each row; not referenced by the cells shown in this notebook view.
codes = iris['Species'].cat.codes

In [12]:
from sklearn.model_selection import train_test_split

# Re-derive features (all but the last column) and target (last column) from
# the frame, then hold out 20% of the rows as a test set with a fixed seed.
x, y = iris.iloc[:, :-1], iris.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=41,
)


## Model

In [13]:
# DecisionTreeClassifier is the base learner used by the RandomForest class below.
from sklearn.tree import DecisionTreeClassifier
# Default-configured tree instance; not referenced by the cells shown in this notebook view.
m = DecisionTreeClassifier()

In [14]:
class RandomForest:
    """Bagging ensemble of scikit-learn decision trees with majority voting.

    Every tree is trained on its own bootstrap sample (rows drawn with
    replacement) of the training data. A prediction is the label that most
    trees vote for.

    Parameters
    ----------
    n_trees : int, default=7
        Number of decision trees in the ensemble.
    max_depth : int, default=7
        ``max_depth`` forwarded to every ``DecisionTreeClassifier``.
    min_samples : int, default=2
        ``min_samples_split`` forwarded to every ``DecisionTreeClassifier``.
    random_state : int or None, default=1
        Seed of the private random generator used for bootstrapping.
        ``None`` draws fresh entropy from the OS on every fit.
    """

    def __init__(self, n_trees=7, max_depth=7, min_samples=2, random_state=1):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.random_state = random_state
        self.trees = []
        # Private generator: advancing it between draws is what makes each
        # bootstrap sample different, and it leaves numpy's global RNG alone.
        self._rng = np.random.default_rng(random_state)

    def fit(self, x, y):
        """Train ``n_trees`` trees, each on a bootstrap sample of ``(x, y)``.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            Training features (ndarray, DataFrame, nested list, ...).
        y : array-like of shape (n_samples,)
            Training labels (ndarray, Series, list, ...).

        Returns
        -------
        RandomForest
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``x`` and ``y`` do not contain the same number of samples.
        """
        features = np.asarray(x)
        labels = np.asarray(y).reshape(-1)
        if features.shape[0] != labels.shape[0]:
            raise ValueError(
                f"x and y have inconsistent sample counts: "
                f"{features.shape[0]} != {labels.shape[0]}"
            )

        self.trees = []
        # Restart the generator so refitting with the same seed is reproducible.
        self._rng = np.random.default_rng(self.random_state)
        row_ids = np.arange(features.shape[0])

        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(max_depth=self.max_depth, min_samples_split=self.min_samples)
            # Bootstrap the row indices rather than a concatenated feature/label
            # matrix, so numeric features keep their dtype next to string labels.
            picked = self.bootstrap_samples(row_ids)
            tree.fit(features[picked], labels[picked])
            self.trees.append(tree)

        return self

    def bootstrap_samples(self, dataset):
        """Return ``len(dataset)`` rows of ``dataset`` drawn with replacement.

        Successive calls advance the instance's generator and therefore return
        different samples.
        """
        rows = np.asarray(dataset)
        n_samples = rows.shape[0]
        idx = self._rng.choice(n_samples, size=n_samples, replace=True)
        return rows[idx]

    def most_common_label(self, y):
        """Return the most frequent label in ``y``.

        Ties go to the label that appears first in ``y``. Raises ``ValueError``
        when ``y`` is empty.
        """
        counts = {}
        for label in y:
            counts[label] = counts.get(label, 0) + 1
        # dicts keep insertion order and max() returns the first maximum, so
        # ties resolve to the earliest-seen label.
        return max(counts, key=counts.get)

    def predict(self, x):
        """Predict a label for every row of ``x`` by majority vote of the trees.

        Raises
        ------
        ValueError
            If the forest has not been fitted yet.
        """
        if not self.trees:
            raise ValueError("RandomForest has no trained trees; call fit() before predict().")

        features = np.asarray(x)
        per_tree = np.array([tree.predict(features) for tree in self.trees])

        # (n_trees, n_samples) -> (n_samples, n_trees): one row of votes per sample.
        per_sample = np.swapaxes(per_tree, 0, 1)
        return np.array([self.most_common_label(votes) for votes in per_sample])

In [None]:
from sklearn.metrics import accuracy_score

# Train a 10-tree forest (max depth 10, split threshold 2 samples).
# fit() works on NumPy arrays (it reshapes the labels), so the pandas
# train/test objects are converted explicitly before being passed in.
model = RandomForest(n_trees=10, max_depth=10, min_samples=2)
model.fit(x_train.to_numpy(), y_train.to_numpy())

preds = model.predict(x_test.to_numpy())

# Fraction of held-out rows whose majority-vote label matches the truth.
accuracy_score(y_test, preds)

In [17]:
from sklearn.metrics import accuracy_score

# Baseline: one default-configured decision tree trained on the same split.
dt = DecisionTreeClassifier().fit(x_train, y_train)
# Evaluate the baseline on the held-out test rows.
predictions = dt.predict(x_test)
accuracy_score(y_test, predictions)

0.9