## Implementation of Decision Tree

In [1]:
import numpy as np
from scipy import stats
from sklearn.metrics import r2_score, accuracy_score


class DecisionNode:
    def __init__(self, col, split, lchild, rchild):
        self.col = col
        self.split = split
        self.lchild = lchild
        self.rchild = rchild

    def predict(self, x_test):
        # Make decision based upon x_test[col] and split
        return (
            self.lchild.predict(x_test)
            if x_test[self.col] < self.split
            else self.rchild.predict(x_test)
        )

    def leaf(self, x_test):
        """
        Given a single test record, x_test, return the leaf node reached
        by running it down the tree starting at this node.
        This is just like prediction, except we return the decision tree
        leaf rather than the prediction from that leaf.
        """
        return (
            self.lchild.leaf(x_test)
            if x_test[self.col] < self.split
            else self.rchild.leaf(x_test)
        )


class LeafNode:
    def __init__(self, y, prediction):
        "Create leaf node from y values and prediction; prediction is mean(y) or mode(y)"
        self.y = y
        self.n = len(y)
        self.prediction = prediction

    def predict(self, x_test):
        # return prediction
        return self.prediction

    def leaf(self, x_test):
        """
        Return itself.
        """
        return self


def gini(x):
    """
    Return the gini impurity score for values in y
    Assume y = {0,1}
    Gini = 1 - sum_i p_i^2 where p_i is the proportion of class i in y
    """
    _, class_counts = np.unique(x, return_counts=True)
    n = np.sum(class_counts)

    return 1 - np.sum((class_counts / n) ** 2)


def find_best_split(X, y, loss, min_samples_leaf, max_features):

    # case when this node cannot be further split
    if len(X) <= min_samples_leaf or len(np.unique(X)) == 1:
        return -1, -1

    # record current feature, split and loss
    best_loss = (-1, -1, loss(y))

    k = 11
    for i in np.random.choice(
        np.arange(X.shape[1]), size=int(max_features * X.shape[1]), replace=False
    ):
        # randomly pick k values
        idx_splits = np.random.choice(np.arange(len(X)), size=k)
        candidates = X[idx_splits, i].copy()
        for split in np.unique(candidates):
            yl = y[X[:, i] < split]
            yr = y[X[:, i] >= split]

            if len(yl) <= min_samples_leaf or len(yr) <= min_samples_leaf:
                continue

            total_loss = (len(yl) * loss(yl) + len(yr) *
                          loss(yr)) / (len(yl) + len(yr))
            if total_loss == 0:
                return i, split

            if total_loss < best_loss[-1]:
                best_loss = (i, split, total_loss)

    return best_loss[:-1]


class DecisionTree621:
    def __init__(self, max_features, min_samples_leaf=1, loss=None):
        self.max_features = max_features
        self.min_samples_leaf = min_samples_leaf
        self.loss = loss  # loss function; either np.var for regression or gini for classification

    def fit(self, X, y):
        """
        Create a decision tree fit to (X,y) and save as self.root, the root of
        our decision tree, for  either a classifier or regression.  Leaf nodes for classifiers
        predict the most common class (the mode) and regressions predict the average y
        for observations in that leaf.

        This function is a wrapper around fit_() that just stores the tree in self.root.
        """
        self.root = self.fit_(X, y)

    def fit_(self, X, y):
        """
        Recursively create and return a decision tree fit to (X,y) for
        either a classification or regression.  This function should call self.create_leaf(X,y)
        to create the appropriate leaf node, which will invoke either
        RegressionTree621.create_leaf() or ClassifierTree621.create_leaf() depending
        on the type of self.

        This function is not part of the class "interface" and is for internal use, but it
        embodies the decision tree fitting algorithm.

        (Make sure to call fit_() not fit() recursively.)
        """
        col, split = find_best_split(
            X, y, self.loss, self.min_samples_leaf, self.max_features
        )
        # terminating condition
        if col == -1:
            return self.create_leaf(y)

        XL, yl = X[X[:, col] < split].copy(), y[X[:, col] < split].copy()
        XR, yr = X[X[:, col] >= split].copy(), y[X[:, col] >= split].copy()
        lchild = self.fit_(XL, yl)
        rchild = self.fit_(XR, yr)

        return DecisionNode(col, split, lchild, rchild)

    def predict(self, X_test):
        """
        Make a prediction for each record in X_test and return as array.
        This method is inherited by RegressionTree621 and ClassifierTree621 and
        works for both without modification!
        """
        return np.array([self.root.predict(x_test) for x_test in X_test])


class RegressionTree621(DecisionTree621):
    def __init__(self, max_features, min_samples_leaf=1):
        super().__init__(max_features, min_samples_leaf, loss=np.var)

    def score(self, X_test, y_test):
        "Return the R^2 of y_test vs predictions for each record in X_test"
        return r2_score(y_test, self.predict(X_test))

    def create_leaf(self, y):
        """
        Return a new LeafNode for regression, passing y and mean(y) to
        the LeafNode constructor.
        """
        return LeafNode(y, np.mean(y))


class ClassifierTree621(DecisionTree621):
    def __init__(self, max_features, min_samples_leaf=1):
        super().__init__(max_features, min_samples_leaf, loss=gini)

    def score(self, X_test, y_test):
        "Return the accuracy_score() of y_test vs predictions for each record in X_test"
        return accuracy_score(y_test, self.predict(X_test))

    def create_leaf(self, y):
        """
        Return a new LeafNode for classification, passing y and mode(y) to
        the LeafNode constructor. Feel free to use scipy.stats to use the mode function.
        """
        return LeafNode(y, stats.mode(y)[0][0])


## Implementation of RandomForest

In [2]:
import numpy as np
from sklearn.utils import resample

class RandomForestRegressor621:
    def __init__(
        self, n_estimators=10, min_samples_leaf=3, max_features=0.3, oob_score=False
    ):
        # super().__init__(n_estimators, oob_score=oob_score)
        self.n_estimators = n_estimators
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.oob_score = oob_score
        self.oob_score_ = np.nan
        # each tree is represented by a tuple of (tree object, OOB index)
        self.trees = []

    def fit(self, X, y):
        """
        Given an (X, y) training set, fit all n_estimators trees to different,
        bootstrapped versions of the training data.  Keep track of the indices of
        the OOB records for each tree.  After fitting all of the trees in the forest,
        compute the OOB validation score estimate and store as self.oob_score_, to
        mimic sklearn.
        """
        for _ in range(self.n_estimators):
            bootstrapped_index = resample(range(len(X)), replace=True)
            oob_index = set(range(len(X))).difference(set(bootstrapped_index))

            X_bts, y_bts = X[bootstrapped_index].copy(
            ), y[bootstrapped_index].copy()
            tree = RegressionTree621(self.max_features, self.min_samples_leaf)
            tree.fit(X_bts, y_bts)

            self.trees.append((tree, oob_index))

        if self.oob_score:
            preds = []
            y_true = []
            for i, obs in enumerate(zip(X, y)):
                x_obs, y_obs = obs
                leaves = [
                    tree.root.leaf(x_obs)
                    for tree, oob_index in self.trees
                    if i in oob_index
                ]
                if leaves:
                    y_pred = np.sum(
                        [leaf.n * leaf.prediction for leaf in leaves]
                    ) / np.sum([leaf.n for leaf in leaves])
                    preds.append(y_pred)
                    y_true.append(y_obs)

            self.oob_score_ = r2_score(y_true, preds)

    def predict(self, X_test) -> np.ndarray:
        """
        Given a 2D nxp array with one or more records, compute the weighted average
        prediction from all trees in this forest. Weight each trees prediction by
        the number of observations in the leaf making that prediction.  Return a 1D vector
        with the predictions for each input record of X_test.
        """
        preds = np.zeros(X_test.shape[0])
        for i, x_test in enumerate(X_test):
            leaves = [tree.root.leaf(x_test) for tree, _ in self.trees]
            y_pred = np.sum([leaf.n * leaf.prediction for leaf in leaves]) / np.sum(
                [leaf.n for leaf in leaves]
            )
            preds[i] = y_pred

        return preds

    def score(self, X_test, y_test) -> float:
        """
        Given a 2D nxp X_test array and 1D nx1 y_test array with one or more records,
        collect the prediction for each record and then compute R^2 on that and y_test.
        """
        return r2_score(y_test, self.predict(X_test))


class RandomForestClassifier621:
    def __init__(
        self, n_estimators=10, min_samples_leaf=3, max_features=0.3, oob_score=False
    ):
        # super().__init__(n_estimators, oob_score=oob_score)
        self.n_estimators = n_estimators
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.oob_score = oob_score
        self.oob_score_ = np.nan
        # each tree is represented by a tuple of (tree object, OOB index)
        self.trees = []
        self.n_classes = 0

    def fit(self, X, y):
        """
        Given an (X, y) training set, fit all n_estimators trees to different,
        bootstrapped versions of the training data.  Keep track of the indices of
        the OOB records for each tree.  After fitting all of the trees in the forest,
        compute the OOB validation score estimate and store as self.oob_score_, to
        mimic sklearn.
        """
        self.n_classes = len(np.unique(y))
        for _ in range(self.n_estimators):
            bootstrapped_index = resample(range(len(X)), replace=True)
            oob_index = set(range(len(X))).difference(set(bootstrapped_index))

            X_bts, y_bts = X[bootstrapped_index].copy(
            ), y[bootstrapped_index].copy()
            tree = ClassifierTree621(self.max_features, self.min_samples_leaf)
            tree.fit(X_bts, y_bts)

            self.trees.append((tree, oob_index))

        if self.oob_score:
            preds = []
            y_true = []
            for i, obs in enumerate(zip(X, y)):
                x_obs, y_obs = obs
                leaves = [
                    tree.root.leaf(x_obs)
                    for tree, oob_index in self.trees
                    if i in oob_index
                ]
                if leaves:
                    class_count = np.zeros(self.n_classes, dtype=np.int64)
                    for leaf in leaves:
                        class_, counts = np.unique(leaf.y, return_counts=True)
                        for k, v in zip(class_, counts):
                            class_count[k] += v

                    y_pred = np.argmax(class_count)
                    preds.append(y_pred)
                    y_true.append(y_obs)

            self.oob_score_ = accuracy_score(y_true, preds)

    def predict(self, X_test) -> np.ndarray:
        """
        Given a 2D nxp array with one or more records, compute the weighted average
        prediction from all trees in this forest. Weight each trees prediction by
        the number of observations in the leaf making that prediction.  Return a 1D vector
        with the predictions for each input record of X_test.
        """
        preds = np.zeros(X_test.shape[0])
        for i, x_test in enumerate(X_test):
            leaves = [tree.root.leaf(x_test) for tree, _ in self.trees]
            class_count = np.zeros(self.n_classes, dtype=np.int64)
            for leaf in leaves:
                class_, counts = np.unique(leaf.y, return_counts=True)
                for k, v in zip(class_, counts):
                    class_count[k] += v

            y_pred = np.argmax(class_count)
            preds[i] = y_pred

        return preds

    def score(self, X_test, y_test) -> float:
        """
        Given a 2D nxp X_test array and 1D nx1 y_test array with one or more records,
        collect the prediction for each record and then compute R^2 on that and y_test.
        """
        return accuracy_score(y_test, self.predict(X_test))


## Doing Preprocessing. Here I am converting the 'InvoiceDate' column to datetime . However in my notebook it shows error if I use (format='mixed'). So I have not used this. But in other devices or platforms it is advised to use format='mixed'. I have commented it.

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime


    
#df = pd.read_csv("/kaggle/input/tata-online-retail-dataset/Online Retail Data Set.csv",encoding='unicode_escape')
#print(df.head())
df = pd.read_csv("C:\\Users\\hp\\Desktop\\Online_Retail_Data_Set.csv",encoding='unicode_escape')
df = df.drop_duplicates()
df['Description'] = df['Description'].fillna("Unknown")
df= df.dropna()
#df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'],format='mixed')
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df=df[df['UnitPrice'] > 0]
df=df[df['Quantity'] > 0]
df['Hour'] = df['InvoiceDate'].dt.hour
df['Month'] = df['InvoiceDate'].dt.month_name()
df['Day'] = df['InvoiceDate'].dt.day_name()
df['Year']= df['InvoiceDate'].dt.year
df=df[~df['Description'].str.contains('Adjust bad debt',case=False)]


## Removing Outliers

In [4]:
def remove_outlier(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5*IQR
    upper = Q3 + 1.5*IQR

    # Create arrays of Boolean values indicating the outlier rows
    idx = df[(df[col] >= upper) | (df[col] <= lower)].index
    df.drop(idx , inplace=True)

## Creating target variable sales by combining Quantity and Unit Price and dropping them because of high correlation and also dropping InvoiceDate because it is parsed and dropping year as it has 2 values.

In [5]:
remove_outlier(df, 'Quantity')
remove_outlier(df, 'UnitPrice')
df['Total'] = df['Quantity']*df['UnitPrice']
df.drop(['Quantity','UnitPrice'],axis=1,inplace=True)
df.drop(['InvoiceDate','Year',],axis=1,inplace=True)
print(df.head())

  InvoiceNo StockCode                          Description  CustomerID  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER     17850.0   
1    536365     71053                  WHITE METAL LANTERN     17850.0   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER     17850.0   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE     17850.0   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.     17850.0   

          Country  Hour    Month      Day  Total  
0  United Kingdom     8  January  Tuesday  15.30  
1  United Kingdom     8  January  Tuesday  20.34  
2  United Kingdom     8  January  Tuesday  22.00  
3  United Kingdom     8  January  Tuesday  20.34  
4  United Kingdom     8  January  Tuesday  20.34  


## Dropping InvoiceNo because it has high number of unique values and little correlation

In [6]:
print(len(df['InvoiceNo'].unique()))
df.drop(['InvoiceNo'],axis=1,inplace=True)

16820


## Dropping StockCode because it has high number of unique values and little correlation

In [7]:
print(len(df['StockCode'].unique()))
df.drop(['StockCode'],axis=1,inplace=True)

3374


## Dropping CustomerID because it has high number of unique values and little correlation

In [8]:
print(len(df['CustomerID'].unique()))
df.drop(['CustomerID'],axis=1,inplace=True)

4190


In [9]:
df.head()

Unnamed: 0,Description,Country,Hour,Month,Day,Total
0,WHITE HANGING HEART T-LIGHT HOLDER,United Kingdom,8,January,Tuesday,15.3
1,WHITE METAL LANTERN,United Kingdom,8,January,Tuesday,20.34
2,CREAM CUPID HEARTS COAT HANGER,United Kingdom,8,January,Tuesday,22.0
3,KNITTED UNION FLAG HOT WATER BOTTLE,United Kingdom,8,January,Tuesday,20.34
4,RED WOOLLY HOTTIE WHITE HEART.,United Kingdom,8,January,Tuesday,20.34


## Dropping Country as United Kingdom has 90% of the values.

In [10]:
print(df['Country'].value_counts())
df.drop(['Country'],axis=1,inplace=True)

United Kingdom          299728
Germany                   7437
France                    6881
EIRE                      5422
Spain                     2038
Belgium                   1657
Switzerland               1432
Portugal                  1252
Norway                     767
Netherlands                609
Italy                      603
Channel Islands            542
Finland                    518
Australia                  510
Cyprus                     475
Austria                    343
Denmark                    297
Poland                     253
Sweden                     249
Unspecified                210
Israel                     188
Iceland                    166
USA                        160
Singapore                  150
Canada                     126
Greece                     125
Japan                      110
Malta                       91
United Arab Emirates        58
European Community          47
RSA                         45
Lebanon                     33
Lithuani

## Converting categorical variable Hour with get_dummies and dropping it

In [11]:
df['Hour'].value_counts()
category_feature=pd.get_dummies(df['Hour'],prefix='H',prefix_sep='_',drop_first=True)
print(category_feature.head())
dfreal1=pd.concat([df,category_feature],axis=1)
print(dfreal1.head())

   H_7  H_8  H_9  H_10  H_11  H_12  H_13  H_14  H_15  H_16  H_17  H_18  H_19  \
0    0    1    0     0     0     0     0     0     0     0     0     0     0   
1    0    1    0     0     0     0     0     0     0     0     0     0     0   
2    0    1    0     0     0     0     0     0     0     0     0     0     0   
3    0    1    0     0     0     0     0     0     0     0     0     0     0   
4    0    1    0     0     0     0     0     0     0     0     0     0     0   

   H_20  
0     0  
1     0  
2     0  
3     0  
4     0  
                           Description  Hour    Month      Day  Total  H_7  \
0   WHITE HANGING HEART T-LIGHT HOLDER     8  January  Tuesday  15.30    0   
1                  WHITE METAL LANTERN     8  January  Tuesday  20.34    0   
2       CREAM CUPID HEARTS COAT HANGER     8  January  Tuesday  22.00    0   
3  KNITTED UNION FLAG HOT WATER BOTTLE     8  January  Tuesday  20.34    0   
4       RED WOOLLY HOTTIE WHITE HEART.     8  January  Tuesday  20.34

In [12]:
dfreal1.head()
dfreal2=dfreal1.drop(['Hour'],axis=1)
print(dfreal2.head())
df=dfreal2.copy()
print(df.columns)

                           Description    Month      Day  Total  H_7  H_8  \
0   WHITE HANGING HEART T-LIGHT HOLDER  January  Tuesday  15.30    0    1   
1                  WHITE METAL LANTERN  January  Tuesday  20.34    0    1   
2       CREAM CUPID HEARTS COAT HANGER  January  Tuesday  22.00    0    1   
3  KNITTED UNION FLAG HOT WATER BOTTLE  January  Tuesday  20.34    0    1   
4       RED WOOLLY HOTTIE WHITE HEART.  January  Tuesday  20.34    0    1   

   H_9  H_10  H_11  H_12  H_13  H_14  H_15  H_16  H_17  H_18  H_19  H_20  
0    0     0     0     0     0     0     0     0     0     0     0     0  
1    0     0     0     0     0     0     0     0     0     0     0     0  
2    0     0     0     0     0     0     0     0     0     0     0     0  
3    0     0     0     0     0     0     0     0     0     0     0     0  
4    0     0     0     0     0     0     0     0     0     0     0     0  
Index(['Description', 'Month', 'Day', 'Total', 'H_7', 'H_8', 'H_9', 'H_10',
       'H_1

## Converting categorical variable Day with get_dummies and dropping it

In [13]:

category_feature=pd.get_dummies(df['Day'],prefix='D',prefix_sep='_',drop_first=True)
print(category_feature.head())
dfreal1=pd.concat([df,category_feature],axis=1)
print(dfreal1.head())

   D_Monday  D_Saturday  D_Sunday  D_Thursday  D_Tuesday  D_Wednesday
0         0           0         0           0          1            0
1         0           0         0           0          1            0
2         0           0         0           0          1            0
3         0           0         0           0          1            0
4         0           0         0           0          1            0
                           Description    Month      Day  Total  H_7  H_8  \
0   WHITE HANGING HEART T-LIGHT HOLDER  January  Tuesday  15.30    0    1   
1                  WHITE METAL LANTERN  January  Tuesday  20.34    0    1   
2       CREAM CUPID HEARTS COAT HANGER  January  Tuesday  22.00    0    1   
3  KNITTED UNION FLAG HOT WATER BOTTLE  January  Tuesday  20.34    0    1   
4       RED WOOLLY HOTTIE WHITE HEART.  January  Tuesday  20.34    0    1   

   H_9  H_10  H_11  H_12  ...  H_17  H_18  H_19  H_20  D_Monday  D_Saturday  \
0    0     0     0     0  ...     0   

In [14]:
dfreal1.head()
dfreal2=dfreal1.drop(['Day'],axis=1)
print(dfreal2.head())
df=dfreal2.copy()
print(df.columns)

                           Description    Month  Total  H_7  H_8  H_9  H_10  \
0   WHITE HANGING HEART T-LIGHT HOLDER  January  15.30    0    1    0     0   
1                  WHITE METAL LANTERN  January  20.34    0    1    0     0   
2       CREAM CUPID HEARTS COAT HANGER  January  22.00    0    1    0     0   
3  KNITTED UNION FLAG HOT WATER BOTTLE  January  20.34    0    1    0     0   
4       RED WOOLLY HOTTIE WHITE HEART.  January  20.34    0    1    0     0   

   H_11  H_12  H_13  ...  H_17  H_18  H_19  H_20  D_Monday  D_Saturday  \
0     0     0     0  ...     0     0     0     0         0           0   
1     0     0     0  ...     0     0     0     0         0           0   
2     0     0     0  ...     0     0     0     0         0           0   
3     0     0     0  ...     0     0     0     0         0           0   
4     0     0     0  ...     0     0     0     0         0           0   

   D_Sunday  D_Thursday  D_Tuesday  D_Wednesday  
0         0           0       

## Converting categorical variable Month with get_dummies and dropping it

In [15]:
category_feature=pd.get_dummies(df['Month'],prefix='M',prefix_sep='_',drop_first=True)
print(category_feature.head())
dfreal1=pd.concat([df,category_feature],axis=1)
print(dfreal1.head())

   M_August  M_December  M_February  M_January  M_July  M_June  M_March  \
0         0           0           0          1       0       0        0   
1         0           0           0          1       0       0        0   
2         0           0           0          1       0       0        0   
3         0           0           0          1       0       0        0   
4         0           0           0          1       0       0        0   

   M_May  M_November  M_October  M_September  
0      0           0          0            0  
1      0           0          0            0  
2      0           0          0            0  
3      0           0          0            0  
4      0           0          0            0  
                           Description    Month  Total  H_7  H_8  H_9  H_10  \
0   WHITE HANGING HEART T-LIGHT HOLDER  January  15.30    0    1    0     0   
1                  WHITE METAL LANTERN  January  20.34    0    1    0     0   
2       CREAM CUPID HEARTS COA

In [16]:
dfreal1.head()
dfreal2=dfreal1.drop(['Month'],axis=1)
print(dfreal2.head())
df=dfreal2.copy()
print(df.columns)

                           Description  Total  H_7  H_8  H_9  H_10  H_11  \
0   WHITE HANGING HEART T-LIGHT HOLDER  15.30    0    1    0     0     0   
1                  WHITE METAL LANTERN  20.34    0    1    0     0     0   
2       CREAM CUPID HEARTS COAT HANGER  22.00    0    1    0     0     0   
3  KNITTED UNION FLAG HOT WATER BOTTLE  20.34    0    1    0     0     0   
4       RED WOOLLY HOTTIE WHITE HEART.  20.34    0    1    0     0     0   

   H_12  H_13  H_14  ...  M_December  M_February  M_January  M_July  M_June  \
0     0     0     0  ...           0           0          1       0       0   
1     0     0     0  ...           0           0          1       0       0   
2     0     0     0  ...           0           0          1       0       0   
3     0     0     0  ...           0           0          1       0       0   
4     0     0     0  ...           0           0          1       0       0   

   M_March  M_May  M_November  M_October  M_September  
0        0  

## Converting categorical variable Description with get_dummies and dropping it

In [17]:
category_feature=pd.get_dummies(df['Description'],prefix='Des',prefix_sep='_',drop_first=True)
print(category_feature.head())
dfreal1=pd.concat([df,category_feature],axis=1)

   Des_ 50'S CHRISTMAS GIFT BAG LARGE  Des_ DOLLY GIRL BEAKER  \
0                                   0                       0   
1                                   0                       0   
2                                   0                       0   
3                                   0                       0   
4                                   0                       0   

   Des_ I LOVE LONDON MINI BACKPACK  Des_ I LOVE LONDON MINI RUCKSACK  \
0                                 0                                 0   
1                                 0                                 0   
2                                 0                                 0   
3                                 0                                 0   
4                                 0                                 0   

   Des_ OVAL WALL MIRROR DIAMANTE   Des_ RED SPOT GIFT BAG LARGE  \
0                                0                             0   
1                                0

In [18]:
dfreal1.head()
dfreal2=dfreal1.drop(['Description'],axis=1)
print(dfreal2.head())
df=dfreal2.copy()
print(df.columns)

   Total  H_7  H_8  H_9  H_10  H_11  H_12  H_13  H_14  H_15  ...  \
0  15.30    0    1    0     0     0     0     0     0     0  ...   
1  20.34    0    1    0     0     0     0     0     0     0  ...   
2  22.00    0    1    0     0     0     0     0     0     0  ...   
3  20.34    0    1    0     0     0     0     0     0     0  ...   
4  20.34    0    1    0     0     0     0     0     0     0  ...   

   Des_ZINC PLANT POT HOLDER  Des_ZINC STAR T-LIGHT HOLDER   \
0                          0                              0   
1                          0                              0   
2                          0                              0   
3                          0                              0   
4                          0                              0   

   Des_ZINC SWEETHEART SOAP DISH  Des_ZINC SWEETHEART WIRE LETTER RACK  \
0                              0                                     0   
1                              0                                

In [19]:
df=df.reset_index()

## The dataset is huge and as a result it is taking a lot of time and resources. If you want to run the code on the full dataset comment out df=df.sample(frac=0.05)

In [20]:
df.head()
df=df.sample(frac=0.05)
df=df.reset_index()
target=df['Total']
df.drop(['Total'],axis=1,inplace=True)

The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.


In [21]:
df = df.astype(int)

## Split the dataset to train and test set

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df,target, test_size=0.3, random_state=42)

  # Output: 0.49244

In [None]:
X_train.drop(columns=['level_0', 'index'], inplace=True)

In [None]:
X_train = X_train.reset_index(drop=True)

X_train = X_train.values
y_train = y_train.values

In [None]:
X_train

In [None]:
y_train

In [None]:
X_test.drop(columns=['level_0', 'index'], inplace=True)
X_test = X_test.reset_index(drop=True)

X_test = X_test.values
y_test = y_test.values

## Fit the custom RandomForest on the dataset

In [None]:
# Create an instance of RandomForestRegressor621
random_forest = RandomForestRegressor621(n_estimators=3, min_samples_leaf=15, max_features=0.3, oob_score=True)

# Fit the model to the training data
random_forest.fit(X_train, y_train)

# Make predictions on new data
predictions = random_forest.predict(X_test)

# Calculate the R^2 score for the predictions
r2_score = random_forest.score(X_test, y_test)
print(r2_score)  

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(df,target, test_size=0.3, random_state=42)

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# sklearn_rf = RandomForestRegressor(n_estimators=3, min_samples_leaf=15, max_features=0.3, oob_score=True)

# # Fit the scikit-learn model to the training data
# sklearn_rf.fit(X_train, y_train)

# # Make predictions using the scikit-learn model
# sklearn_predictions = sklearn_rf.predict(X_test)

# # Calculate the R^2 score for the scikit-learn predictions
# sklearn_r2_score = sklearn_rf.score(X_test, y_test)
# print(sklearn_r2_score) 

In [None]:
# %reset -f

In [None]:
# from IPython.display import clear_output
# clear_output(wait=True)

In [None]:
# import gc
# gc.collect()