In [23]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor  # model


In [3]:
df = pd.read_csv("winequality-red.csv")

In [4]:
print(df.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [11]:
def scale(X):
    """
    Standardizes the data in the array X.

    Parameters:
        X (numpy.ndarray): Features array of shape (n_samples, n_features).

    Returns:
        numpy.ndarray: The standardized features array.
    """
    # Calculate the mean and standard deviation of each feature
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)

    # Standardize the data
    X = (X - mean) / std

    return X

In [12]:
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values.reshape(-1,1)
X = scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = 80)

In [13]:
print(X_train), print(y_train)

[[-0.01128221  0.48702616 -0.26176211 ... -0.33117661 -0.28414114
  -1.05411336]
 [ 0.56324823  0.71048523 -0.05636026 ... -0.39596939 -1.4053896
  -0.77251161]
 [-0.47090657 -0.15541864  0.45714436 ...  0.25195842  0.83710732
   0.07229363]
 ...
 [-0.98798397  0.45909378 -1.34012182 ...  2.45491298  0.42401579
   0.07229363]
 [-0.87307788  0.68255284 -1.28877135 ...  0.25195842 -0.22512806
  -0.02157362]
 [-0.24109439  0.29149948 -0.21041165 ... -0.65514052 -0.46118037
  -0.86637886]]
[[5]
 [5]
 [5]
 ...
 [5]
 [6]
 [6]]


(None, None)

In [14]:
class Node():
    def __init__(self, feature = None,threshold = None, left = None, right = None, gain = None, value = None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.gain = gain
        self.value = value

In [18]:
class Decision_tree():
    def __init__(self, min_samples=2, max_depth=2):
        self.min_samples = min_samples
        self.max_depth = max_depth
        
    def split_Node(self, dataset, feature, threshold):
        left_node = []
        right_node = []
        for row in dataset:
            if row[feature] <= threshold:
                left_node.append(row)
            else:
                right_node.append(row)
        left_node = np.array(left_node)
        right_node = np.array(right_node)
        return left_node, right_node
    def compute_information_gain(self, parent, left, right):
        in_gain = np.var(parent) - (len(left) / len(parent)) * np.var(left) - (len(right) / len(parent)) * np.var(right)
        return in_gain
    def find_best_split(self, dataset, num_features):
        best_split = { 'gain': -1, 'feature': None, 'threshold': None}
        for feature in range(num_features):
            thresholds = np.unique(dataset[:, feature])
            for i in thresholds:
                left_dataset, right_dataset = self.split_Node( dataset, feature, i)
                if len(left_dataset) and len(right_dataset):
                    y, left_check, right_check = dataset[:,-1],left_dataset[:,-1], right_dataset[:,-1]
                    ig = self.compute_information_gain(y, left_check, right_check)
                    if ig >best_split['gain']:
                        best_split["gain"] = ig
                        best_split["feature"] = feature
                        best_split["threshold"]  = i
                        best_split["left_dataset"] = left_dataset
                        best_split["right_dataset"] = right_dataset
        return best_split
    def caculate_leaf_val(self, y):
        value = y.mean()
        return value
    def build_tree(self, dataset, cur_depth = 0):
        X,y = dataset[:, :-1], dataset[:,-1]
        n_samples, n_features = X.shape
        if cur_depth <= self.max_depth and n_samples >= self.min_samples:
            best_split = self.find_best_split(dataset, n_features)
            if best_split["gain"]:
                left_node = self.build_tree(best_split["left_dataset"], cur_depth + 1)
                right_node = self.build_tree(best_split["right_dataset"], cur_depth + 1)
                return Node(best_split["feature"], best_split["threshold"], left_node, 
                            right_node, best_split["gain"]) 
        leaf_value = self.caculate_leaf_val(y)
        return Node(value = leaf_value)
    def fit(self, X, y):
        dataset = np.concatenate((X,y), axis=1)
        self.root = self.build_tree(dataset)
    def make_predict(self, X, node):
        if node.value != None:
            return node.value
       
        feature = node.feature
        if X[feature] <= node.threshold:
            return self.make_predict(X, node.left)
        else:
            return self.make_predict(X, node.right)
    def predict(self, X):
        prediction = []
        for i in X:
            yhat = self.make_predict(i, self.root)
            prediction.append(yhat)
        np.array(prediction)
        return prediction

In [51]:
model = Decision_tree(8,4)
model.fit(X_train, y_train)

In [52]:
yhat = model.predict(X_test)

In [53]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, yhat))

0.6772665669047979

In [45]:
modelTest = DecisionTreeRegressor(
    criterion='squared_error',
    splitter='best',
    min_samples_split=8,
)

In [46]:
modelTest.fit(X_train, y_train)

In [47]:
Predictions = modelTest.predict(X_test)
np.sqrt(mean_squared_error(y_test, Predictions))

0.7761931579525685