In [1]:
import pandas as pd

In [2]:
# Load the raw customer table. The path ends in ".xls", yet it is handed to
# the CSV reader - NOTE(review): presumably the file is plain comma-separated
# text with a misleading extension; confirm before renaming it.
df = pd.read_csv(filepath_or_buffer='Mall_Customers.xls')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [3]:
# Replace the five original headers, in file order, with snake_case labels.
# Every label is pure ASCII on purpose: a Cyrillic "с" (U+0441) renders exactly
# like a Latin "c" but is a different character, so a label containing it makes
# df['customerid'] raise KeyError.
df.columns = ['customerid', 'genre', 'age', 'annual_income', 'spending_score']

In [4]:
# Turn the textual `genre` column into a 0/1 indicator. Surrounding whitespace
# is stripped first; any label that is not in the mapping becomes NaN.
df['genre'] = (
    df['genre']
    .str.strip()
    .map({'Male': 1, 'Female': 0})
)
# Preview the encoded frame.
df.head()

Unnamed: 0,сustomerid,genre,age,annual_income,spending_score
0,1,1,19,15,39
1,2,1,21,15,81
2,3,0,20,16,6
3,4,0,23,16,77
4,5,0,31,17,40


In [11]:
# Column names of the model inputs and of the value to predict.
# NOTE(review): no later visible cell reads `feature`, and `target` is rebound
# to a Series further down - confirm whether this cell is still needed.
feature = [
    'genre',
    'age',
    'annual_income',
]
target = [
    'spending_score',
]

In [12]:
def calc_squared_error(values):
    """Return the mean squared deviation of ``values`` from their own mean.

    Parameters
    ----------
    values : pandas.Series
        Observations of a single node; must expose ``.empty`` and ``.mean()``.

    Returns
    -------
    number
        The integer ``0`` for an empty input, otherwise the mean of the
        squared differences between each value and the mean of all values.
    """
    if not values.empty:
        deviations = values - values.mean()
        return (deviations ** 2).mean()
    # An empty node contributes no error.
    return 0

In [13]:
def find_best_split(X, y):
    """Find the single-feature threshold that most reduces the MSE of ``y``.

    For every column of ``X`` the candidate thresholds are the midpoints
    between consecutive distinct, non-missing values. A candidate sends the
    rows with ``value <= threshold`` to the left child and all other rows to
    the right child. Its score is the MSE of ``y`` minus the size-weighted MSE
    of the two children.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix with orderable numeric columns.
    y : pandas.Series
        Target values, one per row of ``X`` and in the same row order. The
        index labels of ``X`` and ``y`` do not have to match.

    Returns
    -------
    tuple or None
        ``(feature_name, split_value)`` for the best candidate, or ``None``
        when no candidate yields a strictly positive MSE reduction.
    """
    init_mse = calc_squared_error(y)
    n_samples = len(y)
    max_mse_reduction = 0.0
    best_split = None

    for feature_name in X.columns:
        column = X[feature_name]
        # Missing values are dropped before sorting: NaN compares False with
        # everything, which leaves sorted() output in an arbitrary order and
        # turns the neighbouring midpoints into NaN.
        unique_values = sorted(column.dropna().unique())

        for lower, upper in zip(unique_values, unique_values[1:]):
            split_value = (lower + upper) / 2

            # Positional boolean mask, so the row selection in `y` does not
            # depend on `X` and `y` carrying identical index labels.
            mask = (column <= split_value).to_numpy()
            y_left = y[mask]
            y_right = y[~mask]

            # A one-sided partition is not a split.
            if len(y_left) == 0 or len(y_right) == 0:
                continue

            mse_left = calc_squared_error(y_left)
            mse_right = calc_squared_error(y_right)
            ratio_left = len(y_left) / n_samples
            ratio_right = len(y_right) / n_samples

            mse_reduction = init_mse - (mse_left * ratio_left + mse_right * ratio_right)

            # Strict comparison: the first candidate wins ties, and a split
            # with zero gain is never selected.
            if mse_reduction > max_mse_reduction:
                max_mse_reduction = mse_reduction
                best_split = (feature_name, split_value)

    return best_split

In [14]:
def fit_regression_tree(X, y, max_depth=None, min_samples_split=2, current_depth=0):
    """Recursively grow a regression tree and print it while it is built.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix of the current node.
    y : pandas.Series
        Target values of the current node, row-aligned with ``X``.
    max_depth : int or None, default None
        A node at this depth (root is depth 0) becomes a leaf. ``None`` means
        no depth limit.
    min_samples_split : int, default 2
        A node with fewer rows than this becomes a leaf.
    current_depth : int, default 0
        Depth of the current node; used for the stopping rule and for the
        indentation of the printed output.

    Returns
    -------
    float or dict
        A leaf is the mean of ``y`` in that node. An internal node is a dict
        with the keys ``'feature'``, ``'split_value'``, ``'left_branch'`` and
        ``'right_branch'``; each branch is again a leaf value or a dict.
    """
    # Both frames get a fresh 0..n-1 index so boolean masks built from X can
    # be applied to y.
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    depth_exhausted = max_depth is not None and current_depth >= max_depth
    too_few_samples = len(X) < min_samples_split

    # The split search is the expensive step, so it only runs for nodes that
    # the stopping rules still allow to split.
    best_split = None
    if not (depth_exhausted or too_few_samples):
        best_split = find_best_split(X, y)

    indent = "|   " * current_depth

    if best_split is None:
        leaf_prediction = y.mean()
        print(indent + f"--- value: {leaf_prediction:.2f}")
        return leaf_prediction

    feature_name, split_value = best_split
    mask = X[feature_name] <= split_value

    # Each branch header is printed right before that branch is grown, so the
    # output reads as a nested tree.
    print(indent + f"--- {feature_name} <= {split_value:.2f}")
    left_branch = fit_regression_tree(
        X[mask], y[mask], max_depth, min_samples_split, current_depth + 1
    )

    print(indent + f"--- {feature_name} > {split_value:.2f}")
    right_branch = fit_regression_tree(
        X[~mask], y[~mask], max_depth, min_samples_split, current_depth + 1
    )

    return {
        'feature': feature_name,
        'split_value': split_value,
        'left_branch': left_branch,
        'right_branch': right_branch
    }

In [15]:
# Model inputs: the encoded genre plus the two numeric customer attributes.
features = df[[
    'genre',
    'age',
    'annual_income',
]]
# Value to predict.
target = df['spending_score']

In [16]:
# Grow the tree with the default limits (no depth cap, min_samples_split=2).
# The tree structure is printed while it is being built.
print("--- Строим дерево без ограничений: ---")
my_full_tree = fit_regression_tree(X=features, y=target)

--- Строим дерево без ограничений: ---
--- age <= 39.50
|   --- age <= 20.50
|   |   --- annual_income <= 69.00
|   |   |   --- annual_income <= 18.50
|   |   |   |   --- genre <= 0.50
|   |   |   |   |   --- value: 6.00
|   |   |   |   --- genre > 0.50
|   |   |   |   |   --- value: 39.00
|   |   |   --- annual_income > 18.50
|   |   |   |   --- annual_income <= 41.50
|   |   |   |   |   --- age <= 19.00
|   |   |   |   |   |   --- value: 92.00
|   |   |   |   |   --- age > 19.00
|   |   |   |   |   |   --- genre <= 0.50
|   |   |   |   |   |   |   --- value: 75.00
|   |   |   |   |   |   --- genre > 0.50
|   |   |   |   |   |   |   --- value: 66.00
|   |   |   |   --- annual_income > 41.50
|   |   |   |   |   --- annual_income <= 53.50
|   |   |   |   |   |   --- annual_income <= 47.00
|   |   |   |   |   |   |   --- value: 55.00
|   |   |   |   |   |   --- annual_income > 47.00
|   |   |   |   |   |   |   --- value: 59.00
|   |   |   |   |   --- annual_income > 53.50
|   |   |   |  

In [18]:
from sklearn.tree import DecisionTreeRegressor, export_text

# Hand-written tree limited to two levels of splits.
print("--- Моё дерево с max_depth=2: ---")
my_depth_2_tree = fit_regression_tree(X=features, y=target, max_depth=2)

--- Моё дерево с max_depth=2: ---
--- age <= 39.50
|   --- age <= 20.50
|   |   --- value: 44.65
|   --- age > 20.50
|   |   --- value: 62.53
--- age > 39.50
|   --- annual_income <= 72.00
|   |   --- value: 41.77
|   --- annual_income > 72.00
|   |   --- value: 19.79


In [20]:
# Reference model: scikit-learn's regression tree with the same depth limit,
# rendered as text so it can be compared with the hand-written tree.
sklearn_model_depth_2 = DecisionTreeRegressor(random_state=42, max_depth=2)
sklearn_model_depth_2.fit(X=features, y=target)
sklearn_tree_text = export_text(
    sklearn_model_depth_2,
    feature_names=features.columns.tolist(),
)

print("\n\n--- Дерево из sklearn с max_depth=2: ---")
print(sklearn_tree_text)



--- Дерево из sklearn с max_depth=2: ---
|--- age <= 39.50
|   |--- age <= 20.50
|   |   |--- value: [44.65]
|   |--- age >  20.50
|   |   |--- value: [62.53]
|--- age >  39.50
|   |--- annual_income <= 72.00
|   |   |--- value: [41.77]
|   |--- annual_income >  72.00
|   |   |--- value: [19.79]



In [21]:
# Hand-written tree in which a node needs at least 70 rows to be split.
print("--- Наше дерево с min_samples_split=70: ---")
my_mss_70_tree = fit_regression_tree(X=features, y=target, min_samples_split=70)

--- Наше дерево с min_samples_split=70: ---
--- age <= 39.50
|   --- age <= 20.50
|   |   --- value: 44.65
|   --- age > 20.50
|   |   --- age <= 32.50
|   |   |   --- value: 66.59
|   |   --- age > 32.50
|   |   |   --- value: 55.09
--- age > 39.50
|   --- annual_income <= 72.00
|   |   --- value: 41.77
|   --- annual_income > 72.00
|   |   --- value: 19.79


In [22]:
# Reference model: scikit-learn's regression tree with the same
# min_samples_split, rendered as text for a side-by-side comparison.
print("\n\n--- Дерево из sklearn с min_samples_split=70: ---")
sklearn_model_mss_70 = DecisionTreeRegressor(random_state=42, min_samples_split=70)
sklearn_model_mss_70.fit(X=features, y=target)

sklearn_tree_text_mss = export_text(
    sklearn_model_mss_70,
    feature_names=features.columns.tolist(),
)
print(sklearn_tree_text_mss)



--- Дерево из sklearn с min_samples_split=70: ---
|--- age <= 39.50
|   |--- age <= 20.50
|   |   |--- value: [44.65]
|   |--- age >  20.50
|   |   |--- age <= 32.50
|   |   |   |--- value: [66.59]
|   |   |--- age >  32.50
|   |   |   |--- value: [55.09]
|--- age >  39.50
|   |--- annual_income <= 72.00
|   |   |--- value: [41.77]
|   |--- annual_income >  72.00
|   |   |--- value: [19.79]

