![Imgur](https://i.imgur.com/A0T6cjl.png)
### [<font size = 2 color="#5890FF">SERIES 5</font>](https://www.kaggle.com/samybaladram/data-science-with-golf-dataset-series-index)<font size = 2 color="#7B8698">　‣　R E G R E S S I O N　🏌<br></font>[Dummy Regressor](https://www.kaggle.com/code/samybaladram/05-1-regression-dummy-regressor)・[kNN Regressor](https://www.kaggle.com/samybaladram/05-2-regression-knn-regressor)・**Regression Tree**・Linear Regression・Generalized Linear Model<p>
---

# About **Regression Tree**
![Imgur](https://i.imgur.com/EjstzU5.png)

# Main Mechanism
![Imgur](https://i.imgur.com/vvPze56.png)

![Imgur](https://i.imgur.com/eWS2RZI.png)

In [None]:
# IMPORTING DATASET #
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Render every matplotlib figure in this notebook at 300 dpi
plt.rcParams.update({'figure.dpi': 300})

# Path of the mini golf dataset inside the Kaggle input directory
# (the file name suggests numerically encoded features with test rows included)
dataset_url = "/kaggle/input/golf-play-extended/golf_dataset_mini/golf_dataset_mini_original_numerical_with_testset.csv"

# Read the CSV into a DataFrame and show the full table
df = pd.read_csv(filepath_or_buffer=dataset_url)

print(df)

![Imgur](https://i.imgur.com/eWS2RZI.png)

# Algorithm Steps

# 　Preparation Steps
![Imgur](https://i.imgur.com/lBjTdnh.png)

In [None]:
# Re-read the raw table so this cell starts from the unmodified data
df = pd.read_csv(dataset_url)

from sklearn.model_selection import train_test_split

# 'Num_Players' is the regression target; every other column is a feature
y = df['Num_Players']
X = df.drop(columns='Num_Players')

# Unshuffled 50/50 split: the first half of the rows trains, the second half tests
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, shuffle=False
)

# Show features (rounded to 2 decimals) side by side with the target
print(f"X_train, y_train:\n{pd.concat([X_train.round(2), y_train], axis=1)}\n")
print(f"X_test, y_test:\n{pd.concat([X_test.round(2), y_test], axis=1)}")

![Imgur](https://i.imgur.com/Oz8CpYr.png)

# 　Training Phase
![Imgur](https://i.imgur.com/uErFCko.png)
![Imgur](https://i.imgur.com/NGCfmoS.png)

In [None]:
def sort_attr_label(attr, lbl):
    """Pair one feature with the target and sort the rows for inspection.

    Parameters
    ----------
    attr : pd.Series
        Feature column; its ``name`` becomes the first column and primary sort key.
    lbl : pd.Series
        Target column; its ``name`` becomes the second column and tie-breaker.

    Returns
    -------
    pd.DataFrame
        Two-column frame sorted by ``attr`` then ``lbl``. The original row
        labels are kept as the index, which is named ``'ID'``.
    """
    paired = pd.concat([attr, lbl], axis=1)
    # rename_axis only relabels the index, so it works whether or not the
    # incoming index already carries a name (a reset_index/set_index round
    # trip would fail with KeyError in the named case).
    return paired.sort_values([attr.name, lbl.name]).rename_axis('ID')

# Display the sorted (feature, target) tables for one binary and one numeric feature
tuple(sort_attr_label(X_train[column], y_train) for column in ('sunny', 'Temperature'))

![Imgur](https://i.imgur.com/Dt1THTs.png)

In [None]:
def potential_split_points(attr):
    """Return the candidate thresholds for splitting on one feature.

    Candidates are the midpoints between each pair of consecutive distinct
    values of ``attr``.

    Parameters
    ----------
    attr : array-like
        Values of a single numeric feature.

    Returns
    -------
    list
        Midpoints in ascending order; empty when ``attr`` has fewer than two
        distinct values.
    """
    # np.unique already returns the distinct values sorted ascending,
    # so no separate sort is needed.
    unique_values = np.unique(attr)
    # Pairwise midpoints of neighbours, computed in one vectorised step
    return list((unique_values[:-1] + unique_values[1:]) / 2)

# List the candidate thresholds for a binary feature and for a numeric one
for column in ('sunny', 'Temperature'):
    print(potential_split_points(X_train[column]))

![Imgur](https://i.imgur.com/sykPhcG.png)

In [None]:
def _raw_mse(labels):
    """Return the unrounded mean squared deviation of ``labels`` from their mean."""
    # Wrap in np.float64 so callers can always use ``.round`` on the result
    return np.float64(np.mean((labels - np.mean(labels)) ** 2))


def mse_impurity(labels):
    """Describe the MSE impurity of one group of target values.

    Parameters
    ----------
    labels : pd.Series
        Target values that fall into the group.

    Returns
    -------
    tuple
        ``(n, formula, mse)`` where ``n`` is the group size, ``formula`` is a
        display string of the MSE formula for that size, and ``mse`` is the
        mean squared deviation from the group mean, rounded to 3 decimals.
    """
    n = len(labels)
    formula = f'1/{n} Σ(yi - ȳ)²'
    return n, formula, _raw_mse(labels).round(3)


def mse_by_attribute(split_point, attr, labels):
    """Evaluate one threshold split of a feature by its MSE impurity.

    Parameters
    ----------
    split_point : float
        Threshold; rows with ``attr <= split_point`` go to the low branch,
        rows with ``attr > split_point`` go to the high branch.
    attr : pd.Series
        Feature values, aligned with ``labels``.
    labels : pd.Series
        Target values.

    Returns
    -------
    tuple
        ``(table, weighted_mse)``. ``table`` is a DataFrame with one row per
        branch and columns ``Value`` (branch condition), ``Σ`` (branch size),
        ``MSE_Formula`` and ``MSE`` (rounded to 3 decimals). ``weighted_mse``
        is the size-weighted average of the branch MSEs, rounded to 3 decimals.
    """
    # Divide the targets according to the threshold
    low_data = labels[attr <= split_point]
    high_data = labels[attr > split_point]

    # Per-branch summary rows (display values, rounded)
    data = [(f"≤ {split_point}", *mse_impurity(low_data)),
            (f"> {split_point}", *mse_impurity(high_data))]

    # Weight the *unrounded* branch MSEs: averaging values that were already
    # rounded to 3 decimals can shift the final third decimal.
    weighted_mse = (len(low_data) * _raw_mse(low_data)
                    + len(high_data) * _raw_mse(high_data)) / len(labels)

    return pd.DataFrame(data, columns=['Value', 'Σ', 'MSE_Formula', 'MSE']), weighted_mse.round(3)

# Sample call on the training data: split the 'sunny' feature at 0.5
print(
    mse_by_attribute(
        split_point=0.5,
        attr=X_train['sunny'],
        labels=y_train,
    )
)

![Imgur](https://i.imgur.com/u1EhW2i.png)

In [None]:
# potential_split_points is already defined in an earlier cell; it is reused
# here rather than redefined, so there is a single definition to maintain.
split_points = potential_split_points(X_train['Temperature'])

# Report the branch table and weighted MSE for every candidate threshold
for point in split_points:
    print(f"Split Point: {point}")
    print(mse_by_attribute(point, X_train['Temperature'], y_train))
    print("\n")

![Imgur](https://i.imgur.com/vtvDSN6.png)

In [None]:
# Split 'sunny' at 0.5, then report branch sizes and MSEs separately from
# the weighted score of the split.
sunny_mse, sunny_weighted_mse = mse_by_attribute(
    split_point=0.5, attr=X_train['sunny'], labels=y_train
)
print(sunny_mse.loc[:, ['Σ', 'MSE']])
print(f"Weighted MSE: {sunny_weighted_mse}")

![Imgur](https://i.imgur.com/nlfzlGu.png)

In [None]:
def evaluate_splits(df, labels):
    """Score every candidate split of every feature column.

    Parameters
    ----------
    df : pd.DataFrame
        Feature matrix; each column is scanned independently.
    labels : pd.Series
        Target values aligned with ``df``.

    Returns
    -------
    pd.DataFrame
        One row per (feature, threshold) pair with columns ``Attribute``,
        ``Split_Point`` and ``Weighted_MSE`` (rounded to 2 decimals).
    """
    rows = [
        {
            'Attribute': column,
            'Split_Point': threshold,
            # mse_by_attribute returns (table, weighted_mse); only the score is kept
            'Weighted_MSE': mse_by_attribute(threshold, df[column], labels)[1].round(2),
        }
        for column in df.columns
        for threshold in potential_split_points(df[column])
    ]
    return pd.DataFrame(rows)

# Score every candidate split of every feature on the full training set
results_df = evaluate_splits(df=X_train, labels=y_train)
print(results_df)

![Imgur](https://i.imgur.com/mKIsPZe.png)

In [None]:
# Training rows ordered by the 'rain' feature, then by the target
df_rain = sort_attr_label(attr=X_train['rain'], lbl=y_train)
print(df_rain)

![Imgur](https://i.imgur.com/nHZD1dJ.png)

In [None]:
# Second training iteration: continue on the rows left after removing five IDs.
# NOTE(review): IDs 3, 4, 5, 9, 13 are presumably the rows sent to the other
# branch of the first split (rain == 1) — confirm against the sorted 'rain' table.
X_train2 = X_train.drop(index=[3, 4, 5, 9, 13])
y_train2 = y_train.drop(index=[3, 4, 5, 9, 13])

# Re-score every candidate split on the remaining rows
results_df2 = evaluate_splits(X_train2, y_train2)
print(results_df2)

# Inspect how 'overcast' lines up with the target within this subset
print(sort_attr_label(X_train2['overcast'], y_train2))

![Imgur](https://i.imgur.com/zCezrRO.png)
![Imgur](https://i.imgur.com/MHnmmZv.png)

In [None]:
from sklearn.tree import DecisionTreeRegressor, plot_tree

# The whole Training Phase above is done inside sklearn like this
dt_reg = DecisionTreeRegressor(random_state=41)
dt_reg.fit(X_train, y_train)

# Plot the decision tree.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index.
plt.figure(figsize=(20, 10))
plot_tree(dt_reg, filled=True, feature_names=list(X.columns))
plt.show()

# 　Regression Phase
![Imgur](https://i.imgur.com/QTVt60j.png)
![Imgur](https://i.imgur.com/G9otzxN.png)

In [None]:
# Predict the number of players for every held-out row
y_pred = dt_reg.predict(X=X_test)
print(y_pred)

# 　Evaluation Phase
![Imgur](https://i.imgur.com/4aqT1HH.png)

In [None]:
# Evaluate the Regressor.
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases, while this form works on every version.
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")

# Key Parameters

# 　Depth of the tree
![Imgur](https://i.imgur.com/I7ehjBY.png)

In [None]:
# Collect test labels, per-model predictions and per-model RMSE
labels, predictions, rmses = list(y_test), [], []

# Loop through different maximum depths (None = grow until leaves are pure)
k_list = [2, 3, 4, None]
for k in k_list:
    dt_reg = DecisionTreeRegressor(max_depth=k, random_state=41)
    dt_reg.fit(X_train, y_train)
    y_pred = dt_reg.predict(X_test)
    predictions.append(list(y_pred))
    # sqrt(MSE) instead of `squared=False`, which recent scikit-learn
    # releases no longer accept
    rmses.append(np.sqrt(mean_squared_error(y_test, y_pred)).round(2))

# One column of predictions per depth, next to the true labels
df_predictions = pd.DataFrame({'Label': labels}, index=X_test.index)
for k, pred in zip(k_list, predictions):
    df_predictions[f'Depth_{k}'] = pred

# Single-row table of RMSE values, one column per depth
df_rmses = pd.DataFrame({'     RMSE': rmses}, index=[f'Depth_{k}' for k in k_list]).T

print(df_predictions.round(1))
print(df_rmses)

# 　Minimum samples split
![Imgur](https://i.imgur.com/LVqrHTW.png)

In [None]:
# Initialize lists: test labels, per-model predictions, per-model RMSE
labels, predictions, rmses = list(y_test), [], []

# Loop through different min samples split
k_list = [6, 4, 3, 2]
for k in k_list:
    dt_reg = DecisionTreeRegressor(min_samples_split=k, random_state=41)
    dt_reg.fit(X_train, y_train)
    y_pred = dt_reg.predict(X_test)
    predictions.append(list(y_pred))
    # sqrt(MSE) instead of `squared=False`, which recent scikit-learn
    # releases no longer accept
    rmses.append(np.sqrt(mean_squared_error(y_test, y_pred)).round(2))

# Create DataFrames: one prediction column per setting, next to the true labels
df_predictions = pd.DataFrame({'Label': labels}, index=X_test.index)
for k, pred in zip(k_list, predictions):
    df_predictions[f'MinSplit_{k}'] = pred

# Single-row table of RMSE values, one column per setting
df_rmses = pd.DataFrame({'     RMSE': rmses}, index=[f'MinSplit_{k}' for k in k_list]).T

print(df_predictions.round(1))
print(df_rmses)

# 　Criterion
![Imgur](https://i.imgur.com/nzEkgCP.png)

In [None]:
# Initialize lists: test labels, per-model predictions, per-model RMSE
labels, predictions, rmses = list(y_test), [], []

# Loop through different criterion
criterions = ['squared_error', 'absolute_error', 'friedman_mse']
for measure in criterions:
    # Same seed as the other comparison cells so the results line up with them
    dt_reg = DecisionTreeRegressor(criterion=measure, random_state=41)
    dt_reg.fit(X_train, y_train)
    y_pred = dt_reg.predict(X_test)
    predictions.append(list(y_pred))
    # sqrt(MSE) instead of `squared=False`, which recent scikit-learn
    # releases no longer accept
    rmses.append(np.sqrt(mean_squared_error(y_test, y_pred)).round(2))

# Create DataFrames: keep the test-set row IDs as the index, like the
# depth and min-samples-split comparison tables do
df_predictions = pd.DataFrame({'Label': labels}, index=X_test.index)
for criterion, pred in zip(criterions, predictions):
    df_predictions[f'{criterion}'] = pred

# Single-row table of RMSE values, one column per criterion
df_rmses = pd.DataFrame({'     RMSE': rmses}, index=[f'{criterion}' for criterion in criterions]).T

print(df_predictions)
print(df_rmses)

# Pros and Cons
![Imgur](https://i.imgur.com/XqSLwiM.png)

# 🌟 Decision Tree Regressor Simplified

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

df = pd.read_csv(dataset_url)

# Split train & test data (unshuffled: first half trains, second half tests)
X, y = df.drop('Num_Players', axis=1), df['Num_Players']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, shuffle=False)

# Fit the model
dt_reg = DecisionTreeRegressor(random_state=41)
dt_reg.fit(X_train, y_train)

# Predict the test data
y_pred = dt_reg.predict(X_test)

# Evaluate the model.
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is no longer accepted by recent scikit-learn releases.
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")

In [None]:
from sklearn.tree import plot_tree

# Plot the decision tree.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index.
plt.figure(figsize=(20, 10))
plot_tree(dt_reg, filled=True, feature_names=list(X.columns))
plt.show()

![Imgur](https://i.imgur.com/M0mwv0l.png)
### [<font size = 2 color="#5890FF">SERIES 5</font>](https://www.kaggle.com/samybaladram/golf-data-analysis-table-of-contents)<font size = 2 color="#7B8698">　‣　R E G R E S S I O N　🏌<br></font>[Dummy Regressor](https://www.kaggle.com/code/samybaladram/05-1-regression-dummy-regressor)・[kNN Regressor](https://www.kaggle.com/samybaladram/05-2-regression-knn-regressor)・**Regression Tree**・Linear Regression・Generalized Linear Model<p>

--- 
    
*Explore [diverse notebooks full of captivating visuals](https://www.kaggle.com/samybaladram/golf-data-analysis-table-of-contents), each tackling unique facets of the field—all based on some mini, yet enriching, golf play datasets. Your hole-in-one guide to mastering Data Science basics.*


### For a complete list of topics and series, see [⛳️ 📋 Data Science with Golf Dataset・Series Index](https://www.kaggle.com/samybaladram/data-science-with-golf-dataset-series-index).
---

**Your Feedback Matters! 🍃** If you find this notebook series useful, an ⬆️ *upvote* would be appreciated to help others in the community discover it too!