# Decision Tree (Regression)

In [25]:
import numpy as np
import pandas as pd

In [27]:
def r2_score(y_true, y_pred):
    """Return the coefficient of determination (R²) of predictions.

    R² is ``1 - SS_res / SS_tot``, where ``SS_res`` is the sum of squared
    residuals and ``SS_tot`` is the sum of squared deviations of ``y_true``
    from its mean.

    Args:
        y_true: Observed target values (array-like).
        y_pred: Predicted target values (array-like, broadcastable against
            ``y_true``).

    Returns:
        The R² score. When ``y_true`` is constant (``SS_tot == 0``) the ratio
        is undefined, so 1.0 is returned for a perfect prediction and 0.0
        otherwise. NaN is returned for empty input.
    """
    # Convert up front so plain Python lists work (list - list would raise).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if y_true.size == 0:
        return float('nan')

    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)

    # A constant target has zero total variance; avoid dividing by zero.
    if ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0

    return 1 - (ss_res / ss_tot)

In [29]:
def split_dataset(X, y, feature_index, threshold):
    """Partition samples by comparing one feature column to a threshold.

    Rows whose value in column ``feature_index`` is ``<= threshold`` go to
    the left partition; all remaining rows go to the right partition.

    Returns:
        A 4-tuple ``(X_left, y_left, X_right, y_right)``.
    """
    goes_left = X[:, feature_index] <= threshold
    goes_right = ~goes_left

    left_part = (X[goes_left], y[goes_left])
    right_part = (X[goes_right], y[goes_right])
    return left_part + right_part

In [31]:
def best_split(X, y, min_samples_split=2, num_thresholds=10):
    """Find the (feature, threshold) pair minimizing weighted child MSE.

    For every feature, ``num_thresholds`` evenly spaced candidate thresholds
    between that feature's minimum and maximum (inclusive) are evaluated.
    Samples with ``value <= threshold`` go left, the rest go right.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        y: 1-D array of targets with one entry per row of ``X``.
        min_samples_split: Minimum number of samples required in EACH child
            for a candidate split to be considered.
        num_thresholds: Number of candidate thresholds per feature.

    Returns:
        ``(feature_index, threshold)`` of the best split, or ``(None, None)``
        if no candidate satisfies the size constraint.
    """
    best_feature_index = None
    best_threshold = None
    best_mse = float('inf')

    n_samples = len(y)
    # A child with zero samples has no mean, so never accept one even when
    # min_samples_split is below 1.
    min_side = max(min_samples_split, 1)
    if n_samples < 2 * min_side:
        # Not enough samples to populate both children; also covers empty X.
        return best_feature_index, best_threshold

    for feature_index in range(X.shape[1]):
        feature_values = X[:, feature_index]
        min_val, max_val = np.min(feature_values), np.max(feature_values)
        if min_val == max_val:
            # A constant feature cannot separate the samples.
            continue

        for threshold in np.linspace(min_val, max_val, num_thresholds):
            # Only the targets are needed to score a split, so mask y
            # directly instead of copying the rows of X for every candidate.
            left_mask = feature_values <= threshold
            n_left = np.count_nonzero(left_mask)
            n_right = n_samples - n_left

            if n_left < min_side or n_right < min_side:
                continue

            # Variance of each child == MSE around that child's mean.
            mse_left = np.var(y[left_mask])
            mse_right = np.var(y[~left_mask])
            weighted_mse = (n_left * mse_left + n_right * mse_right) / n_samples

            # Strict '<' keeps the first candidate found on ties.
            if weighted_mse < best_mse:
                best_mse = weighted_mse
                best_feature_index = feature_index
                best_threshold = threshold

    return best_feature_index, best_threshold

In [43]:
def build_tree(X, y, depth=0, max_depth=30, min_samples_split=5, num_thresholds=10):
    """Recursively build a regression tree.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        y: 1-D array of targets with one entry per row of ``X``.
        depth: Depth of the current node (0 at the root).
        max_depth: Depth at which recursion stops and a leaf is produced.
        min_samples_split: Forwarded to ``best_split``.
        num_thresholds: Forwarded to ``best_split``.

    Returns:
        Either a leaf, represented as a built-in ``float`` holding the mean
        of ``y``, or an internal node represented as the tuple
        ``(feature_index, threshold, left_subtree, right_subtree)``.
    """
    # Stop when the depth budget is exhausted or all targets are identical.
    if depth >= max_depth or len(np.unique(y)) == 1:
        # Cast to a built-in float: np.mean on e.g. float32 targets returns
        # np.float32, which is not a `float` instance and would not be
        # recognised as a leaf by an isinstance(tree, float) check.
        return float(np.mean(y))

    feature_index, threshold = best_split(X, y, min_samples_split, num_thresholds)
    if feature_index is None:
        # No admissible split was found: this node becomes a leaf.
        return float(np.mean(y))

    X_left, y_left, X_right, y_right = split_dataset(X, y, feature_index, threshold)

    left_tree = build_tree(X_left, y_left, depth + 1, max_depth, min_samples_split, num_thresholds)
    right_tree = build_tree(X_right, y_right, depth + 1, max_depth, min_samples_split, num_thresholds)

    return (feature_index, threshold, left_tree, right_tree)

In [45]:
def predict_tree(X, tree):
    """Predict a target value for every row of ``X`` using a fitted tree.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        tree: Either a leaf (any scalar number) or an internal node given as
            ``(feature_index, threshold, left_subtree, right_subtree)``.

    Returns:
        1-D float array with one prediction per row of ``X``.
    """
    # Identify internal nodes structurally. Anything else is a leaf value,
    # so int or NumPy scalar leaves (e.g. np.float32) work as well as floats.
    if not isinstance(tree, (tuple, list)):
        return np.full(X.shape[0], float(tree))

    feature_index, threshold, left_tree, right_tree = tree
    left_mask = X[:, feature_index] <= threshold
    right_mask = ~left_mask

    # The two masks partition the rows, so every slot gets overwritten.
    predictions = np.zeros(X.shape[0])
    predictions[left_mask] = predict_tree(X[left_mask], left_tree)
    predictions[right_mask] = predict_tree(X[right_mask], right_tree)

    return predictions

In [47]:
# Read the raw train/test tables from the working directory.
train = pd.read_csv('linear_regression_train.csv')
test = pd.read_csv('linear_regression_test.csv')

# Remove the 'ID' column so it is not fed to the model as a feature.
train_data = train.drop(columns='ID')
test_data = test.drop(columns='ID')

# Training table: every column but the last is a feature, the last is the target.
X_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1].values
# Test table: all remaining columns are used as features.
X_test = test_data.values

In [49]:
# Fit the regression tree on the full training set; the result is a nested
# structure of split nodes and leaf values as returned by build_tree.
tree = build_tree(X_train, y_train, max_depth=30, min_samples_split=5, num_thresholds=10)

In [50]:
# Predict on the same data the tree was fitted on.
y_train_pred = predict_tree(X_train, tree)

# NOTE: this is the training-set R², so it measures fit, not generalization.
r2 = r2_score(y_train, y_train_pred)
print(f"R² Score: {r2:.4f}")

R² Score: 0.8916


In [54]:
# Run the fitted tree over the test features.
y_test_pred = predict_tree(X_test, tree)

# Pair each prediction with the ID of its test row and write the table to
# CSV without the DataFrame index.
output_path = "decision_tree_predictions(regression).csv"
output_df = pd.DataFrame(
    {
        'ID': test['ID'],
        'Predicted': y_test_pred,
    }
)
output_df.to_csv(output_path, index=False)

print(f"Test predictions saved to {output_path}")


Test predictions saved to decision_tree_predictions(regression).csv
