In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/Patient-Recovery-Prediction-Challenge/sample_submission.csv
/kaggle/input/Patient-Recovery-Prediction-Challenge/train.csv
/kaggle/input/Patient-Recovery-Prediction-Challenge/test.csv


Functions to build the tree below

In [2]:
import math
import random
import csv
import time

# ====================================================================
# I. CORE HELPER FUNCTIONS
# ====================================================================

def mean(data):
    """Calculates the mean of a list of numerical values."""
    if not data: return 0
    return sum(data) / len(data)

def calculate_mse(data):
    """Calculates the Mean Squared Error (MSE) for a regression task."""
    if len(data) == 0: return 0
    target_values = [row[-1] for row in data]
    avg = mean(target_values)
    mse = sum([(val - avg) ** 2 for val in target_values]) / len(target_values)
    return mse

def calculate_rmse(predictions, actuals):
    """Calculates the Root Mean Squared Error (RMSE)."""
    if len(predictions) != len(actuals) or not predictions: return float('inf')
    mse = mean([(pred - actual) ** 2 for pred, actual in zip(predictions, actuals)])
    return math.sqrt(mse)

def split_data(data, feature_index, threshold):
    """Splits data based on a feature index and a threshold."""
    left = [row for row in data if row[feature_index] <= threshold]
    right = [row for row in data if row[feature_index] > threshold]
    return left, right

def bootstrap_sample(data, subsample_ratio=1.0):
    """Generates a random sample with replacement (bootstrap) using a specified ratio."""
    n_samples_full = len(data)
    n_samples = int(n_samples_full * subsample_ratio)
    
    if n_samples == 0 or n_samples_full == 0: return []
    
    sample = []
    for _ in range(n_samples):
        sample.append(random.choice(data))
    return sample

# ====================================================================
# II. CUSTOM DECISION TREE REGRESSOR
# ====================================================================

class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class RegressionTree:
    def __init__(self, max_depth=10, min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=None, max_features=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.max_features = max_features
        self.root = None
        self.leaf_node_count = 0

    def _get_best_split(self, data):
        best_gain = -1
        best_split = None
        if not data: return None

        n_features = len(data[0]) - 1
        features_to_try = range(n_features)
        
        # Feature Selection
        if self.max_features is not None and self.max_features < n_features:
            features_to_try = random.sample(range(n_features), self.max_features)

        initial_mse = calculate_mse(data)

        for feature_index in features_to_try:
            possible_thresholds = sorted(list(set([row[feature_index] for row in data])))
            
            for i in range(len(possible_thresholds) - 1):
                threshold = (possible_thresholds[i] + possible_thresholds[i+1]) / 2 

                left_data, right_data = split_data(data, feature_index, threshold)

                # Check split and leaf constraints
                if len(left_data) < self.min_samples_split or len(right_data) < self.min_samples_split: continue
                if len(left_data) < self.min_samples_leaf or len(right_data) < self.min_samples_leaf: continue

                n = len(data)
                n_left, n_right = len(left_data), len(right_data)
                weighted_mse = (n_left / n * calculate_mse(left_data)) + (n_right / n * calculate_mse(right_data))
                gain = initial_mse - weighted_mse

                if gain > best_gain:
                    best_gain = gain
                    best_split = {
                        'feature_index': feature_index,
                        'threshold': threshold,
                        'left': left_data,
                        'right': right_data
                    }
        
        return best_split

    def _build_tree(self, data, current_depth=0):
        target_values = [row[-1] for row in data]
        
        # Check Max Leaf Nodes limit before processing further splits
        is_max_leaves_reached = self.max_leaf_nodes is not None and self.leaf_node_count >= self.max_leaf_nodes

        # --- BASE CASE: Stop conditions ---
        if current_depth >= self.max_depth or \
           len(data) < self.min_samples_split or \
           len(data) < 2 * self.min_samples_leaf or \
           len(set(target_values)) <= 1 or \
           is_max_leaves_reached: # If global limit reached, force leaf
            
            # Create Leaf Node
            self.leaf_node_count += 1
            return Node(value=mean(target_values))
        
        split = self._get_best_split(data)

        if not split: 
            # If no good split found, create leaf
            self.leaf_node_count += 1
            return Node(value=mean(target_values))
        
        # --- RECURSIVE STEP ---
        left_child = self._build_tree(split['left'], current_depth + 1)
        right_child = self._build_tree(split['right'], current_depth + 1)

        return Node(
            feature_index=split['feature_index'],
            threshold=split['threshold'],
            left=left_child,
            right=right_child
        )

    def fit(self, data):
        self.leaf_node_count = 0 # Reset count for new tree
        self.root = self._build_tree(data)

    def _predict_one(self, sample, node):
        if node.value is not None: return node.value 
        
        feature_value = sample[node.feature_index]
        
        if feature_value <= node.threshold:
            return self._predict_one(sample, node.left)
        else:
            return self._predict_one(sample, node.right)

    def predict(self, samples):
        return [self._predict_one(sample, self.root) for sample in samples]


# ====================================================================
# III. RANDOM FOREST REGRESSOR
# ====================================================================

class RandomForestRegressor:
    def __init__(self, n_trees=100, max_depth=10, min_samples_split=2, min_samples_leaf=1, 
                 max_features='sqrt', max_leaf_nodes=None, subsample_ratio=1.0, 
                 random_data_seed=None, random_feature_seed=None):
        
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.subsample_ratio = subsample_ratio
        self.max_features_setting = max_features
        self.random_data_seed = random_data_seed
        self.random_feature_seed = random_feature_seed
        self.trees = []
        self.max_features = None

    def fit(self, data):
        self.trees = []
        if not data: return
        
        if self.random_data_seed is not None:
            random.seed(self.random_data_seed)

        n_features = len(data[0]) - 1
        
        # Calculate max_features integer value
        if isinstance(self.max_features_setting, str):
            if self.max_features_setting == 'sqrt':
                self.max_features = int(math.sqrt(n_features))
            elif self.max_features_setting == 'log2':
                self.max_features = int(math.log2(n_features))
            else:
                self.max_features = n_features
        elif isinstance(self.max_features_setting, int):
            self.max_features = self.max_features_setting
        else:
            self.max_features = n_features

        # print(f"Training {self.n_trees} trees...")
        for i in range(self.n_trees):
            
            # Set feature seed for reproducible feature selection within this tree
            if self.random_feature_seed is not None:
                random.seed(self.random_feature_seed + i) 
            
            # 1. Bootstrap Sample
            bootstrapped_data = bootstrap_sample(data, self.subsample_ratio) 
            
            # 2. Build Tree
            tree = RegressionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_leaf_nodes=self.max_leaf_nodes,
                max_features=self.max_features
            )
            tree.fit(bootstrapped_data)
            self.trees.append(tree)

    def predict(self, samples):
        if not self.trees: raise Exception("Model not fitted yet!")
            
        all_predictions = []
        for tree in self.trees:
            all_predictions.append(tree.predict(samples))
        
        transposed_preds = list(zip(*all_predictions))
        final_predictions = [mean(sample_preds) for sample_preds in transposed_preds]
        
        return final_predictions

Preprocess the data

In [None]:
# Function to calculate Root Mean Squared Error (RMSE)
def calculate_rmse(predictions, actuals):
    if len(predictions) != len(actuals) or not predictions:
        return float('inf')
    mse = mean([(pred - actual) ** 2 for pred, actual in zip(predictions, actuals)])
    return math.sqrt(mse)

def load_and_preprocess(file_path, is_train=True):
    """
    Loads data, extracts core features, adds 7 new engineered features, 
    and returns features (+ target if is_train=True).
    Assumes CSV structure: ID, F1, F2, F3, F4, F5, [TARGET]
    """
    data = []
    
    # Epsilon to prevent division by zero
    EPSILON = 1e-6 
    
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        header = next(reader) # Skip header row
        
        for row in reader:
            try:
                # 1. Extract Core Features (Indices 1 to 5)
                therapy_hours = float(row[1])      # F1
                health_score = float(row[2])       # F2
                lifestyle = 1.0 if row[3] == 'Yes' else 0.0 # F3
                sleep_hours = float(row[4])        # F4
                follow_up_sessions = float(row[5]) # F5
                
                # Start feature list with core numerical features (5 features)
                features = [therapy_hours, health_score, lifestyle, sleep_hours, follow_up_sessions]
                
                # 2. FEATURE ENGINEERING (Adding 7 New Features)
                
                # E1: Therapy-Session Load (Interaction)
                features.append(therapy_hours * follow_up_sessions) 
                
                # E2: Therapy-to-Health Ratio (Ratio)
                features.append(therapy_hours / (health_score + EPSILON))

                # E3: Therapy-to-Sleep Ratio (Ratio)
                features.append(therapy_hours / (sleep_hours + EPSILON))
                
                # E4: High Sessions Flag (Threshold) - Using 15 as a high threshold
                features.append(1.0 if follow_up_sessions > 15 else 0.0) 
                
                # E5: Sleep-Lifestyle Harmony (Interaction)
                features.append(sleep_hours * lifestyle)
                
                # E6: Health-to-Sleep Ratio (Ratio)
                features.append(health_score / (sleep_hours + EPSILON))

                # E7: Sleep Squared (Transformation for non-linearity)
                features.append(sleep_hours ** 2)
                
                # 3. Add Target for Training
                if is_train:
                    target = float(row[6])
                    data.append(features + [target])
                else:
                    # For test data, just append features
                    data.append(features)
            
            except (ValueError, IndexError) as e:
                print(f"Error processing row: {row}. Skipping. Error: {e}")
                
    return data

def grid_search_tuning(train_set, val_features, val_targets, param_grid):
    best_rmse = float('inf')
    best_params = {}
    
    print("\n--- Starting Grid Search Tuning ---")
    print(f"Total combinations to test: {len(param_grid)}")

    start_time = time.time()
    
    for i, params in enumerate(param_grid):
        # Time each run to track efficiency
        run_start = time.time()
        
        # Ensure random seed is set for data sampling and feature selection
        if 'random_data_seed' not in params: params['random_data_seed'] = 42
        if 'random_feature_seed' not in params: params['random_feature_seed'] = 100
        
        print(f"\n[{i+1}/{len(param_grid)}] Testing: {params}")
        
        try:
            rf = RandomForestRegressor(**params)
            rf.fit(train_set)
            
            val_predictions = rf.predict(val_features)
            rmse = calculate_rmse(val_predictions, val_targets)
            
            run_time = time.time() - run_start
            print(f"  -> Validation RMSE: {rmse:.4f} (Time: {run_time:.2f}s)")

            if rmse < best_rmse:
                best_rmse = rmse
                best_params = params
                print("  (NEW BEST MODEL FOUND!)")
                
        except Exception as e:
            print(f"  ERROR during training/prediction: {e}")
            
    total_time = time.time() - start_time
    
    print("\n--- Grid Search Complete ---")
    print(f"Total search time: {total_time:.2f} seconds")
    return best_params, best_rmse

# --- EXECUTION ---

# !!! CRITICAL: UPDATE THIS PATH !!!
TRAIN_FILE = '/kaggle/input/Patient-Recovery-Prediction-Challenge/train.csv'

try:
    full_train_data = load_and_preprocess(TRAIN_FILE, is_train=True)
except FileNotFoundError:
    print(f"ERROR: Train file not found at {TRAIN_FILE}. Please correct the path.")
    raise

# Set seed for reproducible data split
random.seed(42) 
random.shuffle(full_train_data)
split_ratio = 0.8
split_point = int(len(full_train_data) * split_ratio)

train_data = full_train_data[:split_point]
validation_data = full_train_data[split_point:]

val_features = [row[:-1] for row in validation_data]
val_targets = [row[-1] for row in validation_data]

print(f"Total records: {len(full_train_data)} | Train: {len(train_data)} | Validation: {len(validation_data)}")
print(f"Number of features (Core + Engineered): {len(train_data[0]) - 1}") # Should be 12 features


# Assuming 'fixed_params', 'grid_search_tuning', and the data are loaded/defined above.

# --- AGGRESSIVE HIGH-RESOLUTION PARAMETER DEFINITIONS ---
fine_n_trees = [100]
fine_max_depth = [14, 17] 
fine_max_features = [4, 6, 8, 10] 
fine_min_leaf = [3, 5]
fine_subsample_ratio = [0.85] # Introducing subsample ratio into the loop
fine_max_leaf_nodes = [None] 

param_grid_aggressive = []
random_seed = 42

# Re-establish a sensible 'fixed_params':
fixed_params = {
    'min_samples_split': 6,
    'random_data_seed': 42,
    'random_feature_seed': 100
}

# --- GENERATE AGGRESSIVE GRID (2 * 3 * 4 * 2 * 2 * 1 = 96 Combinations) ---
for n_t in fine_n_trees:
    for m_d in fine_max_depth:
        for m_f in fine_max_features:
            for m_l in fine_min_leaf:
                for s_r in fine_subsample_ratio:
                    for M_L_N in fine_max_leaf_nodes:
                        params = fixed_params.copy()
                        params.update({
                            'n_trees': n_t,
                            'max_depth': m_d,
                            'max_features': m_f,
                            'min_samples_leaf': m_l,
                            'subsample_ratio': s_r,
                            'max_leaf_nodes': M_L_N
                        })
                        param_grid_aggressive.append(params)

print(f"Total combinations for Aggressive Retuning Grid: {len(param_grid_aggressive)}")

# --- EXECUTE THE GRID SEARCH ---
# NOTE: This assumes 'train_data', 'val_features', and 'val_targets' 
# have been correctly loaded with the new 12 features.
best_params, best_rmse = grid_search_tuning(train_data, val_features, val_targets, param_grid_aggressive)

print("\n=========================================================")
print("  FINAL BEST MODEL RESULTS (Aggressive Retuning with 12 Features)")
print(f"  Best Parameters: {best_params}")
print(f"  Best Validation RMSE: {best_rmse:.4f}")
print("=========================================================")

Total records: 8000 | Train: 6400 | Validation: 1600
Number of features (Core + Engineered): 12
Total combinations for Aggressive Retuning Grid: 16

--- Starting Grid Search Tuning ---
Total combinations to test: 16

[1/16] Testing: {'min_samples_split': 6, 'random_data_seed': 42, 'random_feature_seed': 100, 'n_trees': 100, 'max_depth': 14, 'max_features': 4, 'min_samples_leaf': 3, 'subsample_ratio': 0.85, 'max_leaf_nodes': None}
  -> Validation RMSE: 2.1511 (Time: 171.99s)
  (NEW BEST MODEL FOUND!)

[2/16] Testing: {'min_samples_split': 6, 'random_data_seed': 42, 'random_feature_seed': 100, 'n_trees': 100, 'max_depth': 14, 'max_features': 4, 'min_samples_leaf': 5, 'subsample_ratio': 0.85, 'max_leaf_nodes': None}
  -> Validation RMSE: 2.1535 (Time: 169.26s)

[3/16] Testing: {'min_samples_split': 6, 'random_data_seed': 42, 'random_feature_seed': 100, 'n_trees': 100, 'max_depth': 14, 'max_features': 6, 'min_samples_leaf': 3, 'subsample_ratio': 0.85, 'max_leaf_nodes': None}
  -> Validatio

In [None]:
import csv
# (Ensure RandomForestRegressor and load_and_preprocess are defined in your notebook)

# --- 1. SETUP: FINAL TRAINED MODEL AND PATHS ---

# You must redefine your final parameters here based on your BEST TUNING RESULT (RMSE: 2.1275)
final_params = {
    'max_features': 3, 'subsample_ratio': 0.85, 'min_samples_split': 6, 
    'min_samples_leaf': 5, 'random_data_seed': 42, 'random_feature_seed': 100, 
    'n_trees': 250, 'max_depth': 13, 'max_leaf_nodes': None
}

# Fix 1: Correct TRAIN_FILE path (assuming this is the training data path)
TRAIN_FILE = '/kaggle/input/Patient-Recovery-Prediction-Challenge/train.csv'

# --- CORRECTED TRAINING DATA LOAD ---
try:
    # FIX: is_train must be True to load the target column required for training
    full_train_data = load_and_preprocess(TRAIN_FILE, is_train=True) 
    print(f"Loaded {len(full_train_data)} records for final training.")
except FileNotFoundError:
    print(f"ERROR: Train file not found at {TRAIN_FILE}. Please correct the path.")
    raise

# 1. Instantiate the final model
final_rf = RandomForestRegressor(**final_params)

# 2. Train on ALL available training data
print("Starting FINAL training...")
final_rf.fit(full_train_data) # This will now work as full_train_data has the target
print("Final model successfully trained.")

# !!! CRITICAL: UPDATE THIS PATH TO YOUR KAGGLE TEST FILE LOCATION !!!
TEST_FILE = '/kaggle/input/Patient-Recovery-Prediction-Challenge/test.csv' 


# --- 2. FUNCTION TO LOAD TEST DATA (Includes ID Handling) ---

def load_test_data(file_path):
    """Loads test data, extracting IDs and features using indices consistent with training."""
    test_features = []
    test_ids = []
    
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        header = next(reader)  # Skip header
        
        # Assumption based on error fixing: Test file has an ID column at index 0.
        
        for row in reader:
            test_ids.append(row[0])  # ID is the first column (index 0)
            
            try:
                # Features start at index 1 (F1=Therapy)
                
                # Therapy Hours (1), Initial Health Score (2)
                features = [float(row[1]), float(row[2])] 
                
                # Lifestyle Activities (3): 'Yes' -> 1.0, 'No' -> 0.0
                features.append(1.0 if row[3] == 'Yes' else 0.0)
                
                # Average Sleep Hours (4), Follow-Up Sessions (5)
                features.extend([float(row[4]), float(row[5])])
                
                test_features.append(features)
                
            except (ValueError, IndexError) as e:
                print(f"Error processing row: {row}. Check test file column indices. Error: {e}")
                
    return test_ids, test_features


# --- 3. EXECUTION: LOAD, PREDICT, AND SUBMIT ---

print("Starting prediction process...")

# Load Test Data
try:
    test_ids, test_features = load_test_data(TEST_FILE)
    print(f"Successfully loaded {len(test_features)} test samples.")
except FileNotFoundError:
    print(f"FATAL ERROR: Test file not found at {TEST_FILE}. FIX THE PATH.")
    raise

# Generate Predictions
test_predictions = final_rf.predict(test_features)

# Round the predictions to the nearest integer (standard for many Kaggle regression targets)
final_predictions_rounded = [round(p) for p in test_predictions]

# Create Submission File
SUBMISSION_FILE = 'submission.csv'
submission_rows = []
for patient_id, prediction in zip(test_ids, final_predictions_rounded):
    submission_rows.append([patient_id, prediction])

with open(SUBMISSION_FILE, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Recovery Index']) # Standard Kaggle submission header
    writer.writerows(submission_rows)

print("\n------------------------------------------------------")
print(f"✅ Submission file '{SUBMISSION_FILE}' created successfully.")
print(f"Predictions generated for {len(submission_rows)} patients.")
print("Your custom Random Forest project is complete! You can now submit this file.")
print("------------------------------------------------------")