# Decision Tree Regression (Baseline, Default Hyperparameters)

In [2]:
# 1. Import necessary libraries
import time  
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# Start the wall-clock timer; it covers loading, fitting, prediction and evaluation.
start_time = time.time()

# 2. Read the preprocessed train/test splits (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("preprocessed_train3.csv", "preprocessed_test3.csv")
)

# 3. Separate the regression target from the feature columns
target_column = 'baseFare'

y_train = train_df[target_column]
X_train = train_df.drop(columns=[target_column])

y_test = test_df[target_column]
# Drop the target and select the test columns in the training column order, so the
# model sees each feature in the same position it was fitted on.
X_test = test_df.drop(columns=[target_column])[X_train.columns]

# 4. Build and fit the tree (only the seed is set; `fit` returns the estimator itself)
regressor = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# 5. Generate predictions for the held-out test features
y_pred = regressor.predict(X_test)

# 6. Score the predictions against the true test targets
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is reported in the same units as the target
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One print call, one metric per line.
print(
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)

# Stop the timer and report the elapsed wall-clock time for this cell.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"\nTotal runtime: {elapsed_time:.2f} seconds")

Mean Squared Error (MSE): 21375.80
Root Mean Squared Error (RMSE): 146.20
Mean Absolute Error (MAE): 88.71
R² Score: 0.03

Total runtime: 34.79 seconds


# Decision Tree Regression with Hyperparameter Tuning (GridSearchCV)

In [4]:
# The timer below needs `time`; import it here so this cell also runs on its own
# (e.g. after a kernel restart) instead of relying on an earlier cell's import.
import time

# Wall-clock timer for the whole cell: imports, loading, grid search and evaluation.
start_time = time.time()

# 1. Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# 2. Load your train and test sets
train_df = pd.read_csv("preprocessed_train3.csv")
test_df = pd.read_csv("preprocessed_test3.csv")

# 3. Define target and features
target_column = 'baseFare'
X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]

X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column]

# Put the test columns in the training column order so features line up by position.
X_test = X_test[X_train.columns]

# 4. Set up the model and parameter grid for GridSearchCV
regressor = DecisionTreeRegressor(random_state=42)

# 7 depths x 5 leaf sizes = 35 candidates; with cv=5 that is 175 cross-validation fits.
param_grid = {
    'max_depth': [3, 5, 7, 10, 15, 20, None],
    'min_samples_leaf': [1, 2, 5, 10, 20]
}

# 5. Perform Grid Search with 5-fold Cross-Validation
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=5,
    scoring='r2',   # candidates are ranked by mean cross-validated R^2
    n_jobs=-1,      # run the fits on all available CPU cores
    refit=True,     # after the search, refit the best candidate on all of X_train
)
grid_search.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search.best_params_}")

# 6. Retrieve the best model found
# Because refit=True, `best_estimator_` has already been trained on the full training
# set with the winning parameters. Calling .fit() on it again would rebuild the same
# tree (the seed is fixed) and only add runtime, so no second fit is done here.
best_regressor = grid_search.best_estimator_

# 7. Predict on the specified test set
y_pred = best_regressor.predict(X_test)

# 8. Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is reported in the same units as the target
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R^2 Score: {r2:.2f}")

# Stop the timer and report the elapsed wall-clock time for this cell.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"\nTotal runtime: {elapsed_time:.2f} seconds")

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 20}
Mean Squared Error (MSE): 14807.22
Root Mean Squared Error (RMSE): 121.68
Mean Absolute Error (MAE): 78.81
R^2 Score: 0.33

Total runtime: 381.83 seconds
