In [8]:
import pandas                       as pd
import numpy                        as np
import matplotlib.pyplot            as plt
from sklearn.tree                   import DecisionTreeRegressor, plot_tree
from sklearn.model_selection        import train_test_split, KFold, cross_val_score
from sklearn.linear_model           import LinearRegression
from sklearn.preprocessing          import StandardScaler, PolynomialFeatures
from sklearn.compose                import ColumnTransformer
from sklearn.pipeline               import Pipeline
from sklearn.metrics                import mean_squared_error

# Load the yellow-tripdata CSV from its hosted location into a DataFrame
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/pu9kbeSaAtRZ7RxdJKX9_A/yellow-tripdata.csv'
raw_data = pd.read_csv(filepath_or_buffer=url)

# Preview the first rows (head() returns five rows by default)
raw_data.head()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,mta_tax,tolls_amount,improvement_surcharge,tip_amount
0,2,1,17.63,2,1,132,164,1,70.0,0.5,6.94,1,16.54
1,2,1,19.52,2,1,132,236,1,70.0,0.5,6.94,1,16.19
2,2,1,17.81,2,1,132,48,1,70.0,0.5,6.94,1,12.0
3,2,2,19.3,2,1,132,148,1,70.0,0.5,0.0,1,5.0
4,2,1,18.75,2,1,132,234,1,70.0,0.5,6.94,1,10.0


In [9]:
# Split data into input features and target.
# 'Unnamed: 0' is excluded together with the target: in the data preview it simply
# counts rows (0, 1, 2, ...), which looks like a saved DataFrame index rather than a
# trip attribute, so it should not be offered to the models as a predictor.
# errors='ignore' keeps this cell working if that column is absent
# (for example when the CSV is loaded with index_col=0).
non_feature_columns = ['tip_amount', 'Unnamed: 0']
X = raw_data.drop(columns=non_feature_columns, errors='ignore')
y = raw_data['tip_amount']

# Split data into train and test sets: 20% of the rows are held out for the final
# evaluation, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Regression-tree preprocessing: standardize every feature column of the training frame
preprocessor_tree = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), X_train.columns)],
)

# Regression-tree pipeline: preprocessing followed by a depth-2 tree with a fixed seed
pipeline_tree_reg = Pipeline(
    steps=[
        ('preprocessor', preprocessor_tree),
        ('tree', DecisionTreeRegressor(max_depth=2, criterion='squared_error', random_state=42)),
    ],
)

# Preprocessor for polynomial model.
# The steps are chained in a Pipeline so they run one after the other: the features
# are standardized first, and the degree-2 polynomial expansion is then built from the
# standardized values. (A ColumnTransformer would run both transformers side by side
# on the raw columns and concatenate the results, yielding the scaled columns plus an
# unscaled polynomial expansion in which every linear term appears a second time.)
# include_bias=False because LinearRegression fits its own intercept.
preprocessor_poly = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
])

# Pipeline for polynomial model: scaled polynomial features fed to ordinary least squares
pipeline_poly = Pipeline([
     ('preprocessor', preprocessor_poly),
     ('poly_model', LinearRegression()),
])

In [None]:
# Cross-validation scheme: 10 shuffled folds with a fixed seed
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold scores come back as negated MSE, so flip the sign of the fold average
neg_mse_folds_tree = cross_val_score(
    pipeline_tree_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=kfold
)
neg_mse_folds_poly = cross_val_score(
    pipeline_poly, X_train, y_train, scoring='neg_mean_squared_error', cv=kfold
)
MSE_tree = -neg_mse_folds_tree.mean()
MSE_poly = -neg_mse_folds_poly.mean()

# RMSE is the square root of the averaged MSE
RMSE_tree, RMSE_poly = np.sqrt(MSE_tree), np.sqrt(MSE_poly)

print(f"TRAINING SET: \nRMSE using regression tree: {RMSE_tree}, RMSE using polynomial linear regression: {RMSE_poly}")

# Fit both pipelines on the whole training split (fit() hands back the fitted pipeline)
tree_model = pipeline_tree_reg.fit(X_train, y_train)
poly_model = pipeline_poly.fit(X_train, y_train)

# Regression tree: predict the held-out rows, then MSE -> RMSE against the true tips
y_pred_tree = tree_model.predict(X_test)
MSE_tree = mean_squared_error(y_true=y_test, y_pred=y_pred_tree)
RMSE_tree = np.sqrt(MSE_tree)

# Polynomial regression: same evaluation on the same held-out rows
y_pred_poly = poly_model.predict(X_test)
MSE_poly = mean_squared_error(y_true=y_test, y_pred=y_pred_poly)
RMSE_poly = np.sqrt(MSE_poly)

print(f"TEST SET: \nRMSE using regression tree: {RMSE_tree}, RMSE using polynomial linear regression: {RMSE_poly}")

# Display the tree to get an idea of how our predictions were made.
# feature_names is passed as a plain list of strings: some scikit-learn releases
# validate this argument as a list and reject a pandas Index.
plot_tree(tree_model["tree"], feature_names=list(X_train.columns))
plt.show()