# Machine Learning

## Decision Tree As Regressor

In [53]:
# Importing Libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,root_mean_squared_error,r2_score


In [54]:
# Importing Dataset
df = sns.load_dataset("tips")
print(df.head())

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [56]:
# Check for missing values
print(df.isnull().sum().sort_values(ascending=False))


total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64


In [57]:
# Let's encode the object/category column in our dataset using LabelEncoder in For Loop
label_encoder = LabelEncoder()

# Loop through categorical columns and encode them
for column in df.select_dtypes(include=['category']).columns:
    df[column] = label_encoder.fit_transform(df[column])


In [58]:
# ...existing code...

# Remove outliers using IQR
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df_no_outliers = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Scaling features (except the target 'tip')
scaler = MinMaxScaler()
feature_cols = df_no_outliers.columns.drop('tip')
df_no_outliers[feature_cols] = scaler.fit_transform(df_no_outliers[feature_cols])

# ...existing code...

# Split the data into X and y.
X = df_no_outliers.drop("tip", axis=1)
y = df_no_outliers["tip"]
# Splitting dataset into train and test by 80/20 ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ...existing code...



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [59]:
%%time
# Train the Decision Tree model
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

# Predict the test data
y_pred_dt = dt.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred_dt)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_dt)
r2 = r2_score(y_test, y_pred_dt)

# Show some actual and predicted values
comparison = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred_dt})
print(comparison.head(5))

print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.3f}")
print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"R^2 Score: {r2:.3f}")

   Actual  Predicted
0    3.23       3.02
1    4.00       3.25
2    1.68       2.61
3    2.74       3.00
4    2.00       3.48
Mean Squared Error (MSE): 1.312
Root Mean Squared Error (RMSE): 1.146
Mean Absolute Error (MAE): 0.832
R^2 Score: -0.387
CPU times: total: 46.9 ms
Wall time: 870 ms


# Random Forest Regressor

In [60]:
from sklearn.ensemble import RandomForestRegressor

# Train the Random Forest model
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Predict the test data
y_pred_rf = rf.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)

# Show some actual and predicted values
comparison = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred_rf})
print(comparison.head(5))

print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.3f}")
print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"R^2 Score: {r2:.3f}")

   Actual  Predicted
0    3.23     2.2328
1    4.00     2.8275
2    1.68     2.4981
3    2.74     2.6672
4    2.00     3.5352
Mean Squared Error (MSE): 0.683
Root Mean Squared Error (RMSE): 0.827
Mean Absolute Error (MAE): 0.635
R^2 Score: 0.278


## XG Boost Regressor

In [61]:
from xgboost import XGBRegressor

# Train the XGBoost model
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)

# Predict the test data
y_pred_xgb = xgb.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred_xgb)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_xgb)
r2 = r2_score(y_test, y_pred_xgb)

# Show some actual and predicted values
comparison = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred_xgb})
print(comparison.head(5))

print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.3f}")
print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"R^2 Score: {r2:.3f}")

   Actual  Predicted
0    3.23   2.256634
1    4.00   3.147958
2    1.68   2.204896
3    2.74   1.085196
4    2.00   3.648058
Mean Squared Error (MSE): 1.123
Root Mean Squared Error (RMSE): 1.060
Mean Absolute Error (MAE): 0.830
R^2 Score: -0.187


## Cat Boost Regressor

In [62]:
from catboost import CatBoostRegressor

# Train the CatBoost model
cat = CatBoostRegressor(verbose=0, random_state=42)
cat.fit(X_train, y_train)

# Predict the test data
y_pred_cat = cat.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred_cat)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_cat)
r2 = r2_score(y_test, y_pred_cat)

# Show some actual and predicted values
comparison = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred_cat})
print(comparison.head(5))

print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.3f}")
print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"R^2 Score: {r2:.3f}")

   Actual  Predicted
0    3.23   2.138998
1    4.00   2.801503
2    1.68   2.255374
3    2.74   2.537046
4    2.00   3.248859
Mean Squared Error (MSE): 0.731
Root Mean Squared Error (RMSE): 0.855
Mean Absolute Error (MAE): 0.716
R^2 Score: 0.228


## Visual Comparison

In [63]:
import plotly.graph_objs as go

fig = go.Figure()

# Decision Tree
fig.add_trace(go.Scatter(
    x=y_test, y=y_pred_dt,
    mode='markers',
    name='Decision Tree',
    marker=dict(color='blue', symbol='circle')
))

# Random Forest
fig.add_trace(go.Scatter(
    x=y_test, y=y_pred_rf,
    mode='markers',
    name='Random Forest',
    marker=dict(color='green', symbol='square')
))

# XGBoost
fig.add_trace(go.Scatter(
    x=y_test, y=y_pred_xgb,
    mode='markers',
    name='XGBoost',
    marker=dict(color='red', symbol='diamond')
))

# CatBoost
fig.add_trace(go.Scatter(
    x=y_test, y=y_pred_cat,
    mode='markers',
    name='CatBoost',
    marker=dict(color='orange', symbol='cross')
))

# Perfect prediction line
fig.add_trace(go.Scatter(
    x=y_test, y=y_test,
    mode='lines',
    name='Perfect Prediction (y=x)',
    line=dict(color='black', dash='dash')
))

fig.update_layout(
    title='Actual vs Predicted Values (All Regressors)',
    xaxis_title='Actual',
    yaxis_title='Predicted',
    legend_title='Model',
    width=700,
    height=500
)

fig.show()