In [43]:
!pip install yappi



In [44]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the parquet file read in the
# preprocessing cell is located under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Profile Code

## Preprocessing

In [52]:
%%time
import pandas as pd
import numpy as np
import yappi

yappi.clear_stats()
# Start profiling
yappi.start()

# Read the parquet file
df = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/yellow_tripdata_2024-03.parquet')

# Select num_df_pandas variables
numerical_df_var = df[['fare_amount', 'passenger_count', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'Airport_fee']]

# Filter data
numerical_df_var = numerical_df_var[numerical_df_var[['fare_amount', 'passenger_count', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'Airport_fee']] >= 0]
numerical_var_above_zero = numerical_df_var[numerical_df_var['fare_amount'] > 0]

# Compute correlation matrix
correlation_matrix = numerical_var_above_zero.corr()
fare_amount_correlation = correlation_matrix["fare_amount"].drop("fare_amount")

# Select features based on correlation thresholds
threshold = 0.5  # Set your correlation threshold here
highly_correlated_features = fare_amount_correlation[fare_amount_correlation > 0.5]

highly_correlated_features = highly_correlated_features.index.values.tolist()
highly_correlated_features.append('fare_amount')
selected_features = numerical_var_above_zero[highly_correlated_features].dropna()

# Describe the selected features
print(selected_features.describe())

# Discretize the fare_amount
bins = [-float("inf"), 10, 20, float("inf")]
labels = ['Low', 'Medium', 'High']
selected_features['fare_discretized'] = pd.cut(selected_features['fare_amount'], bins=bins, labels=labels, right=False)

# Sample 100,000 rows from the selected features
selected_features = selected_features.loc[np.random.choice(selected_features.index, 100000, replace=False)]

# Prepare data for regression and classification
X_reg = selected_features[selected_features.columns.drop(['fare_amount', 'fare_discretized'])]
y_reg = selected_features['fare_amount']
X_cls = selected_features[selected_features.columns.drop(['fare_amount', 'fare_discretized'])]
y_cls = selected_features['fare_discretized']

# Stop profiling
yappi.stop()

# Get and sort function stats by total time (descending)
func_stats_preproc = yappi.get_func_stats().sort('ttot', 'desc')
# Get and sort thread stats by total time (descending)
thread_stats_preproc = yappi.get_thread_stats().sort('ttot', 'desc')

         tip_amount  tolls_amount  total_amount   Airport_fee   fare_amount
count  3.111328e+06  3.111328e+06  3.111328e+06  3.111328e+06  3.111328e+06
mean   3.587126e+00  6.074029e-01  2.840509e+01  1.482540e-01  1.924837e+01
std    4.107060e+00  2.238670e+00  2.282534e+01  4.873041e-01  1.807294e+01
min    0.000000e+00  0.000000e+00  1.010000e+00  0.000000e+00  1.000000e-02
25%    1.000000e+00  0.000000e+00  1.570000e+01  0.000000e+00  9.300000e+00
50%    2.860000e+00  0.000000e+00  2.060000e+01  0.000000e+00  1.280000e+01
75%    4.400000e+00  0.000000e+00  2.988000e+01  0.000000e+00  2.120000e+01
max    9.999900e+02  1.630000e+02  1.021990e+03  1.750000e+00  9.000000e+02
CPU times: user 3.88 s, sys: 2.96 s, total: 6.84 s
Wall time: 8.01 s


In [46]:
# Display the correlation matrix computed with DataFrame.corr() in the preprocessing cell.
correlation_matrix

Unnamed: 0,fare_amount,passenger_count,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
fare_amount,1.0,0.04502,0.139472,-0.280406,0.575012,0.619981,-0.017737,0.975624,-0.336617,0.596249
passenger_count,0.04502,1.0,-0.04911,-0.028342,0.020128,0.033921,0.002919,0.043816,0.019905,0.024575
extra,0.139472,-0.04911,1.0,0.037295,0.25665,0.256674,0.021709,0.243379,-0.005723,0.343247
mta_tax,-0.280406,-0.028342,0.037295,1.0,-0.150582,-0.266516,0.215204,-0.265602,0.231243,-0.022608
tip_amount,0.575012,0.020128,0.25665,-0.150582,1.0,0.479679,0.0172,0.713214,-0.085932,0.423821
tolls_amount,0.619981,0.033921,0.256674,-0.266516,0.479679,1.0,0.004787,0.703711,-0.167718,0.484825
improvement_surcharge,-0.017737,0.002919,0.021709,0.215204,0.0172,0.004787,1.0,-0.00455,0.028192,0.00342
total_amount,0.975624,0.043816,0.243379,-0.265602,0.713214,0.703711,-0.00455,1.0,-0.283166,0.633596
congestion_surcharge,-0.336617,0.019905,-0.005723,0.231243,-0.085932,-0.167718,0.028192,-0.283166,1.0,-0.384995
Airport_fee,0.596249,0.024575,0.343247,-0.022608,0.423821,0.484825,0.00342,0.633596,-0.384995,1.0


## XGBRegressor

In [48]:
%%time
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
# Clear previous profiling stats
yappi.clear_stats()
yappi.start()

# Step 3: Training and Validation
# Regression (XGBRegressor)
xgb_reg = XGBRegressor()
params_reg = {'n_estimators': [5, 10], 'max_depth': [5, 7]}
grid_reg = GridSearchCV(xgb_reg, params_reg, cv=5)
grid_reg.fit(X_reg, y_reg)
best_reg_model = grid_reg.best_estimator_

# Stop profiling
yappi.stop()

# Get and sort function stats by total time (descending)
func_stats_xgbreg = yappi.get_func_stats().sort('ttot', 'desc')
# Get and sort thread stats by total time (descending)
thread_stats_xgbreg = yappi.get_thread_stats().sort('ttot', 'desc')

CPU times: user 13.3 s, sys: 243 ms, total: 13.6 s
Wall time: 16.1 s


## Logistic Regression

In [49]:
%%time
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Clear previous profiling stats
yappi.clear_stats()
yappi.start()

# Classification (Logistic Regression)
scaler = StandardScaler()
X_cls_scaled = scaler.fit_transform(X_cls)

log_reg = LogisticRegression()
params_cls = {'C': [0.1, 1, 10]}
grid_cls = GridSearchCV(log_reg, params_cls, cv=5)
grid_cls.fit(X_cls_scaled, y_cls)
best_cls_model = grid_cls.best_estimator_

# Stop profiling
yappi.stop()

# Get and sort function stats by total time (descending)
func_stats_lgstc_reg = yappi.get_func_stats().sort('ttot', 'desc')
# Get and sort thread stats by total time (descending)
thread_stats_lgstc_reg = yappi.get_thread_stats().sort('ttot', 'desc')

CPU times: user 19 s, sys: 11.6 s, total: 30.5 s
Wall time: 19 s


## Test and evaluation

In [50]:
%%time
yappi.clear_stats()
yappi.start()
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report

# Example: Split data into train and test sets
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_cls_scaled, y_cls, test_size=0.2, random_state=42)

# Evaluate regression model
reg_predictions = best_reg_model.predict(X_test_reg)
reg_rmse = mean_squared_error(y_test_reg, reg_predictions, squared=False)
print(f"Regression RMSE: {reg_rmse}")

# Evaluate classification model
cls_predictions = best_cls_model.predict(X_test_cls)
cls_accuracy = accuracy_score(y_test_cls, cls_predictions)
cls_report = classification_report(y_test_cls, cls_predictions)

yappi.stop()
# Get and sort function stats by total time (descending)
func_stats_test_eval = yappi.get_func_stats().sort('ttot', 'desc')
# Get and sort thread stats by total time (descending)
thread_stats_test_eval = yappi.get_thread_stats().sort('ttot', 'desc')


print(f"Classification Accuracy: {cls_accuracy}")
print("Classification Report:")
print(cls_report)

Regression RMSE: 2.9163199843530925
Classification Accuracy: 0.9368
Classification Report:
              precision    recall  f1-score   support

        High       0.98      0.96      0.97      5404
         Low       0.91      0.93      0.92      5908
      Medium       0.93      0.92      0.93      8688

    accuracy                           0.94     20000
   macro avg       0.94      0.94      0.94     20000
weighted avg       0.94      0.94      0.94     20000

CPU times: user 765 ms, sys: 45.3 ms, total: 811 ms
Wall time: 767 ms


# Profile Results

#### Preprocessing

In [57]:
# Function-level profile of the preprocessing stage: the first ten entries of
# the stats that were sorted by total time (descending) when they were collected.
print("Top 10 Function Stats:")
for rank, fn_stat in enumerate(func_stats_preproc[:10], start=1):
    print(
        f"Function {rank}:\n"
        f"  Name: {fn_stat.full_name}\n"
        f"  Total Time: {fn_stat.ttot}\n"
        f"  Number of Calls: {fn_stat.ncall}\n"
        f"  Average Time per Call: {fn_stat.tavg}"
    )


# Thread-level profile of the same stage, again limited to the first ten entries.
print("\nTop 10 Thread Stats:")
for rank, th_stat in enumerate(thread_stats_preproc[:10], start=1):
    print(
        f"Thread {rank}:\n"
        f"  ID: {th_stat.id}\n"
        f"  Name: {th_stat.name}\n"
        f"  Total Time: {th_stat.ttot}\n"
        f"  Thread ID: {th_stat.tid}\n"
        f"  Schedule Count: {th_stat.sched_count}"
    )

Top 10 Function Stats:
Function 1:
  Name: /usr/local/lib/python3.10/dist-packages/pandas/core/frame.py:3713 DataFrame.__getitem__
  Total Time: 1.2689787750000001
  Number of Calls: 12
  Average Time per Call: 0.10574823125
Function 2:
  Name: /usr/local/lib/python3.10/dist-packages/pandas/core/generic.py:3940 DataFrame._take_with_is_copy
  Total Time: 1.251005576
  Number of Calls: 7
  Average Time per Call: 0.17871508228571428
Function 3:
  Name: /usr/local/lib/python3.10/dist-packages/pandas/core/generic.py:3911 DataFrame._take
  Total Time: 1.2498863020000002
  Number of Calls: 7
  Average Time per Call: 0.17855518600000003
Function 4:
  Name: /usr/local/lib/python3.10/dist-packages/pandas/core/internals/managers.py:929 BlockManager.take
  Total Time: 1.2490248430000002
  Number of Calls: 7
  Average Time per Call: 0.17843212042857146
Function 5:
  Name: /usr/local/lib/python3.10/dist-packages/pandas/core/frame.py:9980 DataFrame.corr
  Total Time: 1.213247274
  Number of Calls: 1


#### XGBRegressor

In [53]:

# Function-level profile of the XGBRegressor grid search: the first ten entries
# of the stats that were sorted by total time (descending) when collected.
print("Top 10 Function Stats:")
for rank, fn_stat in enumerate(func_stats_xgbreg[:10], start=1):
    print(
        f"Function {rank}:\n"
        f"  Name: {fn_stat.full_name}\n"
        f"  Total Time: {fn_stat.ttot}\n"
        f"  Number of Calls: {fn_stat.ncall}\n"
        f"  Average Time per Call: {fn_stat.tavg}"
    )

# Thread-level profile of the same stage, again limited to the first ten entries.
print("\nTop 10 Thread Stats:")
for rank, th_stat in enumerate(thread_stats_xgbreg[:10], start=1):
    print(
        f"Thread {rank}:\n"
        f"  ID: {th_stat.id}\n"
        f"  Name: {th_stat.name}\n"
        f"  Total Time: {th_stat.ttot}\n"
        f"  Thread ID: {th_stat.tid}\n"
        f"  Schedule Count: {th_stat.sched_count}"
    )

Top 10 Function Stats:
Function 1:
  Name: /usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py:737 GridSearchCV.fit
  Total Time: 6.888995883000001
  Number of Calls: 1
  Average Time per Call: 6.888995883000001
Function 2:
  Name: /usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py:1386 GridSearchCV._run_search
  Total Time: 6.739133323000001
  Number of Calls: 1
  Average Time per Call: 6.739133323000001
Function 3:
  Name: /usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py:808 evaluate_candidates
  Total Time: 6.739036638000001
  Number of Calls: 1
  Average Time per Call: 6.739036638000001
Function 4:
  Name: /usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py:40 Parallel.__call__
  Total Time: 6.727716314
  Number of Calls: 1
  Average Time per Call: 6.727716314
Function 5:
  Name: /usr/local/lib/python3.10/dist-packages/joblib/parallel.py:1902 Parallel.__call__
  Total Time: 6.727656280000001
  Nu

#### Logistic Regression

In [54]:
# Function-level profile of the logistic-regression grid search: the first ten
# entries of the stats that were sorted by total time (descending) when collected.
print("Top 10 Function Stats:")
for rank, fn_stat in enumerate(func_stats_lgstc_reg[:10], start=1):
    print(
        f"Function {rank}:\n"
        f"  Name: {fn_stat.full_name}\n"
        f"  Total Time: {fn_stat.ttot}\n"
        f"  Number of Calls: {fn_stat.ncall}\n"
        f"  Average Time per Call: {fn_stat.tavg}"
    )
# Thread-level profile of the same stage, again limited to the first ten entries.
print("\nTop 10 Thread Stats:")
for rank, th_stat in enumerate(thread_stats_lgstc_reg[:10], start=1):
    print(
        f"Thread {rank}:\n"
        f"  ID: {th_stat.id}\n"
        f"  Name: {th_stat.name}\n"
        f"  Total Time: {th_stat.ttot}\n"
        f"  Thread ID: {th_stat.tid}\n"
        f"  Schedule Count: {th_stat.sched_count}"
    )

Top 10 Function Stats:
Function 1:
  Name: /usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py:737 GridSearchCV.fit
  Total Time: 16.609687649
  Number of Calls: 1
  Average Time per Call: 16.609687649
Function 2:
  Name: /usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py:40 Parallel.__call__
  Total Time: 16.293948879000002
  Number of Calls: 17
  Average Time per Call: 0.9584675811176472
Function 3:
  Name: /usr/local/lib/python3.10/dist-packages/joblib/parallel.py:1902 Parallel.__call__
  Total Time: 16.293888938000002
  Number of Calls: 17
  Average Time per Call: 0.9584640551764707
Function 4:
  Name: /usr/local/lib/python3.10/dist-packages/joblib/parallel.py:1819 Parallel._get_sequential_output
  Total Time: 16.293659769
  Number of Calls: 65
  Average Time per Call: 0.25067168875384616
Function 5:
  Name: /usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py:111 _FuncWrapper.__call__
  Total Time: 16.263816197
  Number of Calls: 

#### Test and evaluation

In [55]:
# Function-level profile of the test/evaluation stage: the first ten entries of
# the stats that were sorted by total time (descending) when collected.
print("Top 10 Function Stats:")
for rank, fn_stat in enumerate(func_stats_test_eval[:10], start=1):
    print(
        f"Function {rank}:\n"
        f"  Name: {fn_stat.full_name}\n"
        f"  Total Time: {fn_stat.ttot}\n"
        f"  Number of Calls: {fn_stat.ncall}\n"
        f"  Average Time per Call: {fn_stat.tavg}"
    )

# Thread-level profile of the same stage, again limited to the first ten entries.
print("\nTop 10 Thread Stats:")
for rank, th_stat in enumerate(thread_stats_test_eval[:10], start=1):
    print(
        f"Thread {rank}:\n"
        f"  ID: {th_stat.id}\n"
        f"  Name: {th_stat.name}\n"
        f"  Total Time: {th_stat.ttot}\n"
        f"  Thread ID: {th_stat.tid}\n"
        f"  Schedule Count: {th_stat.sched_count}"
    )

Top 10 Function Stats:
Function 1:
  Name: /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:2195 classification_report
  Total Time: 0.65119401
  Number of Calls: 1
  Average Time per Call: 0.65119401
Function 2:
  Name: /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1406 precision_recall_fscore_support
  Total Time: 0.59055164
  Number of Calls: 4
  Average Time per Call: 0.14763791
Function 3:
  Name: /usr/local/lib/python3.10/dist-packages/numpy/lib/arraysetops.py:138 unique
  Total Time: 0.5576074870000001
  Number of Calls: 81
  Average Time per Call: 0.006884043049382717
Function 4:
  Name: /usr/local/lib/python3.10/dist-packages/sklearn/utils/_array_api.py:83 _NumPyApiWrapper.unique_values
  Total Time: 0.556856322
  Number of Calls: 56
  Average Time per Call: 0.009943862892857144
Function 5:
  Name: /usr/local/lib/python3.10/dist-packages/numpy/lib/arraysetops.py:323 _unique1d
  Total Time: 0.553958622
  Number of Calls: 81