In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import tqdm
import timeit
import textwrap
import sklearn
import xgboost
import unittest

from scipy import stats
from scipy.stats import zscore
from scipy.stats import skew
from scipy.optimize import curve_fit

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn import linear_model
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.model_selection import GridSearchCV

pd.options.mode.chained_assignment = None

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

In [41]:
path = 'C:/Users/delst/OneDrive - Queen Mary, University of London/Desktop/VSCode/Advanced_Projects/Uber_Demand_Project/data_archive/*.csv'
input_files = glob.glob(path)

In [42]:
def file_load(path):
        
    for i, file in enumerate(input_files):
        print(file)
        globals()[f'df{i+1}'] = pd.read_csv(input_files[i])
    print(f'Total number of files loaded: {len(input_files)}')
    
file_load(path)

C:/Users/delst/OneDrive - Queen Mary, University of London/Desktop/VSCode/Advanced_Projects/Uber_Demand_Project/data_archive\uber.csv
Total number of files loaded: 1


In [43]:
df_store = [eval(f'df{i+1}') for i in range(len(input_files))]   # Dataframe preperation

In [44]:
class Dataset:
    def __init__(self, data, threshold):
        self.data = data
        self.threshold = threshold
    
    def data_transform(self):
        self.data['pickup_datetime'] = pd.to_datetime(self.data['pickup_datetime']).dt.tz_localize(None)
        self.data['pickup_datetime'] = self.data['pickup_datetime'].dt.floor('H')
        
        self.data['Label_Hour'] = self.data['pickup_datetime'].dt.hour
        self.data['Label_Date'] = self.data['pickup_datetime'].dt.date

        self.data['day_name'] = pd.to_datetime(self.data['pickup_datetime']).dt.day_name()
        self.data['day_of_the_week'] = pd.to_datetime(self.data['pickup_datetime']).dt.weekday
        return self.data
            
    def data_remove_outliers(self):
        self.data = self.data[(np.abs(stats.zscore(self.data['passenger_count'])) < 2)]
        self.data = self.data[(np.abs(stats.zscore(self.data['fare_amount'])) < 2)]
        return self.data

    def data_clean(self):
        self.data['fare_amount'] = abs(self.data.fare_amount)
        self.data.passenger_count.replace(0, 1, inplace=True)
        self.data.fare_amount.replace(0, 1, inplace=True)
        self.data = self.data.dropna()
        return self.data
    
    def winsorization(self):
        median = self.data['sPED'].median()
        lower_quantile = self.data['sPED'].quantile(self.threshold)
        upper_quantile = self.data['sPED'].quantile(1-self.threshold)
        self.data['sPED'][self.data['sPED'] < lower_quantile] = -1*median
        self.data['sPED'][self.data['sPED'] > upper_quantile] = median
        return self.data
    
    def index_set(self):
        self.data = self.data.set_index('pickup_datetime')
        self.data.index = self.data.index.strftime('%Y-%m-%d-%H')
        return self.data

class GroupClass:
    def __init__(self, data, col_group):
        self.data = data
        self.col_group = col_group
    
    def group_by_hour(self, col_group):
        self.data = self.data[col_group].groupby(['pickup_datetime'])['passenger_count','fare_amount'].agg({'passenger_count':'mean','fare_amount':'mean'}).reset_index()
        return self.data
    
    def group_by_day(self, col_group):
        self.data = self.data[col_group].groupby(['Label_Date'])['passenger_count','fare_amount'].agg({'passenger_count':'mean','fare_amount':'mean'}).reset_index()
        return self.data
    
class FeatureExtraction:
    def __init__(self, data, col_feature, n_window):
        self.data = data
        self.col_feature = col_feature
        self.n_window = n_window

    def calc_pct_change(self, col_feature):
        for col in col_feature:
            self.data[col + '_pct_change'] = self.data[col].pct_change()
        return self.data
    
    def calc_sma(self, col, n_window):
        self.data[col + '_sma'] = self.data[col].rolling(window=self.n_window).mean()
        return self.data
        
    def calc_ema(self, col, n_window):
        self.data[col + '_ema'] = self.data[col].ewm(span=n_window, adjust=False).mean()
        return self.data
        
    def calc_sma_PED(self):
        self.data['sPED'] = self.data.passenger_count_pct_change_sma / self.data.fare_amount_pct_change_sma
        return self.data
        
    def calc_ema_PED(self):
        self.data['ePED'] = self.data.passenger_count_pct_change_ema / self.data.fare_amount_pct_change_ema
        return self.data

class ModelSelection:
    def __init__(self, data, input_columns, target_column, cv_splits, test_split):
        self.data = data
        self.input_columns = input_columns
        self.target_column = target_column
        self.cv_splits = cv_splits
        self.test_split = test_split
        # self.models = [DecisionTreeRegressor(random_state=0), LinearRegression(),
        #                RandomForestRegressor(), 
        #                SVR(C=1.0, epsilon=0.2), 
        #                xgboost.XGBRegressor(n_estimators=100, max_depth=5, eta=0.1, subsample=1-test_split)]
        self.models = {
            'DecisionTreeRegressor': DecisionTreeRegressor(random_state=0),
            'LinearRegression': LinearRegression(),
            'RandomForestRegressor': RandomForestRegressor(), 
            'SVR': SVR(C=1.0, epsilon=0.2), 
            'XGBRegressor': xgboost.XGBRegressor(n_estimators=100, max_depth=5, eta=0.1, subsample=1-test_split)
        }
        
    def split_dataset(self):
        input_features = self.data.drop(input_columns, axis=1)
        target_variable = self.data[target_column]
        X_train, X_test, y_train, y_test = train_test_split(input_features, target_variable, shuffle=False, test_size=test_split)
        return X_train, X_test, y_train, y_test
    
    def model_evaluation(self, model, X_train, y_train, cv_splits):
        std_clf = make_pipeline(StandardScaler(), model)

        r2_scores = []
        mae_scores = []
        rmse_scores = []

        kf = KFold(n_splits=cv_splits, shuffle=False)
        for train_index, test_index in kf.split(X_train):
            X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
            y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

            std_clf.fit(X_train_cv, y_train_cv)
            y_pred = std_clf.predict(X_test_cv)

            r2_scores.append(r2_score(y_test_cv, y_pred))
            mae_scores.append(mean_absolute_error(y_test_cv, y_pred))
            rmse_scores.append(np.sqrt(mean_squared_error(y_test_cv, y_pred)))

        return {"Mean R^2": sum(r2_scores) / cv_splits, 
                "Mean MAE": sum(mae_scores) / cv_splits,
                "Mean RMSE": sum(rmse_scores) / cv_splits}
        
    def run_model_evaluation(self):
        X_train, X_test, y_train, y_test = self.split_dataset()
        results = {}
        for model_name, model in self.models.items():
            results[model_name] = self.model_evaluation(model, X_train, y_train, cv_splits)
        return results
    
    def select_initial_model(self):
        X_train, X_test, y_train, y_test = self.split_dataset()
        best_model = None
        best_score = 0
        for model_name, model in self.models.items():
            # Define hyperparameters for grid search
            if model_name == 'DecisionTreeRegressor':
                params = {'max_depth': [5, 10, 15]}
            elif model_name == 'LinearRegression':
                params = {'fit_intercept': [True, False]}
            elif model_name == 'RandomForestRegressor':
                params = {'n_estimators': [50, 100, 150], 'max_depth': [5, 10, 15]}
            elif model_name == 'SVR':
                params = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
            elif model_name == 'XGBRegressor':
                params = {'max_depth': [3, 5, 7], 'n_estimators': [50, 100, 150]}
                
            # Perform grid search
            grid = GridSearchCV(estimator=model, param_grid=params, cv=self.cv_splits, n_jobs=-1)
            grid.fit(X_train, y_train)
            
            # Determine if this model is the best so far
            if grid.best_score_ > best_score:
                best_score = grid.best_score_
                best_model = grid.best_estimator_
                best_params = grid.best_params_
        
        best_model_name = type(best_model).__name__
        if best_model_name == 'XGBRegressor':
            model_class = xgboost.XGBRegressor
        else:
            model_class = getattr(sklearn.ensemble, best_model_name)
        selected_model = model_class(**best_params)
        
        print('\n')
        print("Selected model: {}".format(type(best_model).__name__))
        print("Hyperparameters: {}".format(best_params))
        print("Model score: {}".format(best_score))
        print('\n')

        return selected_model
            
class Model:
    def __init__(self, data, X_train, y_train, selected_model, cv_splits):
        self.data = data
        self.X_train = X_train
        self.y_train = y_train
        self.selected_model = selected_model
        self.cv_splits = cv_splits

    def train_model(self):
        std_clf = make_pipeline(StandardScaler(), self.selected_model)
        std_clf.fit(X_train_cv, y_train_cv)
        self.std_clf = std_clf

        kf = KFold(n_splits=self.cv_splits, shuffle=False)
        train_results = []
        
        for train_index, test_index in kf.split(kf.split(self.X_train)):
            X_train_cv, X_test_cv = self.X_train.iloc[train_index], self.X_train.iloc[test_index]
            y_train_cv, y_test_cv = self.y_train.iloc[train_index], self.y_train.iloc[test_index] 
            apply = self.std_clf.score(X_train_cv, y_train_cv)
            train_results.append(apply)
        
        self.train_results = train_results
        return self.std_clf, self.train_scores, kf

    def cv_model(self):
        y_pred_train = cross_val_predict(self.std_clf, self.X_train, self.y_train, cv=self.kf)
        
        r2 = r2_score(self.y_train, y_pred_train)
        mae = mean_absolute_error(self.y_train, y_pred_train)
        rmse = np.sqrt(mean_squared_error(self.y_train, y_pred_train))
        
        cv_metrics = {'Initial_Model': self.selected_model,
              "Mean R^2": r2, 
              "Mean MAE": mae,
              "Mean RMSE": rmse}
        return cv_metrics
    
    def validate_model(self, X_test, y_test):
        y_pred = self.std_clf.predict(X_test)
        
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        
        validation_metrics = {'Initial_Model': self.selected_model,
              "Mean R^2": r2, 
              "Mean MAE": mae,
              "Mean RMSE": rmse}
        return validation_metrics

    def test_model(self, X_test, y_test):
        y_pred = self.std_clf.predict(X_test)
        return y_pred
    
    def visualize_results(self, X_test, y_test, y_pred):
        fig, ax = plt.subplots(1, 1, figsize=(32, 10))
        plt.plot(y_test)
        plt.plot(y_pred)
        plt.xlabel("Date time")
        plt.ylabel("Price Elasticity of Demand")
        plt.xticks(np.arange(0, len(y_test), 20), rotation='vertical')
        plt.show()

    def initial_model_metrics(self, X_test, y_test, y_pred):
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        
        initial_metrics = {'Initial_Model': self.selected_model,
              "Mean R^2": r2, 
              "Mean MAE": mae,
              "Mean RMSE": rmse}
        return initial_metrics

---

# **Initial Exploration**

In [45]:
df = df_store[0]
df0 = df_store[0]
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [46]:
for col, dtype in zip(df.columns, df.dtypes):
    print(f"{col}: {dtype}")

Unnamed: 0: int64
key: object
fare_amount: float64
pickup_datetime: object
pickup_longitude: float64
pickup_latitude: float64
dropoff_longitude: float64
dropoff_latitude: float64
passenger_count: int64


In [47]:
df = df[['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime','passenger_count']]
frac = 0.01
df_frac = len(df)*frac
df = df.iloc[:int(df_frac),:]
df0 = df0.iloc[:int(df_frac),:]

---

In [48]:
class InspectData:
    def __init__(self, raw_data, data, expected_dtypes, processed_dtypes, inspect_cols, eng_feature_cols, tagret_col):
        self.raw_data = raw_data
        self.data = data
        self.expected_dtypes = expected_dtypes
        self.processed_dtypes = processed_dtypes
        self.inspect_cols = inspect_cols
        self.eng_feature_cols = eng_feature_cols
        self.target_col = target_col
    
    def inspection_1(self):
        # BEFORE PREPROCESSING
        
        # Test 1: Ensure data is not empty
        if self.data.empty:
            raise ValueError('Inspection_1 Part 1 FAILURE: Input data is empty')
        else:
            print('Inspection_1 Part 1 PASS: Input data is not empty')

        # Test 2: Check for missing values
        if self.data.isnull().values.any():
            return "Inspection_1 Part 2 FAILURE: Input data contains missing values"
        else:
            print('Inspection_1 Part 2 PASS: Data has no missing values')
        
        # Test 3: Check for NaN values    
        if self.data.isna().values.any():
            return "Inspection_1 Part 3 FAILURE: Input data contains NaN values"
        else:
            print('Inspection_1 Part 3 PASS: Data has no NaN values')
        
        # Test 4: Check for duplicate rows
        if self.data.duplicated().any():
            return "Inspection_1 Part 4 CAUTION: Input data contains duplicate rows"
        else:
            print("Inspection_1 Part 4 PASS: No duplicate rows found")
            
        # Test 5: Ensure data types are as expected
        mismatched_columns = []
        for col, dtype in self.expected_dtypes.items():
            if self.data[col].dtype != dtype:
                mismatched_columns.append((col, self.data[col].dtype, dtype))
        
        if mismatched_columns:
            column_info = '\n'.join([f"{col}: expected {expected}, got {actual}" for col, actual, expected in mismatched_columns])
            raise ValueError(f"Inspection_1 Part 5 FAILURE: Columns {column_info}. Unexpected data types.")
        else:
            print('Inspection_1 Part 5 PASS: All columns have expected data types')

        outlier_cols = []
        for col in self.inspect_cols:
            z_score = np.abs((self.data[col] - self.data[col].mean()) / self.data[col].std())
            if z_score.max() > 3:
                outlier_cols.append(col)
        if outlier_cols:
            print(f"Inspection_1 Part 6 CAUTION: Columns {outlier_cols} contain outliers")
        else:
            print("Inspection_1 Part 6 PASS: No columns have outliers")

    def inspection_2(self):
        
        # Test 1: Outlier analysis
        length_raw_data = len(self.raw_data)
        length_data = len(self.data)
        perc_change = length_data / length_raw_data
        
        if length_raw_data == length_data:
            return print('Inspection_2 Part 1 FAILURE: Outliers have not been removed')
        else:
            print(f'Inspection_2 Part 1 PASS: Outliers removed, resultant dataset at {perc_change*100}%.')
            
        # Test 2: Check for missing values
        if self.data.isnull().values.any():
            return "Inspection_2 Part 2 FAILURE: Input data contains missing values"
        else:
            print('Inspection_2 Part 2 PASS: Data has no missing values')
        
        # Test 3: Check for NaN values    
        if self.data.isna().values.any():
            return "Inspection_2 Part 3 FAILURE: Input data contains NaN values"
        else:
            print('Inspection_2 Part 3 PASS: Data has no NaN values')
        
        # Test 4: Check for duplicate rows
        if self.data.duplicated().any():
            return "Inspection_2 Part 4 CAUTION: Input data contains duplicate rows"
        else:
            print("Inspection_2 Part 4 PASS: No duplicate rows found")
        
        # Test 5: Check processed column dtypes are as expected
        mismatched_columns = []
        for col, dtype in self.processed_dtypes.items():
            if self.data[col].dtype != dtype:
                mismatched_columns.append((col, self.data[col].dtype, dtype))
        
        if mismatched_columns:
            column_info = '\n'.join([f"{col}: expected {expected}, got {actual}" for col, actual, expected in mismatched_columns])
            raise ValueError(f"Inspection_2 Part 2 FAILURE: Columns {column_info}. Unexpected data types.")
        else:
            print('Inspection_2 Part 5 PASS: All columns have expected data types')
        

In [49]:
class RawDataTest(unittest.TestCase):
    
    def setUp(self):
        self.raw_data = pd.read_csv('C:/Users/delst/OneDrive - Queen Mary, University of London/Desktop/VSCode/Advanced_Projects/Uber_Demand_Project/data_archive/uber.csv')
        self.data = self.raw_data[['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime','passenger_count']]
        self.expected_dtypes = {
                            'key': object,
                            'fare_amount': float,
                            'pickup_datetime': object,
                            'passenger_count': np.int64
                            }
        self.processed_dtypes = {
            'key': object,
            'fare_amount': float,
            'pickup_datetime': 'datetime64[ns]',
            'passenger_count': np.int64,
            'Label_Hour': np.int64,
            'Label_Date': object,
            'day_name': object,
            'day_of_the_week': np.int64
            }        

        self.inspect_cols = [
            'passenger_count', 
            'fare_amount'
            ]     
        
    def test_inspection_1_part_1(self):
        self.assertIsNotNone(self.data, "Input data is empty")
        
    def test_inspection_1_part_2(self):
        self.assertFalse(self.data.isnull().values.any(), "Input data contains missing values")
        
    def test_inspection_1_part_3(self):
        self.assertFalse(self.data.isna().values.any(), "Input data contains NaN values")
        
    def test_inspection_1_part_4(self):
        self.assertFalse(self.data.duplicated().any(), "Input data contains duplicate rows")
        
    def test_inspection_1_part_5(self):
        mismatched_columns = []
        for col, dtype in self.expected_dtypes.items():
            if self.data[col].dtype != dtype:
                mismatched_columns.append((col, self.data[col].dtype, dtype))
        
        self.assertEqual(len(mismatched_columns), 0, f"Columns {mismatched_columns} have unexpected data types")
        
    def test_inspection_1_part_6(self):
        outlier_cols = []
        for col in self.inspect_cols:
            z_score = np.abs((self.data[col] - self.data[col].mean()) / self.data[col].std())
            if z_score.max() > 3:
                outlier_cols.append(col)
        self.assertEqual(len(outlier_cols), 0, f"Columns {outlier_cols} have outliers")



In [50]:
class RawDataTest(unittest.TestCase):
    
    def setUp(self):
        self.raw_data = pd.read_csv('C:/Users/delst/OneDrive - Queen Mary, University of London/Desktop/VSCode/Advanced_Projects/Uber_Demand_Project/data_archive/uber.csv')
        self.data = self.raw_data[['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime','passenger_count']]
        self.data = Dataset(self.data, threshold=0.05).data_transform()
        self.data = Dataset(self.data, threshold=0.05).data_remove_outliers()
        self.expected_dtypes = {
                            'key': object,
                            'fare_amount': float,
                            'pickup_datetime': object,
                            'passenger_count': np.int64
                            }
        self.processed_dtypes = {
            'key': object,
            'fare_amount': float,
            'pickup_datetime': 'datetime64[ns]',
            'passenger_count': np.int64,
            'Label_Hour': np.int64,
            'Label_Date': object,
            'day_name': object,
            'day_of_the_week': np.int64
            }        

        self.inspect_cols = [
            'passenger_count', 
            'fare_amount'
            ]     
        
    def test_inspection_1_part_1(self):
        self.assertIsNotNone(self.data, "Input data is empty")
        
    def test_inspection_1_part_2(self):
        self.assertFalse(self.data.isnull().values.any(), "Input data contains missing values")
        
    def test_inspection_1_part_3(self):
        self.assertFalse(self.data.isna().values.any(), "Input data contains NaN values")
        
    def test_inspection_1_part_4(self):
        self.assertFalse(self.data.duplicated().any(), "Input data contains duplicate rows")
        
    def test_inspection_1_part_5(self):
        mismatched_columns = []
        for col, dtype in self.expected_dtypes.items():
            if self.data[col].dtype != dtype:
                mismatched_columns.append((col, self.data[col].dtype, dtype))
        
        self.assertEqual(len(mismatched_columns), 0, f"Columns {mismatched_columns} have unexpected data types")
        
    def test_inspection_1_part_6(self):
        outlier_cols = []
        for col in self.inspect_cols:
            z_score = np.abs((self.data[col] - self.data[col].mean()) / self.data[col].std())
            if z_score.max() > 3:
                outlier_cols.append(col)
        self.assertEqual(len(outlier_cols), 0, f"Columns {outlier_cols} have outliers")


In [51]:
raw_data = df0
data = df

expected_dtypes = {
    'key': object,
    'fare_amount': float,
    'pickup_datetime': object,
    'passenger_count': np.int64
    }

processed_dtypes = {
    'key': object,
    'fare_amount': float,
    'pickup_datetime': 'datetime64[ns]',
    'passenger_count': np.int64,
    'Label_Hour': np.int64,
    'Label_Date': object,
    'day_name': object,
    'day_of_the_week': np.int64
    }

inspect_cols = [
    'passenger_count', 
    'fare_amount'
    ]

col_group = ['pickup_datetime', 'fare_amount', 'passenger_count','Label_Date']   # Group by columns
col_feature = ['fare_amount', 'passenger_count']   # Feature extraction columns
input_columns = ['sPED']
target_col = 'sPED'

eng_feature_cols = ['fare_amount_pct_change', 'passenger_count_pct_change', 
                    'passenger_count_pct_change_sma', 'fare_amount_pct_change_sma'
                    ]

n_window = 3
threshold = 0.05

In [52]:
def pipeline(raw_data, expected_dtypes, inspect_cols):

    print('Initialising Pipeline...')

    print('\nInspecting Raw data...')
    test_inspector = RawDataTest()
    test_inspector.setUp()
    suite = unittest.TestLoader().loadTestsFromModule(test_inspector)
    unittest.TextTestRunner().run(suite)

    print('\nProcessing data...')
    dataset = Dataset(raw_data, threshold=0.05)
    data = dataset.data_transform()
    data = dataset.data_remove_outliers()
    
    print('\nInspecting Processed data...')
    test_inspector = ProcessedDataTest()
    test_inspector.setUp()
    suite = unittest.TestLoader().loadTestsFromModule(test_inspector)
    unittest.TextTestRunner().run(suite)

    return suite

data_output = pipeline(data, expected_dtypes, inspect_cols)
data_output

Initialising Pipeline...

Inspecting Raw data...


....FF
FAIL: test_inspection_1_part_5 (__main__.RawDataTest)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "C:\Users\delst\AppData\Local\Temp/ipykernel_22408/2801617373.py", line 48, in test_inspection_1_part_5
    self.assertEqual(len(mismatched_columns), 0, f"Columns {mismatched_columns} have unexpected data types")
AssertionError: 1 != 0 : Columns [('pickup_datetime', dtype('<M8[ns]'), <class 'object'>)] have unexpected data types

FAIL: test_inspection_1_part_6 (__main__.RawDataTest)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "C:\Users\delst\AppData\Local\Temp/ipykernel_22408/2801617373.py", line 56, in test_inspection_1_part_6
    self.assertEqual(len(outlier_cols), 0, f"Columns {outlier_cols} have outliers")
AssertionError: 2 != 0 : Columns ['passenger_count', 'fare_amount'] have outliers

--------------------------------------------------------


Processing data...

Inspecting Processed data...


NameError: name 'ProcessedDataTest' is not defined