In [1]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from statsmodels.tsa.arima.model import ARIMA
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed
from sklearn.pipeline import _name_estimators
import numpy as np
import pandas as pd

In [2]:
# Loading the feature-engineered data
df = pd.read_csv('feature_engineered_data.csv')

In [3]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,driverId,tripId,startLocation,tripDistance,tripSpeed,tripDuration,endLocation,startTime,tripFare,...,hour_of_day,day_of_month,month,year,is_peak_hour,is_business_day,15min_interval,30min_interval,60min_interval,1day_interval
0,0,1,0,"[23.075897, 72.483299]",11.8,24.97,28,"[23.075855, 72.598551]",2023-01-01 18:53:00,124,...,18,1,1,2023,1,0,2023-01-01 18:45:00,2023-01-01 18:30:00,2023-01-01 18:00:00,2023-01-01
1,1,1,1,"[23.002, 72.550009]",8.5,15.34,33,"[23.072137, 72.583952]",2023-01-01 20:39:00,91,...,20,1,1,2023,0,0,2023-01-01 20:30:00,2023-01-01 20:30:00,2023-01-01 20:00:00,2023-01-01
2,2,1,2,"[22.929988, 72.578228]",13.4,27.74,29,"[23.050404, 72.578228]",2023-01-01 21:24:00,140,...,21,1,1,2023,0,0,2023-01-01 21:15:00,2023-01-01 21:00:00,2023-01-01 21:00:00,2023-01-01
3,3,1,3,"[23.029391, 72.555201]",6.3,14.47,26,"[23.061703, 72.605368]",2023-01-01 21:58:00,69,...,21,1,1,2023,0,0,2023-01-01 21:45:00,2023-01-01 21:30:00,2023-01-01 21:00:00,2023-01-01
4,4,1,4,"[23.067932, 72.51617]",5.7,13.42,25,"[23.103956, 72.55534]",2023-01-01 22:47:00,63,...,22,1,1,2023,0,0,2023-01-01 22:45:00,2023-01-01 22:30:00,2023-01-01 22:00:00,2023-01-01
5,5,1,5,"[23.050932, 72.493464]",7.0,23.06,18,"[23.108462, 72.521314]",2023-01-02 18:17:00,76,...,18,2,1,2023,1,1,2023-01-02 18:15:00,2023-01-02 18:00:00,2023-01-02 18:00:00,2023-01-02
6,6,1,6,"[22.977277, 72.530901]",7.3,19.47,22,"[23.023437, 72.581065]",2023-01-02 22:59:00,79,...,22,2,1,2023,0,1,2023-01-02 22:45:00,2023-01-02 22:30:00,2023-01-02 22:00:00,2023-01-02
7,7,1,7,"[22.980471, 72.602484]",5.2,25.28,12,"[23.003947, 72.64667]",2023-01-03 19:17:00,58,...,19,3,1,2023,0,1,2023-01-03 19:15:00,2023-01-03 19:00:00,2023-01-03 19:00:00,2023-01-03
8,8,1,8,"[23.050361, 72.532239]",8.9,22.33,24,"[23.106707, 72.593512]",2023-01-03 20:23:00,95,...,20,3,1,2023,0,1,2023-01-03 20:15:00,2023-01-03 20:00:00,2023-01-03 20:00:00,2023-01-03
9,9,1,9,"[22.974242, 72.587366]",3.5,22.68,9,"[22.989934, 72.616895]",2023-01-03 20:45:00,41,...,20,3,1,2023,0,1,2023-01-03 20:45:00,2023-01-03 20:30:00,2023-01-03 20:00:00,2023-01-03


In [None]:
# Split the DataFrame into chunks
chunk_size = 1000
chunks = [df[i:i + chunk_size] for i in range(0, df.shape[0], chunk_size)]

# Initialize an empty DataFrame to store the results
result_df = pd.DataFrame()

# Create and configure the OneHotEncoder
encoder = OneHotEncoder(sparse=True, drop='first')

# Process each chunk separately and concatenate the results
for chunk in chunks:
    # Identify columns with string values
    string_columns = chunk.select_dtypes(include=['object']).columns
    
    # Perform one-hot encoding only on columns with string values
    encoded_chunk = pd.get_dummies(chunk, columns=string_columns, sparse=True, drop_first=True)

    # Concatenate the encoded chunk to the result DataFrame
    result_df = pd.concat([result_df, encoded_chunk], axis=0, ignore_index=True)

# Display the resulting DataFrame
print(result_df)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44718 entries, 0 to 44717
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       44718 non-null  int64  
 1   driverId         44718 non-null  int64  
 2   tripId           44718 non-null  int64  
 3   startLocation    44718 non-null  object 
 4   tripDistance     44718 non-null  float64
 5   tripSpeed        44718 non-null  float64
 6   tripDuration     44718 non-null  int64  
 7   endLocation      44718 non-null  object 
 8   startTime        44718 non-null  object 
 9   tripFare         44718 non-null  int64  
 10  paymentType      44718 non-null  object 
 11  endTime          44718 non-null  object 
 12  latitude         44718 non-null  float64
 13  longitude        44718 non-null  float64
 14  latitude_bin     44718 non-null  object 
 15  longitude_bin    44718 non-null  object 
 16  time_interval    44718 non-null  object 
 17  day_of_week 

In [5]:
# Drop 'Unnamed: 0' from the DataFrame
df = df.drop(columns=['Unnamed: 0'])

In [17]:
# Separate categorical and numerical features
location_strings = ['startLocation', 'endLocation']
categorical_features = ['15min_interval', '30min_interval', '60min_interval', '1day_interval']
numerical_features = [col for col in df.columns 
                      if col not in categorical_features 
                      and col not in location_strings 
                      and col != 'tripFare']


In [18]:
print(numerical_features)

['driverId', 'tripId', 'tripDistance', 'tripSpeed', 'tripDuration', 'startTime', 'paymentType', 'endTime', 'latitude', 'longitude', 'latitude_bin', 'longitude_bin', 'time_interval', 'day_of_week', 'hour_of_day', 'day_of_month', 'month', 'year', 'is_peak_hour', 'is_business_day']


In [19]:
class LocationConverter(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for column in self.columns:
            if column in X.columns:
                X[column + '_latitude'] = X[column].apply(lambda x: float(x[0]) if isinstance(x, list) else None)
                X[column + '_longitude'] = X[column].apply(lambda x: float(x[1]) if isinstance(x, list) else None)
                X = X.drop(columns=[column])
        return X

In [20]:
# # Create location_converter
# location_converter = LocationConverter(columns=['startLocation', 'endLocation'])


In [21]:
# df = df.drop(columns=['startLocation', 'endLocation'])

In [22]:
# Create transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),])
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', OneHotEncoder(drop='first'))])



In [23]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
#         ('convert', LocationConverter(columns=location_strings), location_strings)
    ],
    remainder='passthrough'
)

In [24]:
print(preprocessor)

ColumnTransformer(remainder='passthrough',
                  transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['driverId', 'tripId', 'tripDistance',
                                  'tripSpeed', 'tripDuration', 'startTime',
                                  'paymentType', 'endTime', 'latitude',
                                  'longitude', 'latitude_bin', 'longitude_bin',
                                  'time_interval', 'day_of_week', 'hour_of_day',
                                  'day_of_month', 'month', 'year',
                                  'is_peak_hour', 'is_business_day']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(

In [25]:
df.columns

Index(['driverId', 'tripId', 'startLocation', 'tripDistance', 'tripSpeed',
       'tripDuration', 'endLocation', 'startTime', 'tripFare', 'paymentType',
       'endTime', 'latitude', 'longitude', 'latitude_bin', 'longitude_bin',
       'time_interval', 'day_of_week', 'hour_of_day', 'day_of_month', 'month',
       'year', 'is_peak_hour', 'is_business_day', '15min_interval',
       '30min_interval', '60min_interval', '1day_interval'],
      dtype='object')

In [26]:
X = df.drop(columns=['tripFare', 'startLocation', 'endLocation'])  # Exclude location columns
y = df['tripFare']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [27]:
# Apply preprocessing to training and testing sets
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: '2023-02-20 20:17:00'

# Now Let's build XGBoost, DNN and ARIMA models, although it is not necessary I will be following OOPS concepts

In [None]:

# # A custom transformer to extract features from lists
# class ListFeatureExtractor(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X):
#         # Assuming X is a DataFrame with 'latitude_bin' and 'longitude_bin' columns
#         X_copy = X.copy()
        
#         # Extract features from lists
#         X_copy['latitude'] = X_copy['latitude_bin'].apply(lambda x: eval(x)[0] if isinstance(x, str) else x)
#         X_copy['longitude'] = X_copy['longitude_bin'].apply(lambda x: eval(x)[1] if isinstance(x, str) else x)
        
#         # Drop the original 'latitude_bin' and 'longitude_bin' columns
#         X_copy = X_copy.drop(columns=['latitude_bin', 'longitude_bin'])
        
#         return X_copy[['latitude', 'longitude']]

# class DemandForecastingModel:
#     def __init__(self, df):
#         self.df = df
#         self.features = [
#             'day_of_week', 'hour_of_day', 'day_of_month', 'month', 'year',
#             'latitude_bin', 'longitude_bin',
#             'is_peak_hour', 'is_business_day',
#             '15min_interval', '30min_interval', '60min_interval', '1day_interval'
#         ]

#     def prepare_data(self):
#         # Drop 'Unnamed: 0' from the DataFrame
#         self.df = self.df.drop(columns=['Unnamed: 0'])

#         # Separate categorical and numerical features
#         categorical_features = ['15min_interval', '30min_interval', '60min_interval', '1day_interval']
#         numerical_features = [col for col in self.df.columns if col not in categorical_features + ['tripFare']]

#         # Create transformers for numerical and categorical features
#         numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),])

#         categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), 
#                                                   ('onehot', OneHotEncoder(drop='first'))])

#         # Create a transformer for extracting features from lists
#         list_feature_extractor = ListFeatureExtractor()

#         # Create numeric_converter
#         numeric_converter = StringToNumericConverter(columns=['starLocation'])

#         # Use ColumnTransformer to apply transformers to specific columns
#         preprocessor = ColumnTransformer(
#             transformers=[
#                 ('num', numerical_transformer, numerical_features),
#                 ('cat', categorical_transformer, categorical_features),
#                 ('convert', numeric_converter, ['startLocation'])  # Add this line
#             ],
#             remainder='passthrough'  # Include remaining columns as-is
#         )

#         # Split the data and apply preprocessing
#         X = self.df.drop(columns=['tripFare'])
#         y = self.df['tripFare']
#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#         # Apply preprocessing to training and testing sets
#         X_train = preprocessor.fit_transform(X_train)
#         X_test = preprocessor.transform(X_test)

#         return X_train, X_test, y_train, y_test



#     def train_xgboost(self, X_train, X_test, y_train, y_test):
#         model = XGBRegressor(random_state=42)
#         model.fit(X_train, y_train)
#         predictions = model.predict(X_test)
#         rmse = mean_squared_error(y_test, predictions, squared=False)
#         print(f'XGBoost RMSE: {rmse}')

#     def train_arima(self, y_train, y_test):
#         model = ARIMA(y_train, order=(5, 1, 0))
#         fit = model.fit()
#         predictions = fit.forecast(steps=len(y_test))
#         rmse = mean_squared_error(y_test, predictions, squared=False)
#         print(f'ARIMA RMSE: {rmse}')

#     def train_dnn(self, X_train, X_test, y_train, y_test):
#         scaler = StandardScaler()
#         X_train_scaled = scaler.fit_transform(X_train)
#         X_test_scaled = scaler.transform(X_test)
        
#         model = MLPRegressor(random_state=42, max_iter=1000)
#         model.fit(X_train_scaled, y_train)
#         predictions = model.predict(X_test_scaled)
#         rmse = mean_squared_error(y_test, predictions, squared=False)
#         print(f'DNN RMSE: {rmse}')

# if __name__ == "__main__":
#     model_instance = DemandForecastingModel(df)
#     X_train, X_test, y_train, y_test = model_instance.prepare_data()

#     model_instance.train_xgboost(X_train, X_test, y_train, y_test)
#     model_instance.train_arima(y_train, y_test)
#     model_instance.train_dnn(X_train, X_test, y_train, y_test)
