In [2]:
# BOOTSTRAP AGGREGATION : its a bagging technique that uses stumps just like ranfom forest. 

# -> baggingregressor() - used for regression
# -> baggingclassifier() - used for classification

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import BaggingClassifier # for classification
from sklearn.ensemble import BaggingRegressor # for regression

from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from geopy.distance import distance

import dill     #to save the session into a database so that we don't have to re-execute the file again after closing it. 

In [4]:
# dataset
df = pd.read_csv("uber (1).csv")
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [5]:
df.isnull().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


In [7]:
df[df.isnull().any(axis = 1)]   #extracting the rows that has null values

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
87946,32736015,2013-07-02 03:51:57.0000001,24.1,2013-07-02 03:51:57 UTC,-73.950581,40.779692,,,0


In [8]:
df.dropna(inplace = True) #dropping rows with null values

In [9]:
df.pickup_datetime = pd.to_datetime(df.pickup_datetime) #converting object type to datetime type
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199999 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   Unnamed: 0         199999 non-null  int64              
 1   key                199999 non-null  object             
 2   fare_amount        199999 non-null  float64            
 3   pickup_datetime    199999 non-null  datetime64[ns, UTC]
 4   pickup_longitude   199999 non-null  float64            
 5   pickup_latitude    199999 non-null  float64            
 6   dropoff_longitude  199999 non-null  float64            
 7   dropoff_latitude   199999 non-null  float64            
 8   passenger_count    199999 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(5), int64(2), object(1)
memory usage: 15.3+ MB


In [10]:
# range of latitude : [-90, +90]
# range of longitude : [-180, +180]

# removing all the rows where the values are out of range
df.drop(index = df[(df.pickup_latitude > 90) | (df.pickup_latitude < -90) | (df.pickup_longitude > 180) | (df.pickup_longitude < -180) | (df.dropoff_latitude > 90) | (df.dropoff_latitude < -90) | (df.dropoff_longitude > 180) | (df.pickup_longitude < -180)].index, inplace=True)


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199988 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   Unnamed: 0         199988 non-null  int64              
 1   key                199988 non-null  object             
 2   fare_amount        199988 non-null  float64            
 3   pickup_datetime    199988 non-null  datetime64[ns, UTC]
 4   pickup_longitude   199988 non-null  float64            
 5   pickup_latitude    199988 non-null  float64            
 6   dropoff_longitude  199988 non-null  float64            
 7   dropoff_latitude   199988 non-null  float64            
 8   passenger_count    199988 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(5), int64(2), object(1)
memory usage: 15.3+ MB


In [36]:
df['distance'] = [(distance(
    (df.pickup_latitude[i],df.pickup_longitude[i]),
 (df.dropoff_latitude[i],df.dropoff_longitude[i]))).km for i in df.index]

In [37]:
df.drop(columns = ['Unnamed: 0', 'key', 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude' , 'dropoff_longitude'], inplace = True)
df.reset_index()
df.head(3)

Unnamed: 0,fare_amount,pickup_datetime,passenger_count,distance
0,7.5,2015-05-07 19:52:06+00:00,1,1.681111
1,7.7,2009-07-17 20:04:56+00:00,1,2.454363
2,12.9,2009-08-24 21:45:00+00:00,1,5.039603


In [39]:
df['year'] = df.pickup_datetime.dt.year
df['month'] = df.pickup_datetime.dt.month
df['weekday'] = df.pickup_datetime.dt.weekday
df['hour'] = df.pickup_datetime.dt.hour

df.head(3)

Unnamed: 0,fare_amount,pickup_datetime,passenger_count,distance,year,month,weekday,hour
0,7.5,2015-05-07 19:52:06+00:00,1,1.681111,2015,5,3,19
1,7.7,2009-07-17 20:04:56+00:00,1,2.454363,2009,7,4,20
2,12.9,2009-08-24 21:45:00+00:00,1,5.039603,2009,8,0,21


In [60]:
df['monthly_quarter'] = pd.cut(df.month, bins = [1, 3, 6, 9, 12], labels=['q1', 'q2', 'q3', 'q4'])
df['hour_segment'] = pd.cut(df.hour, bins = [0, 3, 7, 11, 15, 19, 23], labels = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

df.drop(['pickup_datetime'], axis = 1, inplace = True)

df.head(3)

Unnamed: 0,fare_amount,passenger_count,distance,year,month,weekday,hour,monthly_quarter,hour_segment
0,7.5,1,1.681111,2015,5,3,19,q2,h5
1,7.7,1,2.454363,2009,7,4,20,q3,h6
2,12.9,1,5.039603,2009,8,0,21,q3,h6


In [61]:
# encoding

columns = ['monthly_quarter', 'hour_segment']

encoder = LabelEncoder()
for i in columns:
    df[i] = encoder.fit_transform(df[i])

In [62]:
x = df.drop(columns = ['fare_amount'])
y = df['fare_amount']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

### using BaggingRegressor

In [64]:
base_estimator = RandomForestRegressor()

bagging = BaggingRegressor(
    base_estimator=base_estimator, 
    n_estimators=10,
    max_samples=1.0,
    max_features=1.0,
    bootstrap=True,
    verbose=0
)

In [65]:
bagging.fit(x_train, y_train)



In [67]:
y_pred = bagging.predict(x_test)

In [70]:
mean_squared_error(y_test, y_pred)

25.89667434150993

In [73]:
r2_score(y_test, y_pred)

0.7382668161402948