In [22]:
import pandas as pd 
import numpy as np

In [23]:
df = pd.read_csv("train.csv")

In [24]:
df.head(5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [25]:
df.shape

(1458644, 11)

In [26]:
df.isnull().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [27]:
df.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


## Data Cleaning and Filtering 

In [29]:
# dropping the Columns
df = df.drop(columns = ['id','store_and_fwd_flag'])

In [30]:
df.shape

(1458644, 9)

In [31]:
# Removing outliers in the Trip_duration data

lower, upper = df['trip_duration'].quantile([0.01,0.99])
df = df[(df['trip_duration']>=lower) & (df['trip_duration'] <= upper)]

In [32]:
df.shape

(1429538, 9)

## Feature Engineering

In [33]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

df['pickup_hour'] = df['pickup_datetime'].dt.hour
df['pickup_day'] = df['pickup_datetime'].dt.day
df['pickup_month'] = df['pickup_datetime'].dt.month
df['pickup_weekday'] = df['pickup_datetime'].dt.weekday

In [34]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1,lat1,lon2,lat2):
    
    lon1, lat1, lon2, lat1 = map(radians, [lon1,lat1,lon2,lat2])
    
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a  = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2*asin(sqrt(a))
    r = 6371       # Here i have used the mean radious of the earth
    return c*r


Note: Haversine distance calculates the great-circle distance between two points based on their latitude and longitude.

In [None]:
# df["distance_km"]= df.apply(lambda row: haversine(
#     row['pickup_longitude'],row['pickup_latitude'],
#     row['dropoff_longitude'],row['dropoff_latitude']
# ),axis=1)

In [36]:
df.sample(2)

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,pickup_hour,pickup_day,pickup_month,pickup_weekday,distance_km
942139,1,2016-06-29 11:41:37,2016-06-29 11:51:52,1,-73.983566,40.746513,-73.975098,40.763535,615,11,29,6,2,14990.738732
1158437,2,2016-05-25 18:39:29,2016-05-25 18:49:17,1,-73.964897,40.775295,-73.95137,40.782768,588,18,25,5,2,15111.137849


In [37]:
df_model = df[[
    'vendor_id', 'passenger_count',
    'pickup_hour', 'pickup_day', 'pickup_month', 'pickup_weekday',
    'distance_km',
    'trip_duration'
]]

In [38]:
df_model

Unnamed: 0,vendor_id,passenger_count,pickup_hour,pickup_day,pickup_month,pickup_weekday,distance_km,trip_duration
0,2,1,17,14,3,0,15003.681048,455
1,1,1,0,12,6,6,14788.027394,663
2,2,1,11,19,1,1,14656.165799,2124
3,2,1,19,6,4,2,14635.081083,429
4,2,1,13,26,3,5,15109.585890,435
...,...,...,...,...,...,...,...,...
1458639,2,4,13,8,4,4,14844.478168,778
1458640,1,1,7,10,1,6,15197.388921,655
1458641,2,1,6,22,4,4,14639.162439,764
1458642,1,1,15,5,1,1,14950.502194,373


## Encodin and Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

df_model['vendor_id'] = df_model['vendor_id'].astype("category")
scaler= StandardScaler()
df_model[['distance_km','passanger_count']] = scaler.fit_transform(df_model[['distance_km','passenger_count']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model['vendor_id'] = df_model['vendor_id'].astype("category").cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model[['distance_km','passanger_count']] = scaler.fit_transform(df_model[['distance_km','passenger_count']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model[['distanc

## Train test split

In [41]:
from sklearn.model_selection import train_test_split
x = df_model.drop(columns=['trip_duration'])
y = df_model['trip_duration']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=23)

In [43]:
! pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/150.0 MB 524.3 kB/s eta 0:04:46
   ---------------------------------------- 0.8/150.0 MB 729.2 kB/s eta 0:03:25
   ---------------------------------------- 1.0/150.0 MB 915.5 kB/s eta 0:02:43
   ---------------------------------------- 1.6/150.0 MB 1.2 MB/s eta 0:02:03
    --------------------------------------- 2.1/150.0 MB 1.5 MB/s eta 0:01:39
    --------------------------------------- 2.6/150.0 MB 1.5 MB/s eta 0:01:39
    ------------


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\USER\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [45]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score

model= XGBRegressor(n_estimators=100, max_depth=6)
model.fit(x_train,y_train)

y_pred= model.predict(x_test)
rmsle = mean_squared_log_error(y_test,y_pred)
print(f"rmsle is {rmsle: .4f}")

rmsle is  0.4538


In [46]:
r2_score(y_test,y_pred)

0.1885969042778015

In [48]:
from sklearn.tree import DecisionTreeRegressor

In [49]:
dt = DecisionTreeRegressor()
dt.fit(x_train,y_train)
ypdt = dt.predict(x_test)
print("r2_score", r2_score(y_test,y_pred))

r2_score 0.1885969042778015


In [50]:
r2_score(y_test,ypdt)

-0.6737403250082756

In [51]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train,y_train)
lrpt = lr.predict(x_test)
print(r2_score(y_test,lrpt))

0.0325886366121525


In [52]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(x_train,y_train)
rfpt= rf.predict(x_test)
r2_score(y_test,rfpt)

KeyboardInterrupt: 