In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics
from math import *
from geopy import Point , distance
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('uber.csv')

In [3]:
df.describe()

Unnamed: 0.1,Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,27712500.0,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,16013820.0,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,13825350.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,27745500.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,41555300.0,12.5,-73.967154,40.767158,-73.963658,40.768001,2.0
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [4]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
5,44470845,2011-02-12 02:27:09.0000006,4.9,2011-02-12 02:27:09 UTC,-73.969019,40.75591,-73.969019,40.75591,1
6,48725865,2014-10-12 07:04:00.0000002,24.5,2014-10-12 07:04:00 UTC,-73.961447,40.693965,-73.871195,40.774297,5
7,44195482,2012-12-11 13:52:00.00000029,2.5,2012-12-11 13:52:00 UTC,0.0,0.0,0.0,0.0,1
8,15822268,2012-02-17 09:32:00.00000043,9.7,2012-02-17 09:32:00 UTC,-73.975187,40.745767,-74.00272,40.743537,1
9,50611056,2012-03-29 19:06:00.000000273,12.5,2012-03-29 19:06:00 UTC,-74.001065,40.741787,-73.96304,40.775012,1


In [5]:
df.shape

(200000, 9)

In [6]:
df.isna().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [7]:
df = df.dropna()

In [8]:
df.drop(["Unnamed: 0",'key'],axis=1,inplace=True)

In [9]:
df.shape

(199999, 7)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199999 entries, 0 to 199999
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   fare_amount        199999 non-null  float64
 1   pickup_datetime    199999 non-null  object 
 2   pickup_longitude   199999 non-null  float64
 3   pickup_latitude    199999 non-null  float64
 4   dropoff_longitude  199999 non-null  float64
 5   dropoff_latitude   199999 non-null  float64
 6   passenger_count    199999 non-null  int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 12.2+ MB


In [11]:
df['pickup_datetime']=pd.to_datetime(df['pickup_datetime'],errors='coerce')

In [12]:
def distance_covered(longitude1 , latitude1 , longitude2 , latitude2):
    distance =[]
    for pos in range(len(longitude1)):
        long1 , lat1 ,long2 ,lat2 = map(radians ,[longitude1[pos],latitude1[pos],longitude2[pos],latitude2[pos]])
        dis_long = long2-long1
        dis_lat = lat2-lat1
        a = (sin(dis_lat/2)**2) + cos(lat1) * cos(lat2) * sin(dis_long/2)**2
        c = asin(sqrt(a))*6371
        distance.append(c)
    return distance

In [13]:
df["distance_km"] = distance_covered(df['pickup_longitude'].to_numpy(),df['pickup_latitude'].to_numpy(),df['dropoff_longitude'].to_numpy(),df['dropoff_longitude'].to_numpy())

In [14]:
df = df.assign(pickup_hr = df.pickup_datetime.dt.hour,
               day = df.pickup_datetime.dt.day,
               year = df.pickup_datetime.dt.year,
               month = df.pickup_datetime.dt.month,
               dat_name = df.pickup_datetime.dt.day_name())


In [15]:
df.head(5)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance_km,pickup_hr,day,year,month,dat_name
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,6379.134266,19,7,2015,5,Thursday
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,6378.304171,20,17,2009,7,Friday
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,6377.214662,21,24,2009,8,Monday
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,6380.15141,8,26,2009,6,Friday
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,6377.983743,17,28,2014,8,Thursday


In [16]:
df['passenger_count'].value_counts()

1      138425
2       29428
5       14009
3        8881
4        4276
6        4271
0         708
208         1
Name: passenger_count, dtype: int64