In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("uber_rides.csv")

In [3]:
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [4]:
df.shape

(200000, 8)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ride_id            200000 non-null  int64  
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 12.2+ MB


In [6]:
df['dropoff_longitude'].isnull().sum()


1

In [7]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   ride_id            200000 non-null  int64              
 1   fare_amount        200000 non-null  float64            
 2   pickup_datetime    200000 non-null  datetime64[ns, UTC]
 3   pickup_longitude   200000 non-null  float64            
 4   pickup_latitude    200000 non-null  float64            
 5   dropoff_longitude  199999 non-null  float64            
 6   dropoff_latitude   199999 non-null  float64            
 7   passenger_count    200000 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(5), int64(2)
memory usage: 12.2 MB


In [11]:
df = df.dropna()

In [13]:
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [14]:
df.describe()

Unnamed: 0,ride_id,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,199999.0,199999.0,199999.0,199999.0,199999.0,199999.0,199999.0
mean,27712480.0,11.359892,-72.527631,39.935881,-72.525292,39.92389,1.684543
std,16013860.0,9.90176,11.437815,7.720558,13.117408,6.794829,1.385995
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,13825340.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,27745240.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,41555350.0,12.5,-73.967154,40.767158,-73.963659,40.768001,2.0
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [15]:
import math
def haversine_dist(row):
    R = 6371  # Radius of the Earth in kilometers
    lat1, lon1 = math.radians(row['pickup_latitude']), math.radians(
        row['pickup_longitude'])
    lat2, lon2 = math.radians(row['dropoff_latitude']), math.radians(
        row['dropoff_longitude'])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat/2)**2 + math.cos(lat1) * \
        math.cos(lat2) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    distance = R * c
    return distance


In [17]:
df['haversine_dist'] = df.apply(haversine_dist, axis=1)


In [18]:
# calculating the median distance
median_distance = np.median(df['haversine_dist'])
print("Median Haversine Distance:", median_distance, "kilometers")


Median Haversine Distance: 2.1209923961833708 kilometers


In [19]:
df.describe()

Unnamed: 0,ride_id,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,haversine_dist
count,199999.0,199999.0,199999.0,199999.0,199999.0,199999.0,199999.0,199999.0
mean,27712480.0,11.359892,-72.527631,39.935881,-72.525292,39.92389,1.684543,20.85535
std,16013860.0,9.90176,11.437815,7.720558,13.117408,6.794829,1.385995,382.964642
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0,0.0
25%,13825340.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0,1.215222
50%,27745240.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0,2.120992
75%,41555350.0,12.5,-73.967154,40.767158,-73.963659,40.768001,2.0,3.875169
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0,16409.239135


In [20]:
# counting the numbers of rides with a Haversine distance as zeroes
zero_distance_ride = df[df['haversine_dist'] == 0]
print(len(zero_distance_ride))

5632


In [21]:
# Calculate the mean 'fare_amount' for rides with 0.0 Haversine Distance
mean_fare_for_zero_distance_rides = zero_distance_ride['fare_amount'].mean()

print("Mean 'fare_amount' for rides with 0.0 Haversine Distance:",
      mean_fare_for_zero_distance_rides)


Mean 'fare_amount' for rides with 0.0 Haversine Distance: 11.585317826704546


In [26]:
import math

# Define the Haversine distance function


def calculate_haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in kilometers
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat/2)**2 + math.cos(lat1) * \
        math.cos(lat2) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    distance = R * c
    return distance


# Find the ride with the highest 'fare_amount'
costliest_ride = df[df['fare_amount'] == df['fare_amount'].max()]

# Calculate the Haversine distance for the costliest ride
haversine_dist_costliest_ride = calculate_haversine_distance(costliest_ride.iloc[0]['pickup_latitude'],
                                                             costliest_ride.iloc[0]['pickup_longitude'],
                                                             costliest_ride.iloc[0]['dropoff_latitude'],
                                                             costliest_ride.iloc[0]['dropoff_longitude'])

print("Haversine Distance for the Costliest Ride:",
      haversine_dist_costliest_ride, "per kilometers")


Haversine Distance for the Costliest Ride: 0.0007899213191009994 kilometers


In [27]:
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,haversine_dist
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,1.683323
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2.45759
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,5.036377
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,1.661683
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,4.47545


In [28]:
# Creating a  separate columns for year, month, and week
df['year'] = df['pickup_datetime'].dt.year
df['month'] = df['pickup_datetime'].dt.month
df['week'] = df['pickup_datetime'].dt.week

  df['week'] = df['pickup_datetime'].dt.week


In [29]:
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,haversine_dist,year,month,week
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,1.683323,2015,5,19
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2.45759,2009,7,29
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,5.036377,2009,8,35
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,1.661683,2009,6,26
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,4.47545,2014,8,35


In [30]:
ride_2014 = df[df['year'] == 2014]
print(len(ride_2014))

29968


In [32]:
ride_q1_2014 = df[(df['year'] == 2014) & (df['month'] >= 1 ) & (df['month'] <= 3)]
print(len(ride_q1_2014))

7687


In [33]:
ride_sept_2010 = df[(df['year'] == 2010) & (df['month'] == 9)]
ride_sept_2010['day_of_week'] = ride_sept_2010['pickup_datetime'].dt.day_name()
rides_by_day = ride_sept_2010['day_of_week'].value_counts()
# Find the day with the maximum recorded rides
max_rides_day = rides_by_day.idxmax()
max_rides_count = rides_by_day.max()

print("On which day of the week in September 2010, maximum rides were recorded?")
print("Day:", max_rides_day)
print("Number of rides:", max_rides_count)


On which day of the week in September 2010, maximum rides were recorded?
Day: Thursday
Number of rides: 457


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ride_sept_2010['day_of_week'] = ride_sept_2010['pickup_datetime'].dt.day_name()


In [41]:
# importing more libraries

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [36]:
df.columns

Index(['ride_id', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'haversine_dist', 'year', 'month', 'week'],
      dtype='object')

In [37]:
# splitting the data into X and y
X = df[['passenger_count', 'haversine_dist', 'week']]
y = df['fare_amount']


In [38]:
X.head()

Unnamed: 0,passenger_count,haversine_dist,week
0,1,1.683323,19
1,1,2.45759,29
2,1,5.036377,35
3,3,1.661683,26
4,5,4.47545,35


In [39]:
y.head()

0     7.5
1     7.7
2    12.9
3     5.3
4    16.0
Name: fare_amount, dtype: float64

In [40]:
# splitting the data for test and train in (30-70)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)


In [42]:
# building the model 
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'KNN Regression': KNeighborsRegressor()
}


In [43]:
adjusted_r2_scores = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate R-squared
    r2 = r2_score(y_test, y_pred)

    # Calculate adjusted R-squared
    n = X_test.shape[0]  # Number of samples
    p = X_test.shape[1]  # Number of predictors
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

    adjusted_r2_scores[model_name] = adjusted_r2


In [44]:
# Finding the model with the least adjusted R-squared
worst_model = min(adjusted_r2_scores, key=adjusted_r2_scores.get)

print("Model with the least adjusted R-squared:", worst_model)


Model with the least adjusted R-squared: Linear Regression
