In [2]:
import numpy as np
import pandas as pd

In [10]:
df = pd.read_excel('/content/uber_rides_data.xlsx')

In [11]:
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


# What is the shape of given dataset?


In [12]:
df.shape

(200000, 8)

# How many integer columns(by default) are given in the dataset?


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ride_id            200000 non-null  int64  
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 12.2+ MB


#How many missing values exists in 'dropoff_longitude' column?

In [14]:
df.isnull().sum()

Unnamed: 0,0
ride_id,0
fare_amount,0
pickup_datetime,0
pickup_longitude,0
pickup_latitude,0
dropoff_longitude,1
dropoff_latitude,1
passenger_count,0


# Remove the null values from the dataframe to answer the following question.


In [16]:
df.dropna(inplace=True)
df.isnull().sum()

Unnamed: 0,0
ride_id,0
fare_amount,0
pickup_datetime,0
pickup_longitude,0
pickup_latitude,0
dropoff_longitude,0
dropoff_latitude,0
passenger_count,0


#What is the average fare amount?


In [18]:
print(f'average fare amount:{df.fare_amount.mean()}')


average fare amount:11.359891549457748


# Calculate distance between each pickup and dropoff points using Haversine formula. What is the median haversine distance between pickup and dropoff location according to the given dataset?

In [20]:

def haversine(lat1, lon1, lat2, lon2):
  # convert decimal degrees to radians
  lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

  # haversine formula
  dlon = lon2 - lon1
  dlat = lat2 - lat1
  a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
  c = 2 * np.arcsin(np.sqrt(a))
  r = 6371 # Radius of earth in kilometers. Use 3956 for miles
  return c * r

df['haversine_distance'] = df.apply(lambda row: haversine(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']), axis=1)

print(f"Median Haversine Distance: {df['haversine_distance'].median().round(2)}")


Median Haversine Distance: 2.12


# What is the maximum haversine distance between pickup and dropoff location according to the given dataset?

In [21]:
print(f"Maximum Haversine Distance: {df['haversine_distance'].max().round(2)}")


Maximum Haversine Distance: 16409.24


# How many rides have 0.0 haversine distance between pickup and dropoff location according to the given dataset?

In [22]:
print(f"Number of rides with 0.0 Haversine distance: {(df['haversine_distance'] == 0.0).sum()}")

Number of rides with 0.0 Haversine distance: 5632


# What is the mean 'fare_amount' for rides with 0 haversine distance?Do you sense something fishy? Try to analyze, and give your expert opinion in Jupyter Notebook.

In [None]:
mean_zero_distance = df[df['haversine_distance'] == 0]['fare_amount'].mean()
print(f"Mean 'fare_amount' for rides with 0 haversine distance: {mean_zero_distance}")

# Here 0 haversine distance also have rides amount. Ride with a distance of 0 would likely mean that the passenger either didn't complete the ride or cancelled it before moving. This is unusal and might be occurece because of following reasons:
    # -- Error while data entry
    # -- Incomplete data (ride might be missing or incorrectly logged)
    # -- might be fraud activities on rides
    # -- Might be system Glitch


# What is the maximum 'fare_amount' for a ride?

In [23]:
print(f"Maximum 'fare_amount' for a ride: {df['fare_amount'].max()}")

Maximum 'fare_amount' for a ride: 499.0


#What is the haversine distance between pickup and dropoff location for the costliest ride?

In [27]:
#costliest_ride = df[df['fare_amount'] == df['fare_amount'].max()]
#haversine_distance_costliest_ride = costliest_ride['haversine_distance'].values[0]
print(f"Haversine distance for the costliest ride: {df[df['fare_amount'] == df['fare_amount'].max()]['haversine_distance'].values[0].round(5)}")

# The haversine distance for the costliest ride is relatively higher compared to the haversine distance of other rides,
#this might indicate a ride is overcharged which is unusual and we must check the details on these rides.

# The costliest ride  should be investigated more.
# Factors that need further investigation:
# * if the fare amount is justified for the distance,
# * if there were any surcharges applied during the ride.
# * To know the cause, we should analyse more data and maybe other attributes if we have those.



Haversine distance for the costliest ride: 0.00079


#How many rides were recorded in the year 2014

In [30]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['pick_year'] = df['pickup_datetime'].dt.year
ride_2014 = df[df['pick_year'] == 2014]
print(f"rides were recorded in the year 2014: {len(ride_2014)}")

rides were recorded in the year 2014: 29968


# How many rides were recorded in the first quarter of 2014?


In [33]:
df['pick_quater'] = df['pickup_datetime'].dt.quarter
ride_quater_2014 = df[(df['pick_year'] == 2014) & (df['pick_quater'] == 1)]
print(f"rides were recorded in the first quarter of 2014: {len(ride_quater_2014)}")

rides were recorded in the first quarter of 2014: 7687


# On which day of the week in September 2010, maximum rides were recorded ?

In [42]:
df['pick_day'] = df['pickup_datetime'].dt.day_name()
rides_sep_2010 = df[(df['pick_year']== 2014) & (df['pickup_datetime'].dt.month == 9)]
print(rides_sep_2010['pick_day'].value_counts().idxmax())
# OR
print(rides_sep_2010['pick_day'].mode()[0])

Tuesday
Tuesday


#Apply a Machine Learning Algorithm to predict the fare amount given following input features:passenger_count, distance and ride_week_day.Perform a 70-30 split of data.
#Which algorithm gives the least adjusted R square value?

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd

# Assuming 'df' is defined earlier in your code
X = df[['passenger_count', 'haversine_distance', 'pick_day']]
y = df['fare_amount']
X = pd.get_dummies(X, columns=['pick_day'], drop_first=True)
# Split data into 70-30 train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Models list, with correct instantiation of KNeighborsRegressor
models = [LinearRegression(), RandomForestRegressor(), DecisionTreeRegressor(), KNeighborsRegressor()]

output = []

# Train models and compute adjusted R-squared value
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    adjusted_r2 = 1 - (1 - r2) * ((X_test.shape[0] - 1) / (X_test.shape[0] - X_test.shape[1] - 1))
    output.append({"Model": model.__class__.__name__, "R2": r2, "Adjusted_R2": adjusted_r2})

# Convert results to a DataFrame
results_df = pd.DataFrame(output)

# Find the model with the least Adjusted R-squared value
least_adj_r2_model = results_df[results_df['Adjusted_R2'] == results_df['Adjusted_R2'].min()]

# Print results
print("Results:\n", results_df)
print("Model with the least Adjusted R-squared:\n", least_adj_r2_model)


Results:
                    Model        R2  Adjusted_R2
0       LinearRegression  0.000746     0.000613
1  RandomForestRegressor  0.629154     0.629104
2  DecisionTreeRegressor  0.483887     0.483818
3    KNeighborsRegressor  0.634158     0.634109
Model with the least Adjusted R-squared:
               Model        R2  Adjusted_R2
0  LinearRegression  0.000746     0.000613
