In [3]:
import pandas as pd
data = pd.read_csv('uber_rides_data.xlsx - sample_train.csv')

In [4]:
data.shape

(200000, 8)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ride_id            200000 non-null  int64  
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 12.2+ MB


In [6]:
missing_values_dropoff_longitude = data['dropoff_longitude'].isnull().sum()
print(missing_values_dropoff_longitude)

1


In [7]:
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])

In [8]:
data_cleaned = data.dropna()
average_fare_amount = data_cleaned['fare_amount'].mean()
print(average_fare_amount)

11.359891549457748


In [9]:
import numpy as np

def haversine_distance(lat1, lon1, lat2, lon2):
  R = 6371  

  lat1_rad = np.radians(lat1)
  lon1_rad = np.radians(lon1)
  lat2_rad = np.radians(lat2)
  lon2_rad = np.radians(lon2)

  dlon = lon2_rad - lon1_rad
  dlat = lat2_rad - lat1_rad

  a = np.sin(dlat / 2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2)**2
  c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

  distance = R * c
  return distance

data_cleaned['haversine_distance'] = data_cleaned.apply(lambda row: haversine_distance(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']), axis=1)

median_haversine_distance = data_cleaned['haversine_distance'].median()
print(median_haversine_distance)


2.1209923961833708


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['haversine_distance'] = data_cleaned.apply(lambda row: haversine_distance(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']), axis=1)


In [10]:
max_haversine_distance = data_cleaned['haversine_distance'].max()
print(max_haversine_distance)

16409.239135313168


In [11]:
zero_distance_rides = data_cleaned[data_cleaned['haversine_distance'] == 0.0]
number_of_zero_distance_rides = len(zero_distance_rides)
print(number_of_zero_distance_rides)

5632


In [12]:
mean_fare_amount_zero_distance = zero_distance_rides['fare_amount'].mean()
print(mean_fare_amount_zero_distance)

11.585317826704546


In [13]:
max_fare_amount = data_cleaned['fare_amount'].max()
print(max_fare_amount)

499.0


In [14]:
costliest_ride = data_cleaned[data_cleaned['fare_amount'] == data_cleaned['fare_amount'].max()]
haversine_distance_costliest_ride = costliest_ride['haversine_distance'].values[0]
print(haversine_distance_costliest_ride)

0.0007899213191009994


In [15]:
rides_2014 = data_cleaned[data_cleaned['pickup_datetime'].dt.year == 2014]
number_of_rides_2014 = len(rides_2014)
print(number_of_rides_2014)

29968


In [16]:
rides_first_quarter_2014 = data_cleaned[(data_cleaned['pickup_datetime'].dt.year == 2014) & (data_cleaned['pickup_datetime'].dt.quarter == 1)]
number_of_rides_first_quarter_2014 = len(rides_first_quarter_2014)
print(number_of_rides_first_quarter_2014)

7687


In [18]:
rides_september_2010 = data_cleaned[(data_cleaned['pickup_datetime'].dt.year == 2010) & (data_cleaned['pickup_datetime'].dt.month == 9)]
day_of_week_counts = rides_september_2010['pickup_datetime'].dt.day_name().value_counts()
max_rides_day = day_of_week_counts.idxmax()
print(max_rides_day)

Thursday


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

data_cleaned['ride_week_day'] = data_cleaned['pickup_datetime'].dt.dayofweek

X = data_cleaned[['passenger_count', 'haversine_distance', 'ride_week_day']]
y = data_cleaned['fare_amount']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

linear_reg = LinearRegression()
decision_tree_reg = DecisionTreeRegressor()
random_forest_reg = RandomForestRegressor()

linear_reg.fit(X_train, y_train)
decision_tree_reg.fit(X_train, y_train)
random_forest_reg.fit(X_train, y_train)

y_pred_linear = linear_reg.predict(X_test)
y_pred_decision_tree = decision_tree_reg.predict(X_test)
y_pred_random_forest = random_forest_reg.predict(X_test)

r2_linear = r2_score(y_test, y_pred_linear)
r2_decision_tree = r2_score(y_test, y_pred_decision_tree)
r2_random_forest = r2_score(y_test, y_pred_random_forest)

print(f"Linear Regression R-squared: {r2_linear}")
print(f"Decision Tree Regression R-squared: {r2_decision_tree}")
print(f"Random Forest Regression R-squared: {r2_random_forest}")

r2_scores = {
    'Linear Regression': r2_linear,
    'Decision Tree Regression': r2_decision_tree,
    'Random Forest Regression': r2_random_forest
}

algorithm_with_least_r2 = min(r2_scores, key=r2_scores.get)
print(f"\nThe algorithm with the least R-squared value is: {algorithm_with_least_r2}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['ride_week_day'] = data_cleaned['pickup_datetime'].dt.dayofweek
