**UBER DATA ANALYSIS AND ML MODEL**

In [53]:
import pandas as pd
import numpy as np

df=pd.read_csv('uber_rides.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ride_id            200000 non-null  int64  
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 12.2+ MB


In [7]:
# SHAPE OF THE DATASET
df.shape

(200000, 8)

In [9]:
# NULL VALUES IN THE DATASET
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [12]:
# DATATYPES
df.dtypes

ride_id                            int64
fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dtype: object

In [11]:
# CONVERTING THE pickup_datetime TO DATATIME FORMAT
df['pickup_datetime']=pd.to_datetime(df['pickup_datetime'])


In [16]:
# DROP THE NULL VALUES
df=df.dropna()


In [17]:
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [18]:
# AVERAGE PF FARE AMOUNT
df['fare_amount'].mean()

11.359891549457748

In [19]:
import numpy as np

def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of the Earth in kilometers
    return c * r


In [20]:
df['haversine_distance'] = df.apply(lambda row: haversine_distance(
    row['pickup_latitude'],
    row['pickup_longitude'],
    row['dropoff_latitude'],
    row['dropoff_longitude']
), axis=1)


In [21]:
median_haversine_distance=df['haversine_distance'].median()


In [22]:
median_haversine_distance

2.1209923961833708

In [23]:
max_haversine_distance=df['haversine_distance'].max()


In [24]:
max_haversine_distance

16409.239135313164

In [25]:
(df['haversine_distance']==0.0).sum()


5632

In [26]:
#'haversine_distance' is 0.0
zero_distance_rides=df[df['haversine_distance']==0.0]

# the mean 'fare_amount' for those rides
mean_fare_for_zero_distance_rides=zero_distance_rides['fare_amount'].mean()


In [27]:
mean_fare_for_zero_distance_rides

11.585317826704546

In [28]:
df['fare_amount'].max()

499.0

In [29]:
# Find the ride with the highest 'fare_amount'
costliest_ride=df[df['fare_amount']==df['fare_amount'].max()]

# Calculate the Haversine distance for the costliest ride
haversine_distance_costliest_ride = haversine_distance(
    costliest_ride['pickup_latitude'].values[0],
    costliest_ride['pickup_longitude'].values[0],
    costliest_ride['dropoff_latitude'].values[0],
    costliest_ride['dropoff_longitude'].values[0]
)


In [30]:
haversine_distance_costliest_ride

0.0007899213191009993

In [55]:
# Convert 'pickup_datetime' to a datetime data type if not already done
df['pickup_datetime']=pd.to_datetime(df['pickup_datetime'])

# the year from the 'pickup_datetime' column
df['pickup_year']=df['pickup_datetime'].dt.year

# Count the number of rides that occurred in 2014
rides_in_2014=(df['pickup_year']==2014).sum()


In [56]:
rides_in_2014

29968

In [57]:

# the first quarter of 2014 (January 1, 2014, to March 31, 2014)
first_quarter_rides=df[(df['pickup_datetime'].dt.month >= 1)&(df['pickup_datetime'].dt.month<=3)]

# Count the number of rides in the first quarter of 2014
rides_in_first_quarter_2014=len(first_quarter_rides)


In [58]:
rides_in_first_quarter_2014

53126

In [59]:

# Extract the day of the week and year
df['day_of_week']=df['pickup_datetime'].dt.day_name()
df['year']=df['pickup_datetime'].dt.year

#  September 2010
september_2010_rides = df[(df['year'] == 2010) & (df['pickup_datetime'].dt.month == 9)]

# Group by day of the week and count rides
rides_by_day_of_week = september_2010_rides['day_of_week'].value_counts()

# the day with the maximum number of rides
max_rides_day = rides_by_day_of_week.idxmax()
max_rides_count = rides_by_day_of_week.max()

print(f"The day with the maximum rides in September 2010 was {max_rides_day} with {max_rides_count} rides.")


The day with the maximum rides in September 2010 was Thursday with 457 rides.


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199999 entries, 0 to 199999
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype              
---  ------              --------------   -----              
 0   ride_id             199999 non-null  int64              
 1   fare_amount         199999 non-null  float64            
 2   pickup_datetime     199999 non-null  datetime64[ns, UTC]
 3   pickup_longitude    199999 non-null  float64            
 4   pickup_latitude     199999 non-null  float64            
 5   dropoff_longitude   199999 non-null  float64            
 6   dropoff_latitude    199999 non-null  float64            
 7   passenger_count     199999 non-null  int64              
 8   haversine_distance  199999 non-null  float64            
 9   pickup_year         199999 non-null  int64              
 10  day_of_week         199999 non-null  object             
 11  year                199999 non-null  int64              
dtypes: datetime64[ns

MACHINE LEARNING MODEL

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

# Extract the day of the week and year
df['day_of_week'] = df['pickup_datetime'].dt.day_name()
df['year'] = df['pickup_datetime'].dt.year

# One-hot encode 'ride_week_day'
df = pd.get_dummies(df, columns=['day_of_week'], drop_first=True)

# Split the data into training and testing sets (70-30 split)
X = df[['passenger_count', 'haversine_distance'] + [col for col in df.columns if col.startswith('day_of_week')]]
y = df['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and fit the regression models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'K-Nearest Neighbors Regressor': KNeighborsRegressor()
}

adjusted_r2_scores = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    # Calculate adjusted R-squared
    n = len(y_test)
    p = X_test.shape[1]
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

    adjusted_r2_scores[model_name] = adjusted_r2

# Find the algorithm with the least adjusted R-squared value
worst_algorithm = min(adjusted_r2_scores, key=adjusted_r2_scores.get)
worst_score = adjusted_r2_scores[worst_algorithm]

print("Adjusted R-squared scores:")
for model_name, score in adjusted_r2_scores.items():
    print(f"{model_name}: {score}")

print(f"The algorithm with the least adjusted R-squared value is {worst_algorithm} with a score of {worst_score}.")


Adjusted R-squared scores:
Linear Regression: 0.00031314876430676986
Decision Tree Regressor: 0.4770868871423056
Random Forest Regressor: 0.6289926834446569
K-Nearest Neighbors Regressor: 0.6335937119601454
The algorithm with the least adjusted R-squared value is Linear Regression with a score of 0.00031314876430676986.
