In [1]:
import numpy as np
import pandas as pd

In [2]:
#  Load the data from csv fole to Pandas Dataframe
lyft_data = pd.read_csv('/content/drive/MyDrive/data/ride_lyft.csv')

In [3]:
# Display the first 5 rows of the DataFrame
lyft_data.head()

Unnamed: 0,id,timestamp,hour,day,month,datetime,timezone,source,destination,cab_type,...,precipIntensityMax,uvIndexTime,temperatureMin,temperatureMinTime,temperatureMax,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime
0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,1544953000.0,9,16,12,2018-12-16 09:30:07,America/New_York,Haymarket Square,North Station,Lyft,...,0.1276,1544979600,39.89,1545012000,43.68,1544968800,33.73,1545012000,38.07,1544958000
1,4bd23055-6827-41c6-b23b-3c491f24e74d,1543284000.0,2,27,11,2018-11-27 02:00:23,America/New_York,Haymarket Square,North Station,Lyft,...,0.13,1543251600,40.49,1543233600,47.3,1543251600,36.2,1543291200,43.92,1543251600
2,981a3613-77af-4620-a42a-0c0866077d1e,1543367000.0,1,28,11,2018-11-28 01:00:22,America/New_York,Haymarket Square,North Station,Lyft,...,0.1064,1543338000,35.36,1543377600,47.55,1543320000,31.04,1543377600,44.12,1543320000
3,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,1543554000.0,4,30,11,2018-11-30 04:53:02,America/New_York,Haymarket Square,North Station,Lyft,...,0.0,1543507200,34.67,1543550400,45.03,1543510800,30.3,1543550400,38.53,1543510800
4,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,1543463000.0,3,29,11,2018-11-29 03:49:20,America/New_York,Haymarket Square,North Station,Lyft,...,0.0001,1543420800,33.1,1543402800,42.18,1543420800,29.11,1543392000,35.75,1543420800


In [4]:
# Numbers of rows and columns
lyft_data.shape

(307408, 57)

In [5]:
# Convert 'datetime' column to datetime type
lyft_data['datetime'] = pd.to_datetime(lyft_data['datetime'])

In [None]:
# List of unnecessary columns to drop
drop_cols = [
    'id',	'timestamp', 'timezone', 'cab_type', 'product_id', 'temperatureHigh' ,'temperatureHighTime',
    'temperatureLow' ,'temperatureLowTime','apparentTemperatureHigh','apparentTemperatureHighTime',
    'apparentTemperatureLow' ,'apparentTemperatureLowTime' ,'temperatureMin', 'temperatureMinTime',
    'temperatureMax', 'temperatureMaxTime' ,'apparentTemperatureMin' , 'apparentTemperatureMinTime' ,
    'apparentTemperatureMax' ,'apparentTemperatureMaxTime', 'apparentTemperature','windGust' ,'windGustTime',
   'visibility.1',	'sunsetTime','moonPhase','sunriseTime',	'precipIntensityMax',	'ozone','precipProbability',
   'long_summary', 'icon', 'dewPoint','pressure' ,'windBearing' ,'cloudCover','price', 'distance', 'surge_multiplier',
    'temperature', 'short_summary', 'precipIntensity', 'humidity', 'windSpeed', 'visibility', 'uvIndex', 'uvIndexTime',
]

# Drop the unnecessary columns
snapp_data_clean = lyft_data.drop(drop_cols, axis=1)

In [None]:
# Show unique values in 'name' column (service types)
snapp_data_clean['name'].unique()

array(['Shared', 'Lux', 'Lyft', 'Lux Black XL', 'Lyft XL', 'Lux Black'],
      dtype=object)

In [None]:
# Map original service names to simplified categories
mapping = {
    'Shared':'Sharing',
    'Lux':'Eco',
    'Lyft':'Eco',
    'Lux Black XL':'Eco Plus',
    'Lyft XL':'Eco Plus',
    'Lux Black':'Eco Plus',
}

# Apply mapping to create 'service_type'
snapp_data_clean['service_type'] = snapp_data_clean['name'].map(mapping)


In [None]:
# Drop the original 'name' column
snapp_data_clean = snapp_data_clean.drop('name', axis=1)

In [None]:
# Replace all value in 'destination' with a constant value
snapp_data_clean['destination']= 'Poonak Azad University'

In [None]:
# Check the min and max of datetime
snapp_data_clean['datetime'].agg(['min', 'max'])

Unnamed: 0,datetime
min,2018-11-26 03:40:46
max,2018-12-18 19:15:10


In [None]:
# Store old min and max datetime for scaling
old_min = snapp_data_clean['datetime'].min()
old_max = snapp_data_clean['datetime'].max()

# Define new min and max datetime for scaling
new_min = pd.Timestamp('2024-09-22 07:00:00')
new_max = pd.Timestamp('2025-03-18 22:00:59')

# Scale datetime to new range
snapp_data_clean['datetime_scaled'] = (
    (snapp_data_clean['datetime'] - old_min) /
     (old_max -old_min) ) * (new_max - new_min) + new_min


In [None]:
# Verify the scaled datetime min and max
snapp_data_clean['datetime_scaled'].agg(['min', 'max'])

Unnamed: 0,datetime_scaled
min,2024-09-22 07:00:00
max,2025-03-18 22:00:59


In [None]:
# Drop original datetime and other redundant columns
snapp_data_clean = snapp_data_clean.drop(['datetime', 'day', 'hour', 'month'], axis=1)


In [None]:
# Function to adjust weekends to previous day (Thursday=3, Friday=4)
def remove_weekend(date):
  day = date.dayofweek
  if day==3:
    return date - pd.Timedelta(days=1)
  elif day==4:
    return date - pd.Timedelta(days=1)
  else:
    return date

# Apply weekend adjustment
snapp_data_clean['datetime'] = snapp_data_clean['datetime_scaled'].apply(remove_weekend)

In [None]:
# Drop the temporary scaled datetime column
snapp_data_clean = snapp_data_clean.drop(['datetime_scaled'], axis=1)

In [None]:
# Extract hour, day, month from datetime
snapp_data_clean['hour'] = snapp_data_clean['datetime'].dt.hour
snapp_data_clean['day'] = snapp_data_clean['datetime'].dt.day
snapp_data_clean['month'] = snapp_data_clean['datetime'].dt.month

In [None]:
# Function to assign part of day based on hour
def assign_part_of_day(hour):
  if 6 <= hour <=11:
    return 'morning'
  elif 12 <= hour <=14:
    return 'noon'
  elif 15 <= hour <=19:
    return 'evening'
  else:
    return 'night'

# Apply part of day assignment
snapp_data_clean['part_of_day'] = snapp_data_clean['hour'].apply(assign_part_of_day)

In [None]:
snapp_data_clean.head()

Unnamed: 0,source,destination,latitude,longitude,service_type,datetime,hour,day,month,part_of_day
0,Haymarket Square,Poonak Azad University,42.2148,-71.033,Sharing,2025-02-27 01:06:03,1,27,2,night
1,Haymarket Square,Poonak Azad University,42.2148,-71.033,Eco,2024-09-29 14:06:02,14,29,9,noon
2,Haymarket Square,Poonak Azad University,42.2148,-71.033,Eco,2024-10-07 02:28:40,2,7,10,night
3,Haymarket Square,Poonak Azad University,42.2148,-71.033,Eco Plus,2024-10-23 01:20:00,1,23,10,night
4,Haymarket Square,Poonak Azad University,42.2148,-71.033,Eco Plus,2024-10-15 20:47:07,20,15,10,night


In [None]:
# Create binary column indicating peak hours
snapp_data_clean['peak_hour'] = snapp_data_clean['hour'].isin([7, 8, 9, 17, 18, 19]).astype(int)

In [None]:
# Show unique sources
snapp_data_clean['source'].unique()

array(['Haymarket Square', 'Back Bay', 'North Station', 'Fenway',
       'Theatre District', 'Beacon Hill', 'Financial District',
       'North End', 'Northeastern University', 'Boston University',
       'West End', 'South Station'], dtype=object)

In [None]:
# Tehran district coordinates
tehran_coords = {
    1: (35.804, 51.429),2: (35.744, 51.365), 3: (35.761, 51.448),
    4: (35.765, 51.518), 5: (35.741, 51.310), 6: (35.727, 51.415),
    7: (35.729, 51.482), 8: (35.733, 51.561), 9: (35.688, 51.310),
    10: (35.685, 51.385),11: (35.676, 51.421),12: (35.672, 51.439),
    13: (35.702, 51.513), 14: (35.686, 51.499), 15: (35.639, 51.499),
    16: (35.646, 51.443), 17: (35.662, 51.363), 18: (35.637, 51.306),
    19: (35.613, 51.348), 20: (35.600, 51.420), 21: (35.688, 51.230),
    22: (35.747, 51.206)
}

# Create list of region names
regions = [f'District{i}' for i in range(1, 23)]

# Randomly assign sources to each row
np.random.seed(42)
snapp_data_clean['source'] = np.random.choice(regions, size=len(snapp_data_clean), replace=True)

# Extract numeric district from source
snapp_data_clean['region_num']= snapp_data_clean['source'].str.extract('(\d+)').astype(int)

# Map latitude and longitude using region numbers
snapp_data_clean['latitude'] = snapp_data_clean['region_num'].map(lambda x: tehran_coords[x][0])
snapp_data_clean['longitude'] = snapp_data_clean['region_num'].map(lambda x: tehran_coords[x][1])

  lyft_data_clean['region_num']= lyft_data_clean['source'].str.extract('(\d+)').astype(int)


In [None]:
# Drop temporary numeric region column
snapp_data_clean = snapp_data_clean.drop('region_num', axis=1)

In [None]:
snapp_data_clean.head()

Unnamed: 0,source,destination,latitude,longitude,service_type,datetime,hour,day,month,part_of_day,peak_hour
0,District7,Poonak Azad University,35.729,51.482,Sharing,2025-02-27 01:06:03,1,27,2,night,0
1,District20,Poonak Azad University,35.6,51.42,Eco,2024-09-29 14:06:02,14,29,9,noon,0
2,District15,Poonak Azad University,35.639,51.499,Eco,2024-10-07 02:28:40,2,7,10,night,0
3,District11,Poonak Azad University,35.676,51.421,Eco Plus,2024-10-23 01:20:00,1,23,10,night,0
4,District8,Poonak Azad University,35.733,51.561,Eco Plus,2024-10-15 20:47:07,20,15,10,night,0


In [None]:
# Create route column combining source and destination
snapp_data_clean['route'] = snapp_data_clean['source'] + '->' + snapp_data_clean['destination']

# Count number of trips per route per day
snapp_data_clean['route_day_count'] = snapp_data_clean.groupby(['route', 'day'])['route'].transform('count')


In [None]:
# Compute mean hour per source
snapp_data_clean['mean_hour_by_source']= snapp_data_clean.groupby('source')['hour'].transform('mean')

In [None]:
# Count service type occurrences per day
snapp_data_clean['service_type_day_count']= snapp_data_clean.groupby(['service_type', 'day'])['service_type'].transform('count')

# Extract weekday from datetime
snapp_data_clean['weekday'] = snapp_data_clean['datetime'].dt.weekday

# Count service type occurrences per weekday
snapp_data_clean['service_type_weekday_count']= snapp_data_clean.groupby(['service_type', 'weekday'])['service_type'].transform('count')

In [None]:
snapp_data_clean.head()

Unnamed: 0,source,destination,latitude,longitude,service_type,datetime,hour,day,month,part_of_day,peak_hour,route,route_day_count,mean_hour_by_source,service_type_day_count,weekday,service_type_weekday_count
0,District7,Poonak Azad University,35.729,51.482,Sharing,2025-02-27 01:06:03,1,27,2,night,0,District7->Poonak Azad University,223,11.571418,864,3,7302
1,District20,Poonak Azad University,35.6,51.42,Eco,2024-09-29 14:06:02,14,29,9,noon,0,District20->Poonak Azad University,313,11.663164,2409,6,15302
2,District15,Poonak Azad University,35.639,51.499,Eco,2024-10-07 02:28:40,2,7,10,night,0,District15->Poonak Azad University,130,11.556827,1055,0,14962
3,District11,Poonak Azad University,35.676,51.421,Eco Plus,2024-10-23 01:20:00,1,23,10,night,0,District11->Poonak Azad University,551,11.542119,5653,2,42300
4,District8,Poonak Azad University,35.733,51.561,Eco Plus,2024-10-15 20:47:07,20,15,10,night,0,District8->Poonak Azad University,571,11.492052,5891,1,21643


In [None]:
# Save final cleaned DataFrame to CSV
snapp_data_clean.to_csv('/content/drive/MyDrive/data/snapp_data_clean.csv', index=False)