In [1]:
import pandas as pd
import math


# Monthly trip-data CSV exports to load; the YYYYMM stamps in the filenames are
# 202207, 202307 and 202212.
# NOTE(review): the first filename is spelled 'citbike' while the other two use
# 'citibike'. It is kept exactly as written -- confirm it matches the file on disk.
TRIP_CSV_FILES = (
    'JC-202207-citbike-tripdata.csv',
    'JC-202307-citibike-tripdata.csv',
    'JC-202212-citibike-tripdata.csv',
)

# One DataFrame per file, bound to the names the concat cell expects.
df = pd.read_csv(TRIP_CSV_FILES[0])
df2 = pd.read_csv(TRIP_CSV_FILES[1])
df4 = pd.read_csv(TRIP_CSV_FILES[2])

In [2]:
# Stack the three monthly frames row-wise into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every
# source frame keeps its own row labels, so the same label can occur once per
# file and label-based selection (.loc) or index alignment on df3 becomes
# ambiguous.
df3 = pd.concat([df, df2, df4], axis=0, ignore_index=True)


In [3]:

# Parse both trip timestamp columns into pandas datetimes.
for timestamp_col in ('started_at', 'ended_at'):
    df3[timestamp_col] = pd.to_datetime(df3[timestamp_col])

# Trip length as a timedelta (end minus start), then expressed in minutes.
trip_length = df3['ended_at'] - df3['started_at']
df3['duration_minutes'] = trip_length.dt.total_seconds() / 60.0

In [4]:
# Start time expressed as a fractional hour of the day (e.g. 18:30 -> 18.5).
# Seconds are not included, only the hour and minute components.
trip_start = df3['started_at'].dt
df3['time_of_day'] = trip_start.hour + trip_start.minute / 60.0


In [5]:

def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Return the great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius_km : float, optional
        Sphere radius used to scale the central angle. Defaults to 6371.0
        (Earth radius in kilometers), so the result is in kilometers.

    Returns
    -------
    float
        Distance along the sphere's surface, in the same unit as ``radius_km``.
        If any coordinate is NaN the result is NaN.
    """
    # Convert latitude and longitude from degrees to radians
    phi1, lam1, phi2, lam2 = map(math.radians, (lat1, lon1, lat2, lon2))

    # Differences in latitude and longitude
    dphi = phi2 - phi1
    dlam = lam2 - lam1

    # Haversine formula: a is the squared half-chord length between the points
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlam / 2) ** 2

    # Floating-point rounding can push a marginally above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError. Clamp at zero.
    # The explicit comparison (rather than max()) leaves NaN untouched so that
    # missing coordinates still yield NaN.
    one_minus_a = 1.0 - a
    if one_minus_a < 0.0:
        one_minus_a = 0.0

    # Central angle in radians
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(one_minus_a))

    return radius_km * c


In [6]:

# Calculate the start-to-end haversine distance for every trip.
# The four coordinate columns are walked in lockstep instead of using
# DataFrame.apply(axis=1): apply builds a Series object for each row, which is
# far slower on a frame of this size, while the values produced are the same.
# Rows with a missing coordinate come out as NaN.
df3['distance_km'] = [
    haversine_distance(start_lat, start_lng, end_lat, end_lng)
    for start_lat, start_lng, end_lat, end_lng in zip(
        df3['start_lat'], df3['start_lng'], df3['end_lat'], df3['end_lng']
    )
]




In [7]:


# Function to classify durations
def classify_duration(duration):
    """Map a duration (in minutes) to a 20-minute-wide bucket label.

    Buckets are closed on the right: a value equal to a boundary belongs to
    the lower bucket (20 -> '0-20', 40 -> '20-40', ...). Everything at or
    below 20, including zero and negative values, is labelled '0-20';
    everything above 120 is labelled 'larger than 120'.

    Parameters
    ----------
    duration : float
        Duration in minutes.

    Returns
    -------
    str or float
        The bucket label, or NaN when ``duration`` is NaN. A missing duration
        is kept missing rather than being reported as 'larger than 120'.
    """
    # NaN compares False against every bound, so it must be caught up front.
    if math.isnan(duration):
        return math.nan

    # (inclusive upper bound, label), in ascending order; first match wins.
    buckets = (
        (20, '0-20'),
        (40, '20-40'),
        (60, '40-60'),
        (80, '60-80'),
        (100, '80-100'),
        (120, '100-120'),
    )
    for upper_bound, label in buckets:
        if duration <= upper_bound:
            return label
    return 'larger than 120'

# Label every trip with its duration bucket; the classifier is applied
# element-wise to the duration_minutes column.
df3['duration_class'] = df3['duration_minutes'].map(classify_duration)




In [8]:

# Function to classify distances
def classify_distance(distance):
    """Map a distance (in kilometers) to a distance-bucket label.

    Buckets are closed on the right: a value equal to a boundary belongs to
    the lower bucket (0.5 -> '0 to 0.5', 1 -> '0.5 to 1', ...). Everything at
    or below 0.5 is labelled '0 to 0.5'; everything above 12 is labelled
    'longer'.

    Parameters
    ----------
    distance : float
        Distance in kilometers.

    Returns
    -------
    str or float
        The bucket label, or NaN when ``distance`` is NaN. A trip whose
        distance could not be computed (missing coordinates) is kept missing
        rather than being reported as 'longer'.
    """
    # NaN compares False against every bound, so it must be caught up front.
    if math.isnan(distance):
        return math.nan

    # (inclusive upper bound, label), in ascending order; first match wins.
    buckets = (
        (0.5, '0 to 0.5'),
        (1, '0.5 to 1'),
        (1.5, '1 to 1.5'),
        (2, '1.5 to 2'),
        (4, '2 to 4'),
        (8, '4 to 8'),
        (12, '8 to 12'),
    )
    for upper_bound, label in buckets:
        if distance <= upper_bound:
            return label
    return 'longer'

# Label every trip with its distance bucket; the classifier is applied
# element-wise to the distance_km column. The target column name keeps its
# trailing underscore.
df3['distance_class_'] = df3['distance_km'].map(classify_distance)



In [9]:
# Write the enriched trip table to disk; the row index is not written.
OUTPUT_CSV = 'bike.csv'
df3.to_csv(OUTPUT_CSV, index=False)

In [10]:
# Preview the first five rows, including the derived columns.
df3.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration_minutes,time_of_day,distance_km,duration_class,distance_class_
0,89CDCE3224502904,classic_bike,2022-07-21 18:19:45,2022-07-21 18:28:21,Marshall St & 2 St,HB408,8 St & Washington St,HB603,40.740802,-74.042521,40.745984,-74.028199,member,8.6,18.316667,1.337089,0-20,1 to 1.5
1,3E6E50F51BD4CBD9,electric_bike,2022-07-21 17:50:57,2022-07-21 17:59:17,Marshall St & 2 St,HB408,11 St & Washington St,HB502,40.740802,-74.042521,40.749985,-74.02715,casual,8.333333,17.833333,1.649043,0-20,1.5 to 2
2,8A9058FAA9115562,electric_bike,2022-07-23 11:51:55,2022-07-23 11:59:16,Marshall St & 2 St,HB408,Oakland Ave,JC022,40.740802,-74.042521,40.737604,-74.052478,member,7.35,11.85,0.911181,0-20,0.5 to 1
3,C566A187663748AE,classic_bike,2022-07-30 20:34:40,2022-07-30 20:48:40,Marshall St & 2 St,HB408,Bloomfield St & 15 St,HB203,40.740802,-74.042521,40.75453,-74.02658,member,14.0,20.566667,2.033093,0-20,2 to 4
4,7E754D57276BE57E,classic_bike,2022-07-14 18:42:34,2022-07-14 18:49:45,Marshall St & 2 St,HB408,Clinton St & 7 St,HB303,40.740802,-74.042521,40.74542,-74.03332,casual,7.183333,18.7,0.929804,0-20,0.5 to 1


In [11]:
# Average trip duration in minutes across all loaded trips.
df3['duration_minutes'].mean()

16.68112036001188

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame.
summary_stats = df3.describe()
summary_stats

Unnamed: 0,start_lat,start_lng,end_lat,end_lng,duration_minutes,time_of_day,distance_km
count,263695.0,263695.0,263170.0,263170.0,263695.0,263695.0,263170.0
mean,40.731908,-74.040257,40.731889,-74.039987,16.68112,14.782121,1.171944
std,0.01221,0.012058,0.012335,0.012039,122.027854,5.261381,0.819117
min,40.705897,-74.087223,40.65,-74.16,-0.016667,0.0,0.0
25%,40.721063,-74.045953,40.719586,-74.045572,4.45,10.883333,0.642223
50%,40.734749,-74.037951,40.734786,-74.037683,7.166667,15.75,1.029544
75%,40.740973,-74.031039,40.740973,-74.03097,12.283333,18.85,1.556463
max,40.761599,-74.016154,40.86448,-73.89522,23956.05,23.983333,20.361788
