In [195]:
import pandas as pd

In [196]:
# Load the January 2024 Divvy trip export, discard every row that has at
# least one missing value, and renumber the surviving rows from 0.
january_data = (
    pd.read_csv('Data/January/202401-divvy-tripdata.csv')
    .dropna()
    .reset_index(drop=True)
)

# Percentage share of rides per start station (ten most common stations).
start_station_share = january_data['start_station_name'].value_counts(normalize=True) * 100
print(start_station_share.head(10))

start_station_name
Clinton St & Washington Blvd    1.222234
Kingsbury St & Kinzie St        1.202024
University Ave & 57th St        1.113279
Clinton St & Madison St         1.074617
Ellis Ave & 60th St             1.014867
Ellis Ave & 55th St             0.864614
Clark St & Elm St               0.831225
Larrabee St & Kingsbury St      0.803986
Canal St & Adams St             0.784655
Clinton St & Jackson Blvd       0.774111
Name: proportion, dtype: float64


In [197]:
# Draw a reproducible 10% random sample of the cleaned rides (fixed seed)
# and renumber its rows from 0.
january_sample = (
    january_data
    .sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
)

# Same top-10 start-station percentages as printed for the full data, so the
# two listings can be compared by eye.
sample_station_share = january_sample['start_station_name'].value_counts(normalize=True) * 100
print(sample_station_share.head(10))

start_station_name
Clinton St & Washington Blvd    1.203761
Kingsbury St & Kinzie St        1.107108
Clinton St & Madison St         1.028029
University Ave & 57th St        1.019243
Clark St & Elm St               0.896231
Clinton St & Jackson Blvd       0.887444
Ellis Ave & 60th St             0.878657
Canal St & Adams St             0.808365
Canal St & Madison St           0.790792
Ellis Ave & 55th St             0.746859
Name: proportion, dtype: float64


In [198]:
# Add an integer '<column>_code' companion for each categorical text column.
# The codes come from pandas' category dtype, so they are derived from the
# values present in this sample.
for source_col in ('rideable_type', 'member_casual'):
    category_values = january_sample[source_col].astype('category')
    january_sample[f'{source_col}_code'] = category_values.cat.codes

In [199]:
# Show the column labels currently present on the sampled frame.
print(january_sample.columns)

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'rideable_type_code', 'member_casual_code'],
      dtype='object')


In [200]:
# Parse both ride timestamps from 'YYYY-MM-DD HH:MM:SS' text into datetimes.
timestamp_format = '%Y-%m-%d %H:%M:%S'
for timestamp_col in ('started_at', 'ended_at'):
    january_sample[timestamp_col] = pd.to_datetime(
        january_sample[timestamp_col], format=timestamp_format
    )

# Column groups and the dtype each group is cast to:
# coordinates become floats, station IDs become strings.
lat_lng_cols = ['start_lat', 'start_lng', 'end_lat', 'end_lng']
station_id_cols = ['start_station_id', 'end_station_id']

for column_group, target_dtype in ((lat_lng_cols, float), (station_id_cols, str)):
    january_sample[column_group] = january_sample[column_group].astype(target_dtype)

In [201]:
# Derived trip features, all computed from the parsed ride timestamps.
trip_start = january_sample['started_at']
trip_duration = january_sample['ended_at'] - trip_start

# Ride duration in (fractional) minutes.
january_sample['ride_length'] = trip_duration.dt.total_seconds() / 60
# Start hour as a 12-hour clock label, e.g. '06 PM'.
january_sample['hour_of_day'] = trip_start.dt.strftime('%I %p')
# Weekday name of the start time, e.g. 'Tuesday'.
january_sample['day_of_week'] = trip_start.dt.day_name()

In [202]:
# Keep only rides lasting between 0 and 1440 minutes (24 hours), bounds
# included; negative and longer durations are dropped.
has_valid_length = january_sample['ride_length'].between(0, 1440)
january_sample = january_sample[has_valid_length]

In [None]:
# Keep only rides whose start AND end coordinates fall inside the
# Chicago-area bounding box (bounds are inclusive).
lat_range = (41.6, 42.1)
lng_range = (-88.0, -87.5)

within_chicago = (
    january_sample['start_lat'].between(*lat_range)
    & january_sample['start_lng'].between(*lng_range)
    & january_sample['end_lat'].between(*lat_range)
    & january_sample['end_lng'].between(*lng_range)
)
january_sample = january_sample[within_chicago]


11381


In [None]:
# Reorder the columns so related fields sit next to each other:
# ride/bike identity, timing, start station, end station, rider type.
ride_cols = ['ride_id', 'rideable_type', 'rideable_type_code']
timing_cols = ['started_at', 'ended_at', 'ride_length', 'day_of_week', 'hour_of_day']
start_station_cols = ['start_station_name', 'start_station_id', 'start_lat', 'start_lng']
end_station_cols = ['end_station_name', 'end_station_id', 'end_lat', 'end_lng']
rider_cols = ['member_casual', 'member_casual_code']

reordered_columns = (
    ride_cols + timing_cols + start_station_cols + end_station_cols + rider_cols
)

january_sample = january_sample.loc[:, reordered_columns]

# Preview the first rows in the new column order.
print(january_sample.head())

            ride_id rideable_type  rideable_type_code          started_at  \
0  C6EA0F39B80D51E9  classic_bike                   0 2024-01-02 18:27:09   
1  940E4EFE3068BCE5  classic_bike                   0 2024-01-25 08:01:10   
2  07AF79F9C2A94365  classic_bike                   0 2024-01-16 16:55:34   
3  76F9ABE3E712308D  classic_bike                   0 2024-01-02 08:46:33   
4  C322CB3D4C9BD026  classic_bike                   0 2024-01-23 14:50:34   

             ended_at  ride_length day_of_week hour_of_day  \
0 2024-01-02 18:52:02    24.883333     Tuesday       06 PM   
1 2024-01-25 08:19:46    18.600000    Thursday       08 AM   
2 2024-01-16 17:10:45    15.183333     Tuesday       04 PM   
3 2024-01-02 08:51:58     5.416667     Tuesday       08 AM   
4 2024-01-23 15:08:10    17.600000     Tuesday       02 PM   

            start_station_name start_station_id  start_lat  start_lng  \
0        Shields Ave & 31st St     KA1503000038  41.838464 -87.635406   
1        Emerald A

In [210]:
# Write the cleaned sample to a CSV file.
# index=False: after the row filters above, the frame's index is just
# leftover, non-contiguous row labels. Writing it (the to_csv default) adds a
# nameless first column that comes back as 'Unnamed: 0' when the file is read
# with pd.read_csv, so it is left out of the export.
january_sample.to_csv('Data/January/Divvy_cleaned_January_2024.csv', index=False)