In [52]:
import pandas as pd
import datetime as dt
import csv

In [53]:
# Read the complete raw journey CSV; the trailing expression shows (rows, columns).
data = pd.read_csv(filepath_or_buffer='../data/raw/journey.csv')
data.shape

(354634, 11)

In [54]:
# Keep only journeys whose end station number differs from the start station number.
data = data.loc[data['Start station number'].ne(data['End station number'])]
data.shape

(346130, 11)

In [55]:
# Work on an independent copy of the first 1000 filtered journeys.
# NOTE: the `data10` name does not reflect the sample size; later cells refer to it.
data10 = data.iloc[:1000].copy()

In [56]:
# Take one snapshot of the current moment and its weekday (Monday == 0) so the
# whole run is measured against the same reference day.
today_date = dt.datetime.today()
today_day = today_date.weekday()

# Parse the 'Start date' text column into datetime values.
data10_dates = pd.to_datetime(data10['Start date'])

def next_date_same_weekday(original_date, reference_date=None):
    """Return the first day on or after a reference day that shares a weekday.

    Parameters
    ----------
    original_date : datetime-like
        Any object exposing ``weekday()`` (for example ``datetime.datetime``
        or ``pandas.Timestamp``). Only its weekday is used.
    reference_date : datetime.date or datetime.datetime, optional
        Day to count forward from. When omitted, the notebook-level
        ``today_date`` snapshot is used, so every row processed in one run
        is shifted relative to the same day. Pass an explicit value to get
        reproducible results.

    Returns
    -------
    str
        The resulting date formatted as ``YYYYMMDD``.

    Notes
    -----
    The offset is in the range 0..6. When the reference day already falls on
    the wanted weekday the offset is 0 and the reference day itself is
    returned, not the day one week later.
    """
    if reference_date is None:
        # Fall back to the snapshot taken once at notebook level.
        reference_date = today_date

    # With a positive modulus Python's % never returns a negative number,
    # so no explicit "+ 7" is needed to keep the offset non-negative.
    days_ahead = (original_date.weekday() - reference_date.weekday()) % 7
    return (reference_date + dt.timedelta(days=days_ahead)).strftime('%Y%m%d')


# Add two text columns: the shifted calendar day (YYYYMMDD) and the original
# clock time of the journey start (HHMM).
data10 = data10.assign(**{
    'New Start date': data10_dates.apply(next_date_same_weekday),
    'Start time': data10_dates.dt.strftime('%H%M'),
})
data10.head()

Unnamed: 0,Number,Start date,Start station number,Start station,End date,End station number,End station,Bike number,Bike model,Total duration,Total duration (ms),New Start date,Start time
0,146028038,2025-01-31 23:59,1142,"Tooley Street, Bermondsey",2025-02-01 00:35,988,"Great Russell Street, Bloomsbury",22507,CLASSIC,36m 8s,2168663,20250523,2359
1,146028039,2025-01-31 23:59,1142,"Tooley Street, Bermondsey",2025-02-01 00:35,988,"Great Russell Street, Bloomsbury",22066,CLASSIC,36m 14s,2174536,20250523,2359
2,146028040,2025-01-31 23:59,300061,"Crimscott Street, Bermondsey",2025-02-01 00:19,300024,"Dunston Road , Haggerston",53875,CLASSIC,20m 11s,1211944,20250523,2359
3,146028041,2025-01-31 23:59,300058,"The Vale, Chelsea",2025-02-01 00:03,300076,"Lots Road, West Chelsea",54808,CLASSIC,4m 32s,272960,20250523,2359
4,146028042,2025-02-01 00:00,300038,"Star Road, West Kensington",2025-02-01 00:12,300037,"Ravenscourt Park Station, Hammersmith",51308,CLASSIC,12m 54s,774703,20250517,0


In [57]:
# Drop the journey columns that are not carried forward, then turn the
# millisecond duration into whole minutes (floor division) and rename the
# column to match its new unit.
data10 = (
    data10
    .drop(columns=['Number', 'Bike number', 'Total duration', 'End date', 'Start station', 'End station'])
    .assign(**{'Total duration (ms)': lambda frame: pd.to_numeric(frame['Total duration (ms)']) // 60_000})
    .rename(columns={'Total duration (ms)': 'Total duration (m)'})
)
data10.head()

Unnamed: 0,Start date,Start station number,End station number,Bike model,Total duration (m),New Start date,Start time
0,2025-01-31 23:59,1142,988,CLASSIC,36,20250523,2359
1,2025-01-31 23:59,1142,988,CLASSIC,36,20250523,2359
2,2025-01-31 23:59,300061,300024,CLASSIC,20,20250523,2359
3,2025-01-31 23:59,300058,300076,CLASSIC,4,20250523,2359
4,2025-02-01 00:00,300038,300037,CLASSIC,12,20250517,0


In [58]:
# Build a two-column station lookup: terminalName -> "lat,long" text.
# The needed columns are selected explicitly instead of dropping every other
# field by name, so the cell neither fails when the feed omits one of those
# fields nor lets an extra field leak into the merges that follow.
stations_raw = pd.read_xml('../data/raw/livecyclehireupdates.xml')
stations_data = pd.DataFrame({
    'terminalName': stations_raw['terminalName'],
    'station_coordinates': stations_raw['lat'].astype(str) + ',' + stations_raw['long'].astype(str),
})
stations_data.head()

Unnamed: 0,terminalName,station_coordinates
0,1023,"51.52916347,-0.109970527"
1,1018,"51.49960695,-0.197574246"
2,1012,"51.52128377,-0.084605692"
3,1013,"51.53005939,-0.120973687"
4,3420,"51.49313,-0.156876"


In [59]:
# Attach coordinates for both ends of every journey by joining the station
# lookup twice. `merge` defaults to an inner join, so journeys whose station
# number has no match in `stations_data` are dropped from the result.
merged_data10 = (
    data10
    # start station -> start_coordinates
    .merge(stations_data, left_on='Start station number', right_on='terminalName')
    .rename(columns={'station_coordinates': 'start_coordinates'})
    # end station -> end_coordinates; the second join suffixes the duplicated
    # key column as terminalName_x / terminalName_y
    .merge(stations_data, left_on='End station number', right_on='terminalName')
    .rename(columns={'station_coordinates': 'end_coordinates'})
    # the join keys duplicate the station-number columns, so discard them
    .drop(columns=['terminalName_x', 'terminalName_y'])
)
merged_data10

Unnamed: 0,Start date,Start station number,End station number,Bike model,Total duration (m),New Start date,Start time,start_coordinates,end_coordinates
0,2025-01-31 23:59,1142,988,CLASSIC,36,20250523,2359,"51.5034938,-0.07962099","51.51772703,-0.127854211"
1,2025-01-31 23:59,1142,988,CLASSIC,36,20250523,2359,"51.5034938,-0.07962099","51.51772703,-0.127854211"
2,2025-01-31 23:59,300061,300024,CLASSIC,20,20250523,2359,"51.495598,-0.078893","51.53658514,-0.075885686"
3,2025-01-31 23:59,300058,300076,CLASSIC,4,20250523,2359,"51.48512191,-0.174971902","51.4795738,-0.17903854"
4,2025-02-01 00:00,300038,300037,CLASSIC,12,20250517,0000,"51.48724429,-0.205279052","51.49422354,-0.236769936"
...,...,...,...,...,...,...,...,...,...
982,2025-01-31 21:47,1006,3439,CLASSIC,8,20250523,2147,"51.51863,-0.1326763000000141","51.5262503,-0.123509611"
983,2025-01-31 21:47,300031,200017,CLASSIC,41,20250523,2147,"51.53638435,-0.102757578","51.511654,-0.179668"
984,2025-01-31 21:45,1225,300097,CLASSIC,21,20250523,2145,"51.517703,-0.154106","51.491026,-0.209121"
985,2025-01-31 21:45,972,300235,CLASSIC,12,20250523,2145,"51.52554222,-0.138231303","51.541596,-0.12544145"


In [60]:
# Write the merged journeys without the index column; QUOTE_NONNUMERIC puts
# quotes around every non-numeric field in the output file.
merged_data10.to_csv(
    path_or_buf='../data/processed/processed1000.csv',
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)