24h data fix 

In [3]:
import pandas as pd

# Aggregate one quarterly trip file into per-station, per-hour counts.
def process_trip_data(filename):
    """Count departures and arrivals per station per clock hour.

    Parameters
    ----------
    filename : str or path-like
        CSV readable by ``pd.read_csv`` that contains the columns
        ``start_time``, ``end_time``, ``from_station_id`` and
        ``to_station_id``.

    Returns
    -------
    pandas.DataFrame
        Columns ``station_id``, ``hour``, ``outbound_count`` and
        ``inbound_count``. There is one row for every (station, hour) pair
        that had at least one departure or arrival; a side with no trips in
        that hour is reported as 0. Both count columns are integers.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the file.
    """
    # 1. Load the raw trips.
    df = pd.read_csv(filename)

    # 2. Parse the timestamps and truncate them to the start of their hour.
    #    The lowercase 'h' alias is used because the uppercase 'H' spelling
    #    is deprecated in recent pandas releases.
    df['start_time'] = pd.to_datetime(df['start_time'])
    df['end_time'] = pd.to_datetime(df['end_time'])
    df['start_hour'] = df['start_time'].dt.floor('h')
    df['end_hour'] = df['end_time'].dt.floor('h')

    # 3. Departures per (origin station, hour), keyed by the trip start time.
    outbound_counts = (
        df.groupby(['from_station_id', 'start_hour'])
        .size()
        .reset_index(name='outbound_count')
        .rename(columns={'from_station_id': 'station_id', 'start_hour': 'hour'})
    )

    # 4. Arrivals per (destination station, hour), keyed by the trip end time.
    inbound_counts = (
        df.groupby(['to_station_id', 'end_hour'])
        .size()
        .reset_index(name='inbound_count')
        .rename(columns={'to_station_id': 'station_id', 'end_hour': 'hour'})
    )

    # 5. Outer merge keeps (station, hour) pairs that appear on only one side.
    hourly_counts = pd.merge(outbound_counts, inbound_counts, on=['station_id', 'hour'], how='outer')

    # 6. Only the count columns can be missing after the merge; fill and cast
    #    just those so the key columns are never touched by the fill value.
    hourly_counts = hourly_counts.fillna({'outbound_count': 0, 'inbound_count': 0}).astype(
        {'outbound_count': int, 'inbound_count': int}
    )

    return hourly_counts

# Build the hourly station counts for all four 2019 quarters (Q1 through Q4).
trip_files = [
    'Divvy_Trips_2019_Q1.csv',
    'Divvy_Trips_2019_Q2.csv',
    'Divvy_Trips_2019_Q3.csv',
    'Divvy_Trips_2019_Q4.csv',
]
output_files = [
    'Q1_station_hourly_counts.csv',
    'Q2_station_hourly_counts.csv',
    'Q3_station_hourly_counts.csv',
    'Q4_station_hourly_counts.csv',
]

# Every quarter is processed before anything is written; unpacking keeps one
# named frame per quarter available at module level.
q1_data, q2_data, q3_data, q4_data = map(process_trip_data, trip_files)

# Write each quarter's result to its own CSV, without the index column.
for hourly_frame, output_file in zip((q1_data, q2_data, q3_data, q4_data), output_files):
    hourly_frame.to_csv(output_file, index=False)

print("Q1 - Q4 的各站点每小时的进出情况已分别保存至 'Q1-Q4_station_hourly_counts.csv'")



Q1 - Q4 的各站点每小时的进出情况已分别保存至 'Q1-Q4_station_hourly_counts.csv'


In [4]:
import pandas as pd
import glob

# Aggregate one quarterly trip file into per-station trip totals.
def process_trip_data(filename):
    """Count outbound and inbound trips per station for one trip file.

    Note: an earlier cell defines a different function under this same name
    (per-station, per-hour counts); running this cell rebinds the name.

    Parameters
    ----------
    filename : str or path-like
        CSV readable by ``pd.read_csv`` that contains the columns
        ``from_station_id`` and ``to_station_id``.

    Returns
    -------
    pandas.DataFrame
        Columns ``station_id``, ``outbound_trips``, ``inbound_trips`` and
        ``difference_trips`` (the absolute difference of the two counts),
        one row per station that appears as an origin or a destination.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the file.
    """
    # The timestamp columns are not needed for these totals, so they are
    # left unparsed.
    df = pd.read_csv(filename)

    # Trips leaving each station.
    outbound_counts = (
        df.groupby('from_station_id')
        .size()
        .rename_axis('station_id')
        .reset_index(name='outbound_trips')
    )

    # Trips arriving at each station.
    inbound_counts = (
        df.groupby('to_station_id')
        .size()
        .rename_axis('station_id')
        .reset_index(name='inbound_trips')
    )

    # Merging on a shared key keeps station_id free of NaN (and therefore
    # integer-typed) even when a station appears on only one side.
    station_counts = pd.merge(outbound_counts, inbound_counts, on='station_id', how='outer')

    # A station missing from one side had zero trips on that side.
    station_counts = station_counts.fillna({'outbound_trips': 0, 'inbound_trips': 0}).astype(
        {'outbound_trips': int, 'inbound_trips': int}
    )
    station_counts['difference_trips'] = (
        station_counts['outbound_trips'] - station_counts['inbound_trips']
    ).abs()

    # Fix the column order of the returned frame.
    return station_counts[['station_id', 'outbound_trips', 'inbound_trips', 'difference_trips']]

# Quarterly trip files for 2019 (Q1 through Q4).
files = ['Divvy_Trips_2019_Q1.csv', 'Divvy_Trips_2019_Q2.csv', 'Divvy_Trips_2019_Q3.csv', 'Divvy_Trips_2019_Q4.csv']

# Process every quarter, then concatenate once. Collecting the frames first
# avoids re-copying the accumulated data on every iteration and avoids
# concatenating onto an empty DataFrame.
all_data = pd.concat([process_trip_data(file) for file in files], ignore_index=True)

# Sum the outbound and inbound trips of each station over all quarters.
total_counts = (
    all_data.groupby('station_id', as_index=False)[['outbound_trips', 'inbound_trips']].sum()
)

# Recompute the difference from the annual totals. Summing the per-quarter
# absolute differences would overstate the imbalance for any station whose
# net direction flips between quarters, and would no longer agree with the
# outbound/inbound columns next to it.
total_counts['difference_trips'] = (
    total_counts['outbound_trips'] - total_counts['inbound_trips']
).abs()

# Station with the largest annual outbound/inbound imbalance.
max_difference_station = total_counts.loc[total_counts['difference_trips'].idxmax()]

# Report the result.
print("进出站差值最多的站点：")
print(max_difference_station)

# Persist the annual per-station totals (optional).
total_counts.to_csv('annual_station_trip_counts.csv', index=False)


进出站差值最多的站点：
station_id            195.0
outbound_trips      39192.0
inbound_trips       24069.0
difference_trips    15123.0
Name: 177, dtype: float64
