In [15]:
import pandas as pd
import glob
import os

In [23]:
# Path to your data directory
data_dir = '/Users/zhuoyuelian/PycharmProjects/CS5800/data/data'

# Get all Citibike CSV files
csv_files = glob.glob(os.path.join(data_dir, '2024*-citibike-tripdata*.csv'))

# Sort the files to process them in chronological order
csv_files.sort()

# Print the files we're going to merge
print(f"Found {len(csv_files)} CSV files to merge:")
for file in csv_files:
    print(f"  - {os.path.basename(file)}")

# Create an empty dataframe to store the combined data
combined_df = None

# Read and combine all CSV files one by one
for file in csv_files:
    try:
        print(f"Reading {os.path.basename(file)}...")
        # Use low_memory=False to handle mixed data types
        df = pd.read_csv(file, low_memory=False)
        print(f"  - Shape: {df.shape}")

        # Print column names to help diagnose issues
        print(f"  - Columns: {df.columns.tolist()}")

        # If this is the first file, use it as the base
        if combined_df is None:
            combined_df = df
            print(f"  - First file set as base with {len(combined_df)} rows")
        else:
            # Ensure columns match by using only common columns
            common_columns = list(set(combined_df.columns) & set(df.columns))
            if len(common_columns) < len(combined_df.columns):
                print(f"  - Warning: Only {len(common_columns)} common columns found out of {len(combined_df.columns)}")

            # Append the current dataframe to the combined one
            combined_df = pd.concat([combined_df[common_columns], df[common_columns]], ignore_index=True)
            print(f"  - Combined size now: {len(combined_df)} rows")

    except Exception as e:
        print(f"Error processing {os.path.basename(file)}: {str(e)}")
        print("Skipping this file and continuing with others")

# If we managed to combine any files
if combined_df is not None:
    # Print the shape of the combined dataframe
    print(f"\nFinal combined dataframe shape: {combined_df.shape}")

    # Save the combined dataframe to a new CSV file
    output_path = 'data/merged/all_citibike_data_2024.csv'
    combined_df.to_csv(output_path, index=False)
    print(f"\nAll data merged and saved to: {output_path}")
else:
    print("No data could be combined. Please check the file formats.")

Found 14 CSV files to merge:
  - 202401-citibike-tripdata.csv
  - 202402-citibike-tripdata.csv
  - 202403-citibike-tripdata.csv
  - 202404-citibike-tripdata.csv
  - 202405-citibike-tripdata_1.csv
  - 202405-citibike-tripdata_2.csv
  - 202405-citibike-tripdata_3.csv
  - 202405-citibike-tripdata_4.csv
  - 202405-citibike-tripdata_5.csv
  - 202406-citibike-tripdata_1.csv
  - 202406-citibike-tripdata_2.csv
  - 202406-citibike-tripdata_3.csv
  - 202406-citibike-tripdata_4.csv
  - 202406-citibike-tripdata_5.csv
Reading 202401-citibike-tripdata.csv...
  - Shape: (1888085, 13)
  - Columns: ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']
  - First file set as base with 1888085 rows
Reading 202402-citibike-tripdata.csv...
  - Shape: (2121501, 13)
  - Columns: ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_stat

In [21]:
print(f"Current working directory: {os.getcwd()}")

Current working directory: /Users/zhuoyuelian/PycharmProjects/CS5800


In [24]:
file_path = 'data/merged/all_citibike_data_2024.csv'
# Specify that these columns should be read as strings
df = pd.read_csv(file_path, dtype={'start_station_id': str, 'end_station_id': str})

In [25]:
df.head()

Unnamed: 0,ended_at,start_station_name,member_casual,end_lng,started_at,start_station_id,end_station_name,start_lat,start_lng,end_lat,end_station_id,ride_id,rideable_type
0,2024-01-22 18:48:10.708,Frederick Douglass Blvd & W 145 St,member,-73.951878,2024-01-22 18:43:19.012,7954.12,St Nicholas Ave & W 126 St,40.823072,-73.941738,40.811432,7756.1,5078F3D302000BD2,electric_bike
1,2024-01-11 19:47:36.007,W 54 St & 6 Ave,member,-73.954823,2024-01-11 19:19:18.721,6771.13,E 74 St & 1 Ave,40.761822,-73.977036,40.768974,6953.08,814337105D37302A,electric_bike
2,2024-01-30 19:32:49.857,E 11 St & Ave B,casual,-74.008515,2024-01-30 19:17:41.693,5659.11,W 10 St & Washington St,40.727592,-73.979751,40.733424,5847.06,A33A920E2B10710C,electric_bike
3,2024-01-27 11:38:01.213,W 54 St & 6 Ave,member,-73.954823,2024-01-27 11:27:01.759,6771.13,E 74 St & 1 Ave,40.761779,-73.977144,40.768974,6953.08,A3A5FC0DD7D34D74,electric_bike
4,2024-01-16 15:29:26.156,Madison Ave & E 99 St,member,-73.954823,2024-01-16 15:15:41.000,7443.01,E 74 St & 1 Ave,40.789808,-73.952214,40.768974,6953.08,6F96728ECEFBDAA4,electric_bike


In [26]:
# Check missing values
print("\nMissing values by column:")
print(df.isnull().sum())

# Filter rows with missing values (if any)
missing_rows = df[df.isnull().any(axis=1)]
print(f"\nNumber of rows with missing values: {len(missing_rows)}")


Missing values by column:
ended_at                  0
start_station_name    11813
member_casual             0
end_lng                5042
started_at                0
start_station_id      11813
end_station_name      43726
start_lat                 0
start_lng                 0
end_lat                5042
end_station_id        46620
ride_id                   0
rideable_type             0
dtype: int64

Number of rows with missing values: 51886


In [27]:
# Remove rows with missing values
df_cleaned = df.dropna()

# Check the shape of the original and cleaned dataframes
print(f"Original dataframe shape: {df.shape}")
print(f"Cleaned dataframe shape: {df_cleaned.shape}")
print(f"Number of rows removed: {df.shape[0] - df_cleaned.shape[0]}")

# Save the cleaned dataframe to a new CSV file
output_path = 'data/cleaned/202401-202406-citibike-tripdata-cleaned.csv'
df_cleaned.to_csv(output_path, index=False)

print(f"Cleaned data saved to: {output_path}")

Original dataframe shape: (18903880, 13)
Cleaned dataframe shape: (18851994, 13)
Number of rows removed: 51886
Cleaned data saved to: data/cleaned/202401-202406-citibike-tripdata-cleaned.csv


In [29]:
file_path = 'data/cleaned/202401-202406-citibike-tripdata-cleaned.csv'
# Specify that these columns should be read as strings
df = pd.read_csv(file_path, dtype={'start_station_id': str, 'end_station_id': str})

In [30]:
# Check missing values
print("\nMissing values by column:")
print(df.isnull().sum())

# Filter rows with missing values (if any)
missing_rows = df[df.isnull().any(axis=1)]
print(f"\nNumber of rows with missing values: {len(missing_rows)}")


Missing values by column:
ended_at              0
start_station_name    0
member_casual         0
end_lng               0
started_at            0
start_station_id      0
end_station_name      0
start_lat             0
start_lng             0
end_lat               0
end_station_id        0
ride_id               0
rideable_type         0
dtype: int64

Number of rows with missing values: 0
