### Import of used packages

## Read and clean data

We downloaded CoGo bicycle trip data from the CoGoBike website. The data is available as CSV for each month. For our project, we use the bicycle trips of the following months and years:
- July 2020
- December 2020
- July 2021
- December 2021
- July 2022
- December 2022
- July 2023
- December 2023

Data source: https://cogo-sys-data.s3.amazonaws.com/index.html

### Read in and clean data, Calculate trip durations

In [1]:
import os
import csv
from datetime import datetime
import pandas as pd


In [None]:
class Cogo_Tripdata:
    """Clean the monthly CoGo trip CSV files and add a trip duration column.

    Every CSV file found in ``input_path`` is read, each row gets a
    'trip_duration [min]' attribute, implausible rows are dropped and the
    result is written to ``output_path`` as "cleaned<original file name>".

    ``count_original`` and ``count_clean`` accumulate the number of rows read
    and kept over all processed files.
    """

    def __init__(self, input_path, output_path):
        self.input_path = input_path
        self.output_path = output_path
        self.count_original = 0
        self.count_clean = 0
        # Raw directory listing; process_files() decides which entries to use.
        self.filelist = os.listdir(input_path)

    def read_csv(self, file_path):
        """Return the rows of a CSV file as a list of dicts keyed by header."""
        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields that contain line breaks.
        with open(file_path, 'r', newline='') as file:
            return list(csv.DictReader(file))

    def write_csv(self, file_path, data, fieldnames):
        """Write the row dicts in ``data`` to a CSV file with the given header."""
        with open(file_path, 'w', newline='') as file:
            csv_writer = csv.DictWriter(file, fieldnames=fieldnames)
            csv_writer.writeheader()
            csv_writer.writerows(data)

    def calculate_trip_duration(self, row):
        """Add the trip duration in whole minutes to ``row`` and return it.

        'started_at' and 'ended_at' are read as 'YYYY-MM-DD HH:MM:SS' strings.
        The result is stored in place under 'trip_duration [min]'.

        Raises:
            ValueError: if a timestamp does not match the expected format.
        """
        start = datetime.strptime(row['started_at'], '%Y-%m-%d %H:%M:%S')
        end = datetime.strptime(row['ended_at'], '%Y-%m-%d %H:%M:%S')
        # Floor division gives the same whole minutes as truncation for normal
        # trips, but keeps a trip that ends even a few seconds before it starts
        # negative (instead of rounding it up to 0) so clean_data can drop it.
        row['trip_duration [min]'] = int((end - start).total_seconds() // 60)
        return row

    def clean_data(self, data):
        """Return only the plausible rows of ``data``.

        The original data contains corrupt records. A row is dropped when
          * its duration is negative (it ends before it starts),
          * its duration is 100 minutes or longer (limit based on histogram
            plots of all trip durations),
          * any start or end coordinate is missing, or
          * it lasts less than 10 minutes AND ends at the coordinates where it
            started.

        Every row must already carry 'trip_duration [min]'.
        """
        max_duration = 100
        min_duration = 10
        coordinate_keys = ('start_lat', 'start_lng', 'end_lat', 'end_lng')
        cleaned_data = []

        for row in data:
            duration = row['trip_duration [min]']
            if not 0 <= duration < max_duration:
                continue
            if not all(row[key] for key in coordinate_keys):
                continue
            same_place = (row['start_lat'] == row['end_lat']
                          and row['start_lng'] == row['end_lng'])
            if duration < min_duration and same_place:
                continue
            cleaned_data.append(row)

        return cleaned_data

    def process_files(self):
        """Clean every CSV file of the input folder and print a summary.

        Directory entries that are not regular '.csv' files (sub-folders,
        hidden system files, ...) and files without any data row are skipped.
        The output folder is created if it does not exist yet.
        """
        os.makedirs(self.output_path, exist_ok=True)

        for f in self.filelist:
            source = os.path.join(self.input_path, f)
            if not (f.lower().endswith('.csv') and os.path.isfile(source)):
                continue

            data = self.read_csv(source)
            if not data:
                # Without a row there is nothing to clean and no header to copy.
                print(f"Skipping {f}: the file contains no data rows.")
                continue
            self.count_original += len(data)

            data_with_trip_duration = [self.calculate_trip_duration(row) for row in data]

            cleaned_data = self.clean_data(data_with_trip_duration)
            self.count_clean += len(cleaned_data)

            # The header is taken after the duration column was added, so the
            # new attribute is written as well.
            fieldnames = list(data_with_trip_duration[0].keys())
            output_file = os.path.join(self.output_path, "cleaned" + f)
            self.write_csv(output_file, cleaned_data, fieldnames=fieldnames)

        removed = self.count_original - self.count_clean
        if self.count_original:
            share_removed = round((removed / self.count_original) * 100, 3)
        else:
            # Nothing was read: avoid dividing by zero.
            share_removed = 0.0

        print(f"From a total number of {self.count_original} entries, {removed} entries ({share_removed} %) have been removed.")

# Folders with the downloaded monthly files and for the cleaned results.
# Both are relative paths, so they are resolved against the current working
# directory of the running kernel.
input_path = "Documents/GitHub/GEO877-FS24-McKenzie/data/original_data/"
output_path = "Documents/GitHub/GEO877-FS24-McKenzie/data/cleaned_data/"

# Build the cleaner for these two folders and run it over all input files.
cogo_datacleaning = Cogo_Tripdata(input_path=input_path, output_path=output_path)
cogo_datacleaning.process_files()


The following code merges several cleaned CSV files into a single file, for example to create the combined files for summer (July) and winter (December).

In [2]:
# Enter path to the folder with cleaned data here.
data = "/Users/benedikt/Documents/GitHub/GEO877-FS24-McKenzie/data/cleaned_data/"

# Enter a string that should be found in the file name. To find all files from July, enter "07-cogo"
# and to find all files for December enter "12-cogo".
look_for = "07-cogo"

# Name of the merged file. Change it together with look_for, otherwise the
# merged file of another month is overwritten.
output_name = 'cleaned_July-cogo-tripdata.csv'

# os.listdir returns the entries in arbitrary order; sorting makes the row
# order of the merged file reproducible.
data_files = sorted(os.listdir(data))

# Keep the files that match the criteria. The merged file itself is written to
# the same folder, so it is excluded to never be merged into itself on a re-run.
file_list = [
    os.path.join(data, name)
    for name in data_files
    if look_for in name and name != output_name
]

# pd.concat fails with an unspecific message on an empty list, so report the
# actual problem instead.
if not file_list:
    raise ValueError(f"No file in {data!r} contains {look_for!r} in its name.")

# Read all files from file_list into dataframes.
merge_list = [pd.read_csv(path) for path in file_list]

# merge all the files together
merged_file = pd.concat(merge_list)

# Save it to a csv
merged_file.to_csv(os.path.join(data, output_name), index=False)
