# Data Cleaning and Processing

In [1]:
import glob
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Printed once the imports in this cell have run without raising.
print("Done")

Done


In [2]:
# Loading data
# Sort the matched paths so the combined row order is the same on every run;
# glob.glob returns files in arbitrary, filesystem-dependent order.
csv_paths = sorted(glob.glob("Data/CSV/" + "*.csv"))

# ignore_index=True gives the combined frame a single unique 0..N-1 index.
# Without it each file's own 0-based labels repeat, and any later
# label-aligned operation (e.g. assigning a fresh Series as a column)
# silently matches rows from different files.
df = pd.concat(map(pd.read_csv, csv_paths), ignore_index=True)

df.head(10)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,99FEC93BA843FB20,electric_bike,2021-06-13 14:31:28,2021-06-13 14:34:11,,,,,41.8,-87.59,41.8,-87.6,member
1,06048DCFC8520CAF,electric_bike,2021-06-04 11:18:02,2021-06-04 11:24:19,,,,,41.79,-87.59,41.8,-87.6,member
2,9598066F68045DF2,electric_bike,2021-06-04 09:49:35,2021-06-04 09:55:34,,,,,41.8,-87.6,41.79,-87.59,member
3,B03C0FE48C412214,electric_bike,2021-06-03 19:56:05,2021-06-03 20:21:55,,,,,41.78,-87.58,41.8,-87.6,member
4,B9EEA89F8FEE73B7,electric_bike,2021-06-04 14:05:51,2021-06-04 14:09:59,,,,,41.8,-87.59,41.79,-87.59,member
5,62B943CEAAA420BA,electric_bike,2021-06-03 19:32:01,2021-06-03 19:38:46,,,,,41.78,-87.58,41.78,-87.58,member
6,7E2546FBA79C46EE,electric_bike,2021-06-10 16:30:10,2021-06-10 16:36:21,,,,,41.79,-87.6,41.79,-87.59,member
7,3DDF3BBF6C4C3C89,electric_bike,2021-06-10 17:00:30,2021-06-10 17:06:48,,,,,41.79,-87.59,41.8,-87.59,member
8,2608805637155AB6,electric_bike,2021-06-10 12:46:16,2021-06-10 12:55:02,,,,,41.93,-87.67,41.94,-87.68,member
9,AF529C946F28ED42,electric_bike,2021-06-23 17:57:29,2021-06-23 18:06:40,,,Michigan Ave & Oak St,13042.0,41.88,-87.61,41.901052,-87.623698,member


In [3]:
# Parse both timestamp columns into datetimes, then derive the weekday on
# which each ride started.
# 0 = Monday, 6 = Sunday

for timestamp_col in ("started_at", "ended_at"):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

# started_at is already datetime-typed at this point, so the .dt accessor
# can be used on it directly.
df["day_of_week"] = df["started_at"].dt.weekday

df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,day_of_week
0,99FEC93BA843FB20,electric_bike,2021-06-13 14:31:28,2021-06-13 14:34:11,,,,,41.8,-87.59,41.8,-87.6,member,6
1,06048DCFC8520CAF,electric_bike,2021-06-04 11:18:02,2021-06-04 11:24:19,,,,,41.79,-87.59,41.8,-87.6,member,4
2,9598066F68045DF2,electric_bike,2021-06-04 09:49:35,2021-06-04 09:55:34,,,,,41.8,-87.6,41.79,-87.59,member,4
3,B03C0FE48C412214,electric_bike,2021-06-03 19:56:05,2021-06-03 20:21:55,,,,,41.78,-87.58,41.8,-87.6,member,3
4,B9EEA89F8FEE73B7,electric_bike,2021-06-04 14:05:51,2021-06-04 14:09:59,,,,,41.8,-87.59,41.79,-87.59,member,4


In [4]:
# Calculating ride length as a timedelta (displayed as "D days HH:MM:SS")

# Subtract the two datetime columns directly so every row's duration is
# computed from that same row's timestamps. Wrapping a raw numpy result in a
# new pd.Series would attach a fresh 0..N-1 index, and column assignment
# aligns on index labels: when df's index contains repeated labels (as it does
# after concatenating several CSVs without resetting the index), rows would
# receive durations that belong to other rows.
df["ride_duration"] = df["ended_at"] - df["started_at"]

In [5]:
# Preview the first rows to check the newly added ride_duration column.
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,day_of_week,ride_duration
0,99FEC93BA843FB20,electric_bike,2021-06-13 14:31:28,2021-06-13 14:34:11,,,,,41.8,-87.59,41.8,-87.6,member,6,0 days 00:02:43
1,06048DCFC8520CAF,electric_bike,2021-06-04 11:18:02,2021-06-04 11:24:19,,,,,41.79,-87.59,41.8,-87.6,member,4,0 days 00:06:17
2,9598066F68045DF2,electric_bike,2021-06-04 09:49:35,2021-06-04 09:55:34,,,,,41.8,-87.6,41.79,-87.59,member,4,0 days 00:05:59
3,B03C0FE48C412214,electric_bike,2021-06-03 19:56:05,2021-06-03 20:21:55,,,,,41.78,-87.58,41.8,-87.6,member,3,0 days 00:25:50
4,B9EEA89F8FEE73B7,electric_bike,2021-06-04 14:05:51,2021-06-04 14:09:59,,,,,41.8,-87.59,41.79,-87.59,member,4,0 days 00:04:08


In [6]:
# Write the processed frame to CSV without the index column.
output_path = Path("Data/out/processed.csv")

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
print("Exported Successfully")

Exported Successfully
