In [1]:
####################################################################
##  
##  ETL PROJECT - TEAM 5
##          - Fatma Butun (Chicago Data)
##          - George Alonzo (Los Angeles Data)
##
##
####################################################################

In [2]:
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime

# OVERVIEW

>This project will perform the ETL process on 2 CSV files relating to 2020Q1 bike share data from the City of Chicago and the City of Los Angeles.  We will begin by loading each CSV file into separate, initial dataframes.  From there, we will begin the transformation process by focusing on only the required columns and add unique prefixes and static city code to differentiate between the two data sets once they are loaded into the same table.  Date/timestamps will be split into two separate columns and the duration will be calculated for where it is not already provided.  Columns will be renamed and reordered to remain consistent as we prep to insert into the SQL tables.

>In addition, we will also perform the ETL process to store station information separately into its own SQL table to avoid redundant data.  Since the station IDs are defined as integers in both files, we will add a city prefix to the station ID to ensure they are identifyable to the city in which they belong.


# EXTRACT

## *LOS ANGELES*

In [3]:
# Read-in Los Angeles ride share data into an initial dataframe
la_csv = 'data/metro-bike-share-trips-2020-q1.csv'
la_csv_df = pd.read_csv(la_csv)
la_csv_df.head()

Unnamed: 0,trip_id,duration,start_time,end_time,start_station,start_lat,start_lon,end_station,end_lat,end_lon,bike_id,plan_duration,trip_route_category,passholder_type,bike_type
0,134867493,25,1/1/2020 0:16,1/1/2020 0:41,3063,34.048038,-118.253738,4491,34.04744,-118.24794,18419,30,One Way,Monthly Pass,electric
1,134867799,35,1/1/2020 0:24,1/1/2020 0:59,4285,,,4354,34.017681,-118.409081,15661,1,One Way,One Day Pass,smart
2,134868104,37,1/1/2020 0:31,1/1/2020 1:08,4344,34.014309,-118.491341,4322,34.005871,-118.429161,15848,1,One Way,Walk-up,smart
3,134868103,36,1/1/2020 0:32,1/1/2020 1:08,4344,34.014309,-118.491341,4322,34.005871,-118.429161,16053,1,One Way,Walk-up,smart
4,134868102,35,1/1/2020 0:33,1/1/2020 1:08,4344,34.014309,-118.491341,4322,34.005871,-118.429161,15694,1,One Way,Walk-up,smart


## *CHICAGO*

In [4]:
# reda the csv into a dataframe
chi_bikeshare = pd.read_csv("data/Divvy_Trips_2020_Q1.csv")

In [5]:
# check dataframe
chi_bikeshare.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,EACB19130B0CDA4A,docked_bike,2020-01-21 20:06:59,2020-01-21 20:14:30,Western Ave & Leland Ave,239,Clark St & Leland Ave,326.0,41.9665,-87.6884,41.9671,-87.6674,member
1,8FED874C809DC021,docked_bike,2020-01-30 14:22:39,2020-01-30 14:26:22,Clark St & Montrose Ave,234,Southport Ave & Irving Park Rd,318.0,41.9616,-87.666,41.9542,-87.6644,member
2,789F3C21E472CA96,docked_bike,2020-01-09 19:29:26,2020-01-09 19:32:17,Broadway & Belmont Ave,296,Wilton Ave & Belmont Ave,117.0,41.9401,-87.6455,41.9402,-87.653,member
3,C9A388DAC6ABF313,docked_bike,2020-01-06 16:17:07,2020-01-06 16:25:56,Clark St & Randolph St,51,Fairbanks Ct & Grand Ave,24.0,41.8846,-87.6319,41.8918,-87.6206,member
4,943BC3CBECCFD662,docked_bike,2020-01-30 08:37:16,2020-01-30 08:42:48,Clinton St & Lake St,66,Wells St & Hubbard St,212.0,41.8856,-87.6418,41.8899,-87.6343,member


# TRANSFORM

## *LOS ANGELES*

##### *Bike Share Data*

In [6]:
# Prefix the station IDs with a city code to make the station IDs unique.
#   Add static city_name column
la_csv_df['city_name'] = 'Los Angeles'
la_csv_df['start_station'] = 'LA-'+la_csv_df['start_station'].astype(str)
la_csv_df['end_station'] = 'LA-'+la_csv_df['end_station'].astype(str)
la_csv_df.head()

Unnamed: 0,trip_id,duration,start_time,end_time,start_station,start_lat,start_lon,end_station,end_lat,end_lon,bike_id,plan_duration,trip_route_category,passholder_type,bike_type,city_name
0,134867493,25,1/1/2020 0:16,1/1/2020 0:41,LA-3063,34.048038,-118.253738,LA-4491,34.04744,-118.24794,18419,30,One Way,Monthly Pass,electric,Los Angeles
1,134867799,35,1/1/2020 0:24,1/1/2020 0:59,LA-4285,,,LA-4354,34.017681,-118.409081,15661,1,One Way,One Day Pass,smart,Los Angeles
2,134868104,37,1/1/2020 0:31,1/1/2020 1:08,LA-4344,34.014309,-118.491341,LA-4322,34.005871,-118.429161,15848,1,One Way,Walk-up,smart,Los Angeles
3,134868103,36,1/1/2020 0:32,1/1/2020 1:08,LA-4344,34.014309,-118.491341,LA-4322,34.005871,-118.429161,16053,1,One Way,Walk-up,smart,Los Angeles
4,134868102,35,1/1/2020 0:33,1/1/2020 1:08,LA-4344,34.014309,-118.491341,LA-4322,34.005871,-118.429161,15694,1,One Way,Walk-up,smart,Los Angeles


In [7]:
# Slim-down the dataframe to just the required columns for the rideshare table
la_slim_df = la_csv_df[['trip_id','start_time','end_time','start_station','end_station','duration','passholder_type','city_name']].copy()
la_slim_df.head()

Unnamed: 0,trip_id,start_time,end_time,start_station,end_station,duration,passholder_type,city_name
0,134867493,1/1/2020 0:16,1/1/2020 0:41,LA-3063,LA-4491,25,Monthly Pass,Los Angeles
1,134867799,1/1/2020 0:24,1/1/2020 0:59,LA-4285,LA-4354,35,One Day Pass,Los Angeles
2,134868104,1/1/2020 0:31,1/1/2020 1:08,LA-4344,LA-4322,37,Walk-up,Los Angeles
3,134868103,1/1/2020 0:32,1/1/2020 1:08,LA-4344,LA-4322,36,Walk-up,Los Angeles
4,134868102,1/1/2020 0:33,1/1/2020 1:08,LA-4344,LA-4322,35,Walk-up,Los Angeles


In [8]:
# Rename columns to be consistent amongst the two data sources
la_renamed_df = la_slim_df.rename(columns={'trip_id': 'ride_id', 'start_time':'start_dt','end_time':'end_dt','start_station': 'start_station_id','end_station':'end_station_id','passholder_type':'membership_type'})
la_renamed_df.head()

Unnamed: 0,ride_id,start_dt,end_dt,start_station_id,end_station_id,duration,membership_type,city_name
0,134867493,1/1/2020 0:16,1/1/2020 0:41,LA-3063,LA-4491,25,Monthly Pass,Los Angeles
1,134867799,1/1/2020 0:24,1/1/2020 0:59,LA-4285,LA-4354,35,One Day Pass,Los Angeles
2,134868104,1/1/2020 0:31,1/1/2020 1:08,LA-4344,LA-4322,37,Walk-up,Los Angeles
3,134868103,1/1/2020 0:32,1/1/2020 1:08,LA-4344,LA-4322,36,Walk-up,Los Angeles
4,134868102,1/1/2020 0:33,1/1/2020 1:08,LA-4344,LA-4322,35,Walk-up,Los Angeles


In [9]:
# Split start start & end datetime columns into separate date & time columns
la_renamed_df['start_date'] = pd.to_datetime(la_renamed_df['start_dt']).dt.date
la_renamed_df['start_time'] = pd.to_datetime(la_renamed_df['start_dt']).dt.time
la_renamed_df['end_date'] = pd.to_datetime(la_renamed_df['end_dt']).dt.date
la_renamed_df['end_time'] = pd.to_datetime(la_renamed_df['end_dt']).dt.time
la_renamed_df.head()

Unnamed: 0,ride_id,start_dt,end_dt,start_station_id,end_station_id,duration,membership_type,city_name,start_date,start_time,end_date,end_time
0,134867493,1/1/2020 0:16,1/1/2020 0:41,LA-3063,LA-4491,25,Monthly Pass,Los Angeles,2020-01-01,00:16:00,2020-01-01,00:41:00
1,134867799,1/1/2020 0:24,1/1/2020 0:59,LA-4285,LA-4354,35,One Day Pass,Los Angeles,2020-01-01,00:24:00,2020-01-01,00:59:00
2,134868104,1/1/2020 0:31,1/1/2020 1:08,LA-4344,LA-4322,37,Walk-up,Los Angeles,2020-01-01,00:31:00,2020-01-01,01:08:00
3,134868103,1/1/2020 0:32,1/1/2020 1:08,LA-4344,LA-4322,36,Walk-up,Los Angeles,2020-01-01,00:32:00,2020-01-01,01:08:00
4,134868102,1/1/2020 0:33,1/1/2020 1:08,LA-4344,LA-4322,35,Walk-up,Los Angeles,2020-01-01,00:33:00,2020-01-01,01:08:00


In [10]:
# Drop the original start & end datetimestamp columns
la_renamed_df = la_renamed_df.drop(['start_dt'],axis=1)
la_renamed_df = la_renamed_df.drop(['end_dt'],axis=1)
la_renamed_df.head()

Unnamed: 0,ride_id,start_station_id,end_station_id,duration,membership_type,city_name,start_date,start_time,end_date,end_time
0,134867493,LA-3063,LA-4491,25,Monthly Pass,Los Angeles,2020-01-01,00:16:00,2020-01-01,00:41:00
1,134867799,LA-4285,LA-4354,35,One Day Pass,Los Angeles,2020-01-01,00:24:00,2020-01-01,00:59:00
2,134868104,LA-4344,LA-4322,37,Walk-up,Los Angeles,2020-01-01,00:31:00,2020-01-01,01:08:00
3,134868103,LA-4344,LA-4322,36,Walk-up,Los Angeles,2020-01-01,00:32:00,2020-01-01,01:08:00
4,134868102,LA-4344,LA-4322,35,Walk-up,Los Angeles,2020-01-01,00:33:00,2020-01-01,01:08:00


In [11]:
# Reorder fields to be consistent amongst the two dataframes
la_bikeshare_df = la_renamed_df[['ride_id','start_station_id','start_date','start_time',
                                 'end_station_id','end_date','end_time','duration','membership_type',
                                'city_name']]
# Add static city name to differentiate records within the same table in the database
la_bikeshare_df.head()

Unnamed: 0,ride_id,start_station_id,start_date,start_time,end_station_id,end_date,end_time,duration,membership_type,city_name
0,134867493,LA-3063,2020-01-01,00:16:00,LA-4491,2020-01-01,00:41:00,25,Monthly Pass,Los Angeles
1,134867799,LA-4285,2020-01-01,00:24:00,LA-4354,2020-01-01,00:59:00,35,One Day Pass,Los Angeles
2,134868104,LA-4344,2020-01-01,00:31:00,LA-4322,2020-01-01,01:08:00,37,Walk-up,Los Angeles
3,134868103,LA-4344,2020-01-01,00:32:00,LA-4322,2020-01-01,01:08:00,36,Walk-up,Los Angeles
4,134868102,LA-4344,2020-01-01,00:33:00,LA-4322,2020-01-01,01:08:00,35,Walk-up,Los Angeles


##### *Station Data*

In [43]:
# Slim-down the dataframe to just the required columns for the station table
la_start_stations= la_csv_df[['start_station','start_lat','start_lon']]
la_start_stations = la_start_stations.rename(columns={'start_station': 'station_id', 'start_lat': 'latitude','start_lon': 'longitude'})


la_start_stations.head()

Unnamed: 0,station_id,latitude,longitude
0,LA-3063,34.048038,-118.253738
1,LA-4285,,
2,LA-4344,34.014309,-118.491341
3,LA-4344,34.014309,-118.491341
4,LA-4344,34.014309,-118.491341


In [44]:
# Slim-down the dataframe to just the required columns for the station table
la_end_stations= la_csv_df[['end_station','end_lat','end_lon']]
la_end_stations = la_end_stations.rename(columns={'end_station': 'station_id', 'end_lat': 'latitude','end_lon': 'longitude'})
la_end_stations.head()

Unnamed: 0,station_id,latitude,longitude
0,LA-4491,34.04744,-118.24794
1,LA-4354,34.017681,-118.409081
2,LA-4322,34.005871,-118.429161
3,LA-4322,34.005871,-118.429161
4,LA-4322,34.005871,-118.429161


In [45]:
# Concatenate both the start & end dataframes into a single df
la_station_df = pd.concat([la_start_stations,la_end_stations])
# Drop any duplicates
la_station_df = la_station_df.drop_duplicates()
# Drop any NaN values
#la_station_df = la_station_df.dropna()
# Reset index to station_id
#la_station_df.set_index(['station_id'],inplace=True)
la_station_df

Unnamed: 0,station_id,latitude,longitude
0,LA-3063,34.048038,-118.253738
1,LA-4285,,
2,LA-4344,34.014309,-118.491341
5,LA-4472,34.092602,-118.280930
7,LA-3082,34.046520,-118.237411
...,...,...,...
49005,LA-4508,34.017941,-118.255417
53442,LA-4503,33.989700,-118.448883
59277,LA-4496,33.972980,-118.423943
63368,LA-4500,34.172100,-118.361816


## *CHICAGO*

In [15]:
# rename columns to have them in unisome with the other dataframe

chi_bikeshare = chi_bikeshare.rename(columns={"started_at":"start_time", "ended_at":"end_time", "member_casual":"membership_type"})

In [16]:
# append CHI- to the station IDs to prevent any duplicate IDs between two different dataframes

chi_bikeshare['start_station_id'] = 'CHI-' + chi_bikeshare['start_station_id'].astype(str) 
chi_bikeshare['end_station_id'] = 'CHI-' + chi_bikeshare['end_station_id'].astype(str) 
chi_bikeshare.head()

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,membership_type
0,EACB19130B0CDA4A,docked_bike,2020-01-21 20:06:59,2020-01-21 20:14:30,Western Ave & Leland Ave,CHI-239,Clark St & Leland Ave,CHI-326.0,41.9665,-87.6884,41.9671,-87.6674,member
1,8FED874C809DC021,docked_bike,2020-01-30 14:22:39,2020-01-30 14:26:22,Clark St & Montrose Ave,CHI-234,Southport Ave & Irving Park Rd,CHI-318.0,41.9616,-87.666,41.9542,-87.6644,member
2,789F3C21E472CA96,docked_bike,2020-01-09 19:29:26,2020-01-09 19:32:17,Broadway & Belmont Ave,CHI-296,Wilton Ave & Belmont Ave,CHI-117.0,41.9401,-87.6455,41.9402,-87.653,member
3,C9A388DAC6ABF313,docked_bike,2020-01-06 16:17:07,2020-01-06 16:25:56,Clark St & Randolph St,CHI-51,Fairbanks Ct & Grand Ave,CHI-24.0,41.8846,-87.6319,41.8918,-87.6206,member
4,943BC3CBECCFD662,docked_bike,2020-01-30 08:37:16,2020-01-30 08:42:48,Clinton St & Lake St,CHI-66,Wells St & Hubbard St,CHI-212.0,41.8856,-87.6418,41.8899,-87.6343,member


Make a table that contains all the station IDs and their latitude and longitude information

In [17]:
# extract relavant columns for the station id df
chi_station_id_s= chi_bikeshare[["start_station_id", "start_lat", "start_lng"]]
chi_station_id_e= chi_bikeshare[["end_station_id", "end_lat", "end_lng"]]

In [19]:
# rename column names so that both station df will have the same columns
chi_station_id_s =chi_station_id_s.rename(columns={"start_station_id":"station_id", "start_lat":"latitude","start_lng": "longitude"})

In [20]:
#rename column names so that both station df will have the same columns
chi_station_id_e= chi_station_id_e.rename(columns={"end_station_id":"station_id", "end_lat":"latitude","end_lng": "longitude"})

In [22]:
# combine two df in one and drop the duplicate station_ids
chi_station_id = chi_station_id_e.append(chi_station_id_s)
chi_station_id = chi_station_id.drop_duplicates(subset=['station_id'])
chi_station_id["station_id"].nunique()

1210

In [23]:
chi_station_id.head()

Unnamed: 0,station_id,latitude,longitude
0,CHI-326.0,41.9671,-87.6674
1,CHI-318.0,41.9542,-87.6644
2,CHI-117.0,41.9402,-87.653
3,CHI-24.0,41.8918,-87.6206
4,CHI-212.0,41.8899,-87.6343


Make a table that contains information about chicago bike share. The dataframe will include information about ride_id, start_station_id, start_date, start_time, end_station_id, end_date, end_time, duration , membership_type and city_name.

In [24]:
# continue with the main bikeshare dataframe.  keep relavant columns

chi_bikeshare =chi_bikeshare[["ride_id", "start_time", "end_time", "start_station_id", "end_station_id","membership_type" ]]

In [25]:
chi_bikeshare.head()

Unnamed: 0,ride_id,start_time,end_time,start_station_id,end_station_id,membership_type
0,EACB19130B0CDA4A,2020-01-21 20:06:59,2020-01-21 20:14:30,CHI-239,CHI-326.0,member
1,8FED874C809DC021,2020-01-30 14:22:39,2020-01-30 14:26:22,CHI-234,CHI-318.0,member
2,789F3C21E472CA96,2020-01-09 19:29:26,2020-01-09 19:32:17,CHI-296,CHI-117.0,member
3,C9A388DAC6ABF313,2020-01-06 16:17:07,2020-01-06 16:25:56,CHI-51,CHI-24.0,member
4,943BC3CBECCFD662,2020-01-30 08:37:16,2020-01-30 08:42:48,CHI-66,CHI-212.0,member


In [26]:
#check df
chi_bikeshare.head()

Unnamed: 0,ride_id,start_time,end_time,start_station_id,end_station_id,membership_type
0,EACB19130B0CDA4A,2020-01-21 20:06:59,2020-01-21 20:14:30,CHI-239,CHI-326.0,member
1,8FED874C809DC021,2020-01-30 14:22:39,2020-01-30 14:26:22,CHI-234,CHI-318.0,member
2,789F3C21E472CA96,2020-01-09 19:29:26,2020-01-09 19:32:17,CHI-296,CHI-117.0,member
3,C9A388DAC6ABF313,2020-01-06 16:17:07,2020-01-06 16:25:56,CHI-51,CHI-24.0,member
4,943BC3CBECCFD662,2020-01-30 08:37:16,2020-01-30 08:42:48,CHI-66,CHI-212.0,member


In [27]:
# make duplicate columns of start_time and end_time to use later

chi_bikeshare["start_tm"] = chi_bikeshare["start_time"]
chi_bikeshare["end_tm"] = chi_bikeshare["end_time"]

In [28]:
# Change the start_time and end_time columns into datetime64 in order to calculate the duration
chi_bikeshare["end_time"] = pd.to_datetime(chi_bikeshare["end_time"])
chi_bikeshare["start_time"] = pd.to_datetime(chi_bikeshare["start_time"])

chi_bikeshare.head()

Unnamed: 0,ride_id,start_time,end_time,start_station_id,end_station_id,membership_type,start_tm,end_tm
0,EACB19130B0CDA4A,2020-01-21 20:06:59,2020-01-21 20:14:30,CHI-239,CHI-326.0,member,2020-01-21 20:06:59,2020-01-21 20:14:30
1,8FED874C809DC021,2020-01-30 14:22:39,2020-01-30 14:26:22,CHI-234,CHI-318.0,member,2020-01-30 14:22:39,2020-01-30 14:26:22
2,789F3C21E472CA96,2020-01-09 19:29:26,2020-01-09 19:32:17,CHI-296,CHI-117.0,member,2020-01-09 19:29:26,2020-01-09 19:32:17
3,C9A388DAC6ABF313,2020-01-06 16:17:07,2020-01-06 16:25:56,CHI-51,CHI-24.0,member,2020-01-06 16:17:07,2020-01-06 16:25:56
4,943BC3CBECCFD662,2020-01-30 08:37:16,2020-01-30 08:42:48,CHI-66,CHI-212.0,member,2020-01-30 08:37:16,2020-01-30 08:42:48


In [29]:
# check if the transformation worked
chi_bikeshare.dtypes

ride_id                     object
start_time          datetime64[ns]
end_time            datetime64[ns]
start_station_id            object
end_station_id              object
membership_type             object
start_tm                    object
end_tm                      object
dtype: object

In [30]:
# Calculate the duration of each bike ride
chi_bikeshare["duration"] = (chi_bikeshare["end_time"] - chi_bikeshare["start_time"]).astype('timedelta64[m]')
chi_bikeshare.head()

Unnamed: 0,ride_id,start_time,end_time,start_station_id,end_station_id,membership_type,start_tm,end_tm,duration
0,EACB19130B0CDA4A,2020-01-21 20:06:59,2020-01-21 20:14:30,CHI-239,CHI-326.0,member,2020-01-21 20:06:59,2020-01-21 20:14:30,7.0
1,8FED874C809DC021,2020-01-30 14:22:39,2020-01-30 14:26:22,CHI-234,CHI-318.0,member,2020-01-30 14:22:39,2020-01-30 14:26:22,3.0
2,789F3C21E472CA96,2020-01-09 19:29:26,2020-01-09 19:32:17,CHI-296,CHI-117.0,member,2020-01-09 19:29:26,2020-01-09 19:32:17,2.0
3,C9A388DAC6ABF313,2020-01-06 16:17:07,2020-01-06 16:25:56,CHI-51,CHI-24.0,member,2020-01-06 16:17:07,2020-01-06 16:25:56,8.0
4,943BC3CBECCFD662,2020-01-30 08:37:16,2020-01-30 08:42:48,CHI-66,CHI-212.0,member,2020-01-30 08:37:16,2020-01-30 08:42:48,5.0


In [31]:
# add the city name column and fill with "Chicago"
chi_bikeshare["city_name"] = "Chicago"
chi_bikeshare.head()

Unnamed: 0,ride_id,start_time,end_time,start_station_id,end_station_id,membership_type,start_tm,end_tm,duration,city_name
0,EACB19130B0CDA4A,2020-01-21 20:06:59,2020-01-21 20:14:30,CHI-239,CHI-326.0,member,2020-01-21 20:06:59,2020-01-21 20:14:30,7.0,Chicago
1,8FED874C809DC021,2020-01-30 14:22:39,2020-01-30 14:26:22,CHI-234,CHI-318.0,member,2020-01-30 14:22:39,2020-01-30 14:26:22,3.0,Chicago
2,789F3C21E472CA96,2020-01-09 19:29:26,2020-01-09 19:32:17,CHI-296,CHI-117.0,member,2020-01-09 19:29:26,2020-01-09 19:32:17,2.0,Chicago
3,C9A388DAC6ABF313,2020-01-06 16:17:07,2020-01-06 16:25:56,CHI-51,CHI-24.0,member,2020-01-06 16:17:07,2020-01-06 16:25:56,8.0,Chicago
4,943BC3CBECCFD662,2020-01-30 08:37:16,2020-01-30 08:42:48,CHI-66,CHI-212.0,member,2020-01-30 08:37:16,2020-01-30 08:42:48,5.0,Chicago


In [32]:
a = chi_bikeshare
a.head()

Unnamed: 0,ride_id,start_time,end_time,start_station_id,end_station_id,membership_type,start_tm,end_tm,duration,city_name
0,EACB19130B0CDA4A,2020-01-21 20:06:59,2020-01-21 20:14:30,CHI-239,CHI-326.0,member,2020-01-21 20:06:59,2020-01-21 20:14:30,7.0,Chicago
1,8FED874C809DC021,2020-01-30 14:22:39,2020-01-30 14:26:22,CHI-234,CHI-318.0,member,2020-01-30 14:22:39,2020-01-30 14:26:22,3.0,Chicago
2,789F3C21E472CA96,2020-01-09 19:29:26,2020-01-09 19:32:17,CHI-296,CHI-117.0,member,2020-01-09 19:29:26,2020-01-09 19:32:17,2.0,Chicago
3,C9A388DAC6ABF313,2020-01-06 16:17:07,2020-01-06 16:25:56,CHI-51,CHI-24.0,member,2020-01-06 16:17:07,2020-01-06 16:25:56,8.0,Chicago
4,943BC3CBECCFD662,2020-01-30 08:37:16,2020-01-30 08:42:48,CHI-66,CHI-212.0,member,2020-01-30 08:37:16,2020-01-30 08:42:48,5.0,Chicago


In [33]:
# drop the start time and end time columns that are in datetype.
a= a.drop(columns=["start_time", "end_time"])
a.head()

Unnamed: 0,ride_id,start_station_id,end_station_id,membership_type,start_tm,end_tm,duration,city_name
0,EACB19130B0CDA4A,CHI-239,CHI-326.0,member,2020-01-21 20:06:59,2020-01-21 20:14:30,7.0,Chicago
1,8FED874C809DC021,CHI-234,CHI-318.0,member,2020-01-30 14:22:39,2020-01-30 14:26:22,3.0,Chicago
2,789F3C21E472CA96,CHI-296,CHI-117.0,member,2020-01-09 19:29:26,2020-01-09 19:32:17,2.0,Chicago
3,C9A388DAC6ABF313,CHI-51,CHI-24.0,member,2020-01-06 16:17:07,2020-01-06 16:25:56,8.0,Chicago
4,943BC3CBECCFD662,CHI-66,CHI-212.0,member,2020-01-30 08:37:16,2020-01-30 08:42:48,5.0,Chicago


In [34]:
# seperate the start and end tm columns into 2 columns each

a[['start_date', 'start_time']] = a['start_tm'].str.split(' ', 1, expand=True)
a[['end_date', 'end_time']] = a['end_tm'].str.split(' ', 1, expand=True)
a.head()

Unnamed: 0,ride_id,start_station_id,end_station_id,membership_type,start_tm,end_tm,duration,city_name,start_date,start_time,end_date,end_time
0,EACB19130B0CDA4A,CHI-239,CHI-326.0,member,2020-01-21 20:06:59,2020-01-21 20:14:30,7.0,Chicago,2020-01-21,20:06:59,2020-01-21,20:14:30
1,8FED874C809DC021,CHI-234,CHI-318.0,member,2020-01-30 14:22:39,2020-01-30 14:26:22,3.0,Chicago,2020-01-30,14:22:39,2020-01-30,14:26:22
2,789F3C21E472CA96,CHI-296,CHI-117.0,member,2020-01-09 19:29:26,2020-01-09 19:32:17,2.0,Chicago,2020-01-09,19:29:26,2020-01-09,19:32:17
3,C9A388DAC6ABF313,CHI-51,CHI-24.0,member,2020-01-06 16:17:07,2020-01-06 16:25:56,8.0,Chicago,2020-01-06,16:17:07,2020-01-06,16:25:56
4,943BC3CBECCFD662,CHI-66,CHI-212.0,member,2020-01-30 08:37:16,2020-01-30 08:42:48,5.0,Chicago,2020-01-30,08:37:16,2020-01-30,08:42:48


In [35]:
# keep only required columns and rearrange the order

chicago_bikeshare_df= a[["ride_id", "start_station_id", "start_date", "start_time", "end_station_id", "end_date", "end_time", "duration" , "membership_type", "city_name"]]

In [36]:
# check final dataframe
chicago_bikeshare_df.head()

Unnamed: 0,ride_id,start_station_id,start_date,start_time,end_station_id,end_date,end_time,duration,membership_type,city_name
0,EACB19130B0CDA4A,CHI-239,2020-01-21,20:06:59,CHI-326.0,2020-01-21,20:14:30,7.0,member,Chicago
1,8FED874C809DC021,CHI-234,2020-01-30,14:22:39,CHI-318.0,2020-01-30,14:26:22,3.0,member,Chicago
2,789F3C21E472CA96,CHI-296,2020-01-09,19:29:26,CHI-117.0,2020-01-09,19:32:17,2.0,member,Chicago
3,C9A388DAC6ABF313,CHI-51,2020-01-06,16:17:07,CHI-24.0,2020-01-06,16:25:56,8.0,member,Chicago
4,943BC3CBECCFD662,CHI-66,2020-01-30,08:37:16,CHI-212.0,2020-01-30,08:42:48,5.0,member,Chicago


# LOAD

In [46]:
# create a connection string to postgres
connection_string = "postgres:bootcamp@localhost:5432/bikeshare"
engine = create_engine(f'postgresql://{connection_string}')

In [47]:
# check the tables in thebikeshare database
engine.table_names()

  engine.table_names()


['station', 'bike_trip']

## *LOS ANGELES*

In [48]:
la_station_df.to_sql(name='station', con=engine, if_exists='append', index=False)

In [49]:
la_bikeshare_df.to_sql(name='bike_trip', con=engine, if_exists='append', index=False)

## *CHICAGO*

In [50]:
chi_station_id.to_sql(name='station', con=engine, if_exists='append', index=False)

In [51]:
chicago_bikeshare_df.to_sql(name='bike_trip', con=engine, if_exists='append', index=False)

# BASIC DATABASE CHECKING & VALIDATIONS

In [55]:
pd.read_sql_query('select * from bike_trip order by ride_id', con=engine).head()

Unnamed: 0,ride_id,start_station_id,start_date,start_time,end_station_id,end_date,end_time,duration,membership_type,city_name
0,000054ABAD1C067C,CHI-44,2020-03-03,18:08:41,CHI-175.0,2020-03-03,18:19:56,11.0,member,Chicago
1,0000D320A07EE21F,CHI-35,2020-02-11,06:44:18,CHI-66.0,2020-02-11,06:56:36,12.0,member,Chicago
2,0000D372025B3040,CHI-125,2020-02-10,16:19:18,CHI-91.0,2020-02-10,16:27:11,7.0,member,Chicago
3,00011A7CBF765993,CHI-224,2020-02-06,08:10:16,CHI-224.0,2020-02-06,08:10:18,0.0,member,Chicago
4,0001610640246DDB,CHI-127,2020-02-19,20:46:58,CHI-331.0,2020-02-19,20:56:35,9.0,member,Chicago


In [54]:
pd.read_sql_query('select * from station order by station_id', con=engine).head()

Unnamed: 0,station_id,latitude,longitude
0,CHI-100,41.8882,-87.6364
1,CHI-100.0,41.8882,-87.6364
2,CHI-101,41.781,-87.5761
3,CHI-101.0,41.781,-87.5761
4,CHI-102,41.7735,-87.5853
