## Import modules

In [1]:
import pandas as pd
import glob
from postgresql_tools import create_db_conn, create_table, insert_data_to_db

## Create table in local Postgres database

In [2]:
# Open the PostgreSQL connection, configured via 'config.ini'
conn = create_db_conn('config.ini')

# Create the tables by running each SQL file in turn (trips first, then stations)
for sql_file in ('create_trips_table.sql', 'create_station_table.sql'):
    create_table(sql_file, conn)

## Import CSV files and clean data

In [3]:
# Define function to clean and transform data
def clean_data(df):
    """Return the trip columns of ``df`` for rides lasting at least one minute.

    The input frame is left untouched: the parsed timestamps and the derived
    ``ride_time`` column are built on a new frame, so the function can be
    re-run on the same raw data and always gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trip data. Must contain ``ride_id``, ``rideable_type``,
        ``started_at``, ``ended_at``, ``start_station_id``,
        ``end_station_id`` and ``member_casual``. The two timestamp columns
        may be datetimes already or strings in ``%Y-%m-%d %H:%M:%S`` format.

    Returns
    -------
    pandas.DataFrame
        The eight trip columns (seven inputs plus ``ride_time`` as a
        timedelta), restricted to rows where ``ride_time`` is at least one
        minute. The original row index is preserved.
    """
    started_at = pd.to_datetime(df['started_at'], format='%Y-%m-%d %H:%M:%S')
    ended_at = pd.to_datetime(df['ended_at'], format='%Y-%m-%d %H:%M:%S')

    # assign() returns a new frame, so the caller's DataFrame is not modified.
    # Subtracting two datetime columns already yields a timedelta column; no
    # further conversion is required.
    trips = df.assign(
        started_at=started_at,
        ended_at=ended_at,
        ride_time=ended_at - started_at,
    )

    # Rows with a missing duration fail the comparison and are dropped along
    # with the sub-minute rides.
    long_enough = trips['ride_time'] >= pd.Timedelta(minutes=1)
    trip_columns = ['ride_id', 'rideable_type', 'started_at', 'ended_at',
                    'start_station_id', 'end_station_id', 'member_casual',
                    'ride_time']
    return trips.loc[long_enough, trip_columns]

# Extract station ids and coordinates from dataframe
def get_stations(df):
    """Build one row per station from the start and end columns of ``df``.

    Stations are collected separately from the ``start_*`` and ``end_*``
    columns, renamed to a common schema, stacked, and de-duplicated on
    ``(station_name, station_id)``. De-duplicating on the key rather than on
    every column matters: the first coordinates recorded for a station as a
    ride start can differ from the first recorded for it as a ride end, and
    comparing coordinates too would leave two rows for the same station.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip data containing ``{start,end}_station_name``,
        ``{start,end}_station_id``, ``{start,end}_lat`` and
        ``{start,end}_lng``.

    Returns
    -------
    pandas.DataFrame
        Columns ``station_name``, ``station_id``, ``latitude``,
        ``longitude`` with a fresh 0..n-1 index. When a station appears on
        both sides, the start-side coordinates are kept.
    """
    per_side = []
    for side in ('start', 'end'):
        name_col = f'{side}_station_name'
        id_col = f'{side}_station_id'
        lat_col = f'{side}_lat'
        lng_col = f'{side}_lng'
        side_stations = (
            df.groupby([name_col, id_col])
            # 'first' takes the first value seen in each (name, id) group.
            .agg({lat_col: 'first', lng_col: 'first'})
            .reset_index()
            .rename(columns={
                name_col: 'station_name',
                id_col: 'station_id',
                lat_col: 'latitude',
                lng_col: 'longitude',
            })
        )
        per_side.append(side_stations)

    # Start-side rows come first, so keep='first' (the default) prefers them.
    stations = (
        pd.concat(per_side)
        .drop_duplicates(subset=['station_name', 'station_id'])
        .reset_index(drop=True)
    )
    return stations

# Read CSV files into pandas dataframes and concatenate into a single dataframe
src_dir = './src/*.csv'
# glob returns matches in arbitrary order; sort them so the combined frame has
# the same row order on every run.
src_files = sorted(glob.glob(src_dir))
if not src_files:
    # Fail with an actionable message rather than the generic ValueError that
    # pd.concat raises when given an empty list.
    raise FileNotFoundError(f'No CSV files found matching {src_dir!r}')
# Parse the two timestamp columns while reading each file, then stack all
# files into one frame with a fresh 0..n-1 index.
df_list = [pd.read_csv(file, parse_dates=['started_at', 'ended_at']) for file in src_files]
df = pd.concat(df_list, ignore_index=True)

# Clean and transform data

# Order matters: stations must be extracted from the raw frame first, because
# clean_data() returns only the trip columns and those exclude the station
# names and coordinates that get_stations() groups on.
stations = get_stations(df)
df = clean_data(df)

## Write dataframes to the database

In [9]:
# Write the dataframes to the PostgreSQL database over the open connection:
# the cleaned trips frame first, then the stations frame. The third argument
# is presumably the target table name — confirm in postgresql_tools.
# NOTE(review): if the trips table declares a foreign key to stations, the
# stations insert would have to run first — check create_trips_table.sql.
insert_data_to_db(conn, df, 'trips')

insert_data_to_db(conn, stations, 'stations')

In [10]:
# Close the database connection opened at the top of the notebook.
# NOTE(review): if an earlier cell raises, this cell is never reached and the
# connection stays open until the kernel is restarted.
conn.close()