In [681]:
import importlib
import os
import re

import numpy as np
import pandas as pd
import psycopg2
from psycopg2 import sql

import postgres_creds as cred

# Since changes were made in cred and our ipynb can't see new changes, we use Importlib to reload the module
importlib.reload(cred)

<module 'postgres_creds' from '/Users/chewynguyen/Desktop/csv_postgres_connector/postgres_creds.py'>

In [682]:
# Open the PostgreSQL connection using the settings kept in postgres_creds,
# then grab a cursor that the cells below use to run their statements.
connection_settings = {
    'host': cred.host,
    'user': cred.user,
    'password': cred.password,
    'database': cred.database,
}
conn = psycopg2.connect(**connection_settings)
cursor = conn.cursor()

Testing Strategy

1. create df_10 dataframe
2. build the drop table, create table (with pandas dtypes replaced by SQL types), and insert statement queries
3. create insert into function
4. Show table in DB function

Next Goal:

1. Automate code: CSVs can be imported without manually changing the code
2. Can upload multiple csv's at the same time

Next Steps:

0. Create a new folder in the current directory; if it already exists, do nothing
1. Add csv's in current directory to a list
2. Check cwd, if csv in cwd, move csv to new folder
3. Create dictionary with csv name as key and df as value
4. Look inside new folder and clean the csv names and their column names

In [683]:
# 0. Create the folder (relative to the current directory) that receives CSVs
#    once they have been processed; do nothing if it already exists.
new_directory = "imported_csv"
# exist_ok=True ignores only the "directory already exists" case. Any other
# failure (e.g. permission denied, or a regular file already using this name)
# is raised here instead of being silently swallowed and surfacing later as a
# confusing error when the CSVs are moved.
os.makedirs(new_directory, exist_ok=True)


In [684]:
# 1. Collect the CSV files that sit in the current directory.
# re.sub(r'[^\w\.]', '_', name) replaces every character that is not a word
# character or a dot with an underscore; the result is also lower-cased.
csv_files = []
working_dir = os.getcwd()
# sorted() makes the processing order reproducible (os.listdir order is arbitrary).
for entry in sorted(os.listdir(working_dir)):
    old_csv_name = os.path.join(working_dir, entry)
    # Match on the actual extension, case-insensitively, and skip directories.
    # A plain substring test would also pick up names such as 'data.csv.bak'
    # and would miss 'DATA.CSV'.
    if not entry.lower().endswith('.csv') or not os.path.isfile(old_csv_name):
        continue

    clean_name = re.sub(r'[^\w\.]', '_', entry).lower()

    # 2. Move the CSV into the processed folder unless a file with the same
    #    cleaned name is already there (the source file is then left in place).
    new_csv_name = os.path.join(working_dir, new_directory, clean_name)
    if os.path.isfile(new_csv_name):
        print("The file already exists")
    else:
        # Rename the file
        os.rename(old_csv_name, new_csv_name)

    # Two different source names can sanitise to the same cleaned name; list it
    # once so the same table is not loaded twice further down.
    if clean_name not in csv_files:
        csv_files.append(clean_name)

print(csv_files)

['cities_test.csv', 'countriestest_10.csv']


In [685]:
# Load every collected CSV from the processed folder.
# 3. Build a dictionary mapping csv file name -> DataFrame; the first CSV
#    column is used as the DataFrame index (index_col = 0).
processed_dir = os.getcwd() + '/' + new_directory
df = {
    file_name: pd.read_csv(str(processed_dir + '/' + file_name), index_col = 0)
    for file_name in csv_files
}
print(csv_files)


['cities_test.csv', 'countriestest_10.csv']


In [686]:
# 4. File names are already clean; normalise the column names, then rebuild
#    and load one database table per CSV.

# pandas dtype name -> PostgreSQL column type. Defined once because it does not
# change between CSVs.
PANDAS_TO_SQL_TYPES = {
    'timedelta64[ns]': 'varchar(255)',
    'object': 'varchar(255)',
    'float64': 'float',
    'bool': 'boolean',
    # int64 values can exceed PostgreSQL's 32-bit "int", so use the 64-bit type.
    'int64': 'bigint',
    # Full dtype name including the unit; a bare 'datetime64' key does not match it.
    'datetime64[ns]': 'timestamp',
}


def _sql_type(dtype):
    """Return the SQL column type for a pandas dtype.

    Dtypes without an entry in PANDAS_TO_SQL_TYPES fall back to varchar(255)
    so the raw pandas dtype name never ends up inside the CREATE TABLE text.
    """
    return PANDAS_TO_SQL_TYPES.get(str(dtype), 'varchar(255)')


def _to_db_value(value):
    """Convert a single DataFrame cell into a value psycopg2 can adapt.

    Missing values (NaN / NaT / None) become None so they are stored as NULL,
    and numpy scalars are unwrapped into the equivalent plain Python objects.
    """
    if pd.isna(value):
        return None
    if isinstance(value, np.generic):
        return value.item()
    return value


for key, dataframe in df.items():
    # Clean the column names in place so the frames stored in `df` keep the
    # cleaned names; str() guards against non-string headers.
    dataframe.columns = [re.sub(r'[^\w\.]', '_', str(column_name)).lower() for column_name in dataframe.columns]
    column_names = list(dataframe.columns)

    # Creates DB table name: everything before the first dot of the file name.
    db_table_name = key.split('.')[0]

    # Compose identifiers with psycopg2.sql so they are always quoted: names
    # that are reserved words, start with a digit or contain a dot would break
    # plain string concatenation.
    table_identifier = sql.Identifier(db_table_name)
    column_identifiers = [sql.Identifier(name) for name in column_names]

    # Table schema: "<column> <sql type>" pairs.
    column_dtype = sql.SQL(', ').join(
        sql.SQL('{} {}').format(identifier, sql.SQL(_sql_type(dtype)))
        for identifier, dtype in zip(column_identifiers, dataframe.dtypes)
    )

    # 2. create queries
    drop_table = sql.SQL('DROP TABLE IF EXISTS {}').format(table_identifier)
    create_table = sql.SQL('CREATE TABLE {} ({})').format(table_identifier, column_dtype)
    insert_into_table = sql.SQL('INSERT INTO {} ({}) VALUES ({})').format(
        table_identifier,
        sql.SQL(', ').join(column_identifiers),
        sql.SQL(', ').join(sql.Placeholder() * len(column_names)),
    )
    select_table = sql.SQL('SELECT * FROM {}').format(table_identifier)

    # One tuple per DataFrame row, in column order (the index is not inserted).
    rows = [
        tuple(_to_db_value(value) for value in row)
        for row in dataframe.itertuples(index=False, name=None)
    ]

    try:
        cursor.execute(drop_table)
        cursor.execute(create_table)
        # 3. insert all rows
        if rows and column_names:
            cursor.executemany(insert_into_table, rows)
        conn.commit()
    except Exception:
        # A failed statement leaves the connection in an aborted transaction;
        # roll back so the connection stays usable, then surface the error.
        conn.rollback()
        raise

    # 4. Show table in DB
    cursor.execute(select_table)
    for each in cursor:
        print(each)



('New York', 'NY', 'New York', 36061, 'New York', 40.6943, -73.9249, 18713220, 10715, 'polygon', False, True, 'America/New_York', 1, 1840034016)
('Los Angeles', 'CA', 'California', 6037, 'Los Angeles', 34.1139, -118.4068, 12750807, 3276, 'polygon', False, True, 'America/Los_Angeles', 1, 1840020491)
('Chicago', 'IL', 'Illinois', 17031, 'Cook', 41.8373, -87.6862, 8604203, 4574, 'polygon', False, True, 'America/Chicago', 1, 1840000494)
('Miami', 'FL', 'Florida', 12086, 'Miami-Dade', 25.7839, -80.2102, 6445545, 5019, 'polygon', False, True, 'America/New_York', 1, 1840015149)
('Dallas', 'TX', 'Texas', 48113, 'Dallas', 32.7936, -96.7662, 5743938, 1526, 'polygon', False, True, 'America/Chicago', 1, 1840019440)
('Philadelphia', 'PA', 'Pennsylvania', 42101, 'Philadelphia', 40.0077, -75.1339, 5649300, 4554, 'polygon', False, True, 'America/New_York', 1, 1840000673)
('Houston', 'TX', 'Texas', 48201, 'Harris', 29.7863, -95.3889, 5464251, 1399, 'polygon', False, True, 'America/Chicago', 1, 18400209