In [36]:
import os
import pandas as pd
import psycopg2
from psycopg2 import sql
import time  # Import the time module
import dotenv
from sqlalchemy import create_engine
from pathlib import Path

# Assuming the script is run from within the 'functions' directory
ROOT = Path().resolve().parent

# Load environment variables
dotenv.load_dotenv()

# Print the path of the .env file being accessed
env_path = os.path.join(os.getcwd(), '.env')

# Database connection parameters
DATABASE_URL = os.getenv('DATABASE_URI')

# Correct the DATABASE_URL if necessary
if DATABASE_URL.startswith('postgres://'):
    DATABASE_URL = DATABASE_URL.replace('postgres://', 'postgresql://', 1)

# Create a SQLAlchemy engine
engine = create_engine(DATABASE_URL)

# Function to clean column names
def clean_column_names(column_name):
    # Remove all text before the first colon (including the colon)
    if ':' in column_name:
        column_name = column_name.split(':', 1)[1]
    # Remove text after a semicolon (including the semicolon)
    if ';' in column_name:
        column_name = column_name.split(';', 1)[0]
    # Remove leading and trailing whitespace
    column_name = column_name.strip()
    return column_name

# Path to the directory containing subdirectories with CSV files
base_path = ROOT / 'data' / 'EW'
print(f"Base path: {base_path}")

# Columns to skip
skip_columns = {'geography', 'geography code', 'date'}  # Removed 'id' from skip_columns

# Iterate over subdirectories and files
for subdir, dirs, files in os.walk(base_path):
    print(f"Processing directory: {subdir}")
    for file in files:
        if file.endswith('.csv'):
            table_name = file[:-4]  # Assuming the file name matches the table name
            file_path = os.path.join(subdir, file)
            # Read the CSV file
            df = pd.read_csv(file_path)
            # Clean column names
            df.columns = [clean_column_names(col) for col in df.columns]

            # Check if 'id' is in DataFrame, if not, add it as the first column with NaN values
            if 'id' not in df.columns:
                df.insert(0, 'id', pd.NA)

            # Reorder columns
            fixed_columns = ['id', 'geography code', 'geography', 'date']
            new_columns_order = fixed_columns + [col for col in df.columns if col not in fixed_columns]
            df = df[new_columns_order]

            # Print the table name and columns to be inserted
            print(f"Inserting data into table: {table_name}")
            print("Columns to be inserted:")
            print(df.columns.tolist())

            # Wait for 3 seconds before making changes
            time.sleep(3)

            # Insert data into the database, replacing the existing table
            df.to_sql(table_name, engine, if_exists='replace', index=False)

# Close the database connection
engine.dispose()

Base path: /Users/cardigan/llm-datawarehouse/data/EW
Processing directory: /Users/cardigan/llm-datawarehouse/data/EW
Processing directory: /Users/cardigan/llm-datawarehouse/data/EW/census2021-ts004
Inserting data into table: census2021-ts004-llta
Columns to be inserted:
['id', 'geography code', 'geography', 'date', 'Total', 'Europe', 'Europe: United Kingdom', 'Europe: EU countries', 'Europe: EU countries: European Union EU14', 'Europe: EU countries: European Union EU8', 'Europe: EU countries: European Union EU2', 'Europe: EU countries: All other EU countries', 'Europe: Non-EU countries', 'Europe: Non-EU countries: All other non-EU countries', 'Africa', 'Middle East and Asia', 'The Americas and the Caribbean', 'Antarctica and Oceania (including Australasia) and Other', 'British Overseas']


KeyboardInterrupt: 

In [None]:
import os
import pandas as pd
import psycopg2
from psycopg2 import sql

# Database connection parameters
DATABASE_URL = os.getenv('REMOTE_DATABASE_URL')

# Function to clean column names
def clean_column_names(column_name):
    # Remove all text before the first colon (including the colon)
    if ':' in column_name:
        column_name = column_name.split(':', 1)[1]
    # Remove text after a semicolon (including the semicolon)
    if ';' in column_name:
        column_name = column_name.split(';', 1)[0]
    # Remove leading and trailing whitespace
    column_name = column_name.strip()
    return column_name

# Connect to the database
conn = psycopg2.connect(DATABASE_URL)
cur = conn.cursor()

# Path to the directory containing subdirectories with CSV files
base_path = 'data/EW'

# Iterate over subdirectories and files
for subdir, dirs, files in os.walk(base_path):
    for file in files:
        if file.endswith('.csv'):
            table_name = file[:-4]  # Assuming the file name matches the table name
            file_path = os.path.join(subdir, file)
            # Read the CSV file
            df = pd.read_csv(file_path)
            # Clean column names
            new_columns = [clean_column_names(col) for col in df.columns]
            # Rename columns in the dataframe
            df.columns = new_columns

            # Generate SQL for renaming columns in the database
            for old_col, new_col in zip(df.columns, new_columns):
                if old_col != new_col:
                    rename_column_query = sql.SQL("ALTER TABLE {table} RENAME COLUMN {old_col} TO {new_col};").format(
                        table=sql.Identifier(table_name),
                        old_col=sql.Identifier(old_col),
                        new_col=sql.Identifier(new_col)
                    )
                    cur.execute(rename_column_query)
                    conn.commit()

# Close the database connection
cur.close()
conn.close()

In [6]:
import psycopg2
from psycopg2 import sql
import bcrypt

# Database connection parameters
DATABASE_URL = "postgres://ewhpdfzafeyjhb:4a16e046ac86cb07d825eb32b4223a542c872f8df232e1dad0656916c426c2fc@ec2-52-31-2-97.eu-west-1.compute.amazonaws.com:5432/d35kfnbqn0jb3t"

# Connect to the database
conn = psycopg2.connect(DATABASE_URL, sslmode='require')
cur = conn.cursor()

# Create the 'users' table with a larger password column
create_table_query = """
CREATE TABLE IF NOT EXISTS users (
    username VARCHAR(50) PRIMARY KEY,
    password VARCHAR(60) NOT NULL
);
"""
cur.execute(create_table_query)
conn.commit()

# Hash the password
password = 'password'
hashed_password = bcrypt.hashpw(password.encode('utf-8'), bcrypt.gensalt())

# Insert entries into the 'users' table
insert_query = """
INSERT INTO users (username, password) VALUES (%s, %s)
ON CONFLICT (username) DO NOTHING;
"""
cur.execute(insert_query, ('admin', hashed_password.decode('utf-8')))
conn.commit()

# Verify the insertion
cur.execute("SELECT * FROM users;")
rows = cur.fetchall()
for row in rows:
    print(row)

# Close the cursor and connection
cur.close()
conn.close()

('admin', '$2b$12$DZY95gDJ.4VYwer9diKnfumGfksDDEKJlUn8YBtoVXg4NUYeevyei')


In [None]:
import csv

with open('data/Scotland/OA_TO_HIGHER_AREAS.csv', 'r') as file:
    reader = csv.reader(file)
    
    # Read header row to get column names
    headers = next(reader)
    
    # Initialize dictionary to store unique values for each column
    unique_values = {col: set() for col in headers}
    
    # Iterate over each row
    for row in reader:
        # Iterate over each column in the row
        for col, value in zip(headers, row):
            unique_values[col].add(value)

# Print number of unique values for each column            
for col, values in unique_values.items():
    print(f"{col}: {len(values)} unique values")