In [4]:
# Insert new data from the local data/ directory into the remote database
import os
import pandas as pd
from sqlalchemy import create_engine, Column, Integer, String, Float, inspect, Table
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Remote database connection URL.
# The URL carries the database user and password, so it is read from the
# environment instead of being stored in the notebook.
# NOTE(review): any password that was ever saved in this notebook should be
# treated as leaked and rotated.
REMOTE_DATABASE_URL = os.environ.get("REMOTE_DATABASE_URL")
if not REMOTE_DATABASE_URL:
    raise RuntimeError(
        "Set the REMOTE_DATABASE_URL environment variable to the PostgreSQL connection URL."
    )

# Directory containing the CSV files
csv_directory = "data/"

# Create a connection to the remote database
engine = create_engine(REMOTE_DATABASE_URL)
Base = declarative_base()

# Create a session
Session = sessionmaker(bind=engine)
session = Session()

# Build a declarative model class on the fly for one CSV-backed table
def create_table_class(table_name, columns):
    """Return a new ``Base`` subclass mapped to ``table_name``.

    ``columns`` maps column names to a type tag: "int" gives an ``Integer``
    column, "float" gives a ``Float`` column and any other tag gives a
    ``String`` column. An integer primary-key column named ``id`` is always
    declared, and ``extend_existing`` is set in the table arguments.
    """
    attrs = {
        "__tablename__": table_name,
        "id": Column(Integer, primary_key=True),
        "__table_args__": {"extend_existing": True},
    }

    # Columns are created in the order the mapping yields them, after ``id``.
    for name, tag in columns.items():
        sa_type = Integer if tag == "int" else Float if tag == "float" else String
        attrs[name] = Column(sa_type)

    return type(f"Table_{table_name}", (Base,), attrs)

# Iterate over the CSV files in the directory.
# Everything runs under try/except/finally: an error while loading any file
# rolls back the uncommitted work, and the session is closed either way, so a
# failed run does not leave an open, broken session behind in the kernel.
try:
    for filename in os.listdir(csv_directory):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(csv_directory, filename)
        table_name = os.path.splitext(filename)[0]

        # Check if the table already exists in the database
        inspector = inspect(engine)
        if table_name in inspector.get_table_names():
            # Check if the table is empty
            table = Table(table_name, Base.metadata, autoload_with=engine)
            table_empty = session.query(table).count() == 0

            if not table_empty:
                print(f"Table {table_name} already exists and is not empty. Skipping insertion.")
                continue

        # Read the CSV file into a pandas DataFrame
        df = pd.read_csv(file_path)

        # Replace "-" values with "0"
        # NOTE(review): the replacement happens after type inference, so a
        # column that contained "-" stays text and is stored as String below.
        # Confirm that is intended.
        df = df.replace("-", "0")

        # Determine the column types based on the DataFrame
        column_types = {}
        for column_name, column_data in df.items():
            if column_data.dtype == "int64":
                column_types[column_name] = "int"
            elif column_data.dtype == "float64":
                column_types[column_name] = "float"
            else:
                column_types[column_name] = "str"

        # Create the table class dynamically
        TableClass = create_table_class(table_name, column_types)

        # Create the table in the database (if it doesn't exist)
        Base.metadata.create_all(engine)

        # Clear the existing data from the table (if any)
        session.query(TableClass).delete()
        session.commit()

        # Insert the data from the DataFrame into the table
        for _, row in df.iterrows():
            table_instance = TableClass(**row.to_dict())
            session.add(table_instance)

        session.commit()
        print(f"Inserted data from {filename} into table {table_name}")
except Exception:
    # Discard whatever was pending for the file that failed, then re-raise so
    # the error is still visible in the notebook.
    session.rollback()
    raise
finally:
    session.close()

  Base = declarative_base()


Table census2022-uv205-ctry already exists and is not empty. Skipping insertion.
Table census2022-uv205-ca already exists and is not empty. Skipping insertion.
Table census2022-uv205-oa already exists and is not empty. Skipping insertion.
Inserted data from census2022-uv205-izn.csv into table census2022-uv205-izn


In [28]:
# Create local metadata schema
import json

# Provided JSON data, written as compact rows:
# (dataset code, display name, area suffixes in the order they are listed).
json_example = {
    code: {
        "name": title,
        "table_names": [f"census2021-{code.lower()}-{area}" for area in areas.split()],
    }
    for code, title, areas in (
        ("TS021", "Ethnic group", "msoa ltla ctry oa rgn utla lsoa"),
        ("TS022", "Ethnic group (detailed)", "rgn utla ctry ltla msoa"),
        ("TS030", "Religion", "msoa rgn ltla ctry utla lsoa oa"),
        ("TS031", "Religion (detailed)", "msoa ltla ctry rgn utla"),
    )
}

# Catalogue to be completed, written as compact rows:
# (dataset code, display name, area suffixes already known).
# An empty suffix string yields an empty "table_names" list.
json_to_update = {
    code: {
        "name": title,
        "table_names": [f"census2021-{code.lower()}-{area}" for area in areas.split()],
    }
    for code, title, areas in (
        ("TS001", "Number of usual residents in households and communal establishments", ""),
        ("TS002", "Legal partnership status", ""),
        ("TS003", "Household composition", ""),
        ("TS004", "Country of birth", ""),
        ("TS005", "Passports held", ""),
        ("TS006", "Population density", ""),
        ("TS007", "Age by single year of age", ""),
        ("TS007A", "Age by five-year age bands", ""),
        ("TS008", "Sex", ""),
        ("TS009", "Sex by single year of age", ""),
        ("TS010", "Living arrangements", ""),
        ("TS011", "Households by deprivation dimensions", ""),
        ("TS012", "Country of birth (detailed)", ""),
        ("TS013", "Passports held (detailed)", ""),
        ("TS015", "Year of arrival in UK", ""),
        ("TS016", "Length of residence", ""),
        ("TS017", "Household size", ""),
        ("TS018", "Age of arrival in the UK", ""),
        ("TS019", "Migrant Indicator", ""),
        ("TS020", "Number of non-UK short-term residents by sex", ""),
        ("TS041", "Number of Households", ""),
        ("TS021", "Ethnic group", "msoa ltla ctry oa rgn utla lsoa"),
        ("TS022", "Ethnic group (detailed)", "rgn utla ctry ltla msoa"),
        ("TS023", "Multiple ethnic group", ""),
        ("TS024", "Main language (detailed)", ""),
        ("TS025", "Household language", ""),
        ("TS026", "Multiple main languages in households", ""),
        ("TS027", "National identity - UK", ""),
        ("TS028", "National identity (detailed)", ""),
        ("TS029", "Proficiency in english", ""),
        ("TS030", "Religion", ""),
        ("TS031", "Religion (detailed)", ""),
        ("TS075", "Multi religion households", ""),
    )
}

# Function to generate table_names
def generate_table_names(key, areas=None, dataset="census2021"):
    """Build the table names for one dataset code.

    Args:
        key: Dataset code such as "TS001"; it is lower-cased in the result.
        areas: Optional iterable of area suffixes. When omitted, the seven
            default areas are used: msoa, ltla, ctry, oa, rgn, utla, lsoa.
        dataset: Prefix placed before the code; defaults to "census2021".

    Returns:
        A list of "<dataset>-<lower-cased key>-<area>" strings, one per area,
        in the order the areas were given.
    """
    if areas is None:
        areas = ["msoa", "ltla", "ctry", "oa", "rgn", "utla", "lsoa"]
    code = key.lower()
    return [f"{dataset}-{code}-{area}" for area in areas]

# Give every entry that lists no tables the generated default names.
# NOTE(review): TS030 and TS031 are blank in json_to_update, so they receive
# generated names here rather than the lists held in json_example (TS031 lists
# only five areas there) -- confirm that is intended.
for key, entry in json_to_update.items():
    if not entry["table_names"]:
        entry["table_names"] = generate_table_names(key)

# Show the completed mapping as indented JSON
print(json.dumps(json_to_update, indent=2))

{
  "TS001": {
    "name": "Number of usual residents in households and communal establishments",
    "table_names": [
      "census2021-ts001-msoa",
      "census2021-ts001-ltla",
      "census2021-ts001-ctry",
      "census2021-ts001-oa",
      "census2021-ts001-rgn",
      "census2021-ts001-utla",
      "census2021-ts001-lsoa"
    ]
  },
  "TS002": {
    "name": "Legal partnership status",
    "table_names": [
      "census2021-ts002-msoa",
      "census2021-ts002-ltla",
      "census2021-ts002-ctry",
      "census2021-ts002-oa",
      "census2021-ts002-rgn",
      "census2021-ts002-utla",
      "census2021-ts002-lsoa"
    ]
  },
  "TS003": {
    "name": "Household composition",
    "table_names": [
      "census2021-ts003-msoa",
      "census2021-ts003-ltla",
      "census2021-ts003-ctry",
      "census2021-ts003-oa",
      "census2021-ts003-rgn",
      "census2021-ts003-utla",
      "census2021-ts003-lsoa"
    ]
  },
  "TS004": {
    "name": "Country of birth",
    "table_names": 

In [29]:
# Should be adjusted to create metadata schema
import csv
import json

# Read the CSV file and extract unique geography values.
# The file is opened with newline='' as the csv module asks, so quoted fields
# that contain line breaks are parsed intact.
with open('data/SC/census2022-uv205/census2022-uv205-ca.csv', 'r', newline='') as file:
    csv_reader = csv.DictReader(file)
    unique_geographies = {row['geography'] for row in csv_reader}

# Create a dictionary with the formatted output (names sorted alphabetically)
output = {
    "Scotland": sorted(unique_geographies)
}

# Convert the dictionary to JSON format
json_output = json.dumps(output, indent=4)

# Print the formatted output
print(json_output)

{
    "Scotland": [
        "Aberdeen City",
        "Aberdeenshire",
        "Angus",
        "Argyll and Bute",
        "City of Edinburgh",
        "Clackmannanshire",
        "Dumfries and Galloway",
        "Dundee City",
        "East Ayrshire",
        "East Dunbartonshire",
        "East Lothian",
        "East Renfrewshire",
        "Falkirk",
        "Fife",
        "Glasgow City",
        "Highland",
        "Inverclyde",
        "Midlothian",
        "Moray",
        "Na h-Eileanan Siar",
        "North Ayrshire",
        "North Lanarkshire",
        "Orkney Islands",
        "Perth and Kinross",
        "Renfrewshire",
        "Scottish Borders",
        "Shetland Islands",
        "South Ayrshire",
        "South Lanarkshire",
        "Stirling",
        "West Dunbartonshire",
        "West Lothian"
    ]
}


In [33]:
import pandas as pd

# Load the temp.csv file
temp_df = pd.read_csv('data/SC/census2022-uv205/temp.csv')

# Load the census2022-uv205-ca.csv file
census_df = pd.read_csv('data/SC/census2022-uv205/census2022-uv205-ca.csv')

# The column is copied row by row (index-aligned), so it is only written when
# it exists and both files have the same number of rows. The output file and
# the success message are produced only in that case.
if 'geography code' not in temp_df.columns:
    print("The column 'geography code' does not exist in temp.csv")
elif len(temp_df) != len(census_df):
    print(
        f"temp.csv has {len(temp_df)} rows but census2022-uv205-ca.csv has "
        f"{len(census_df)} rows. Nothing was written."
    )
else:
    # Add the "geography code" column to census_df
    census_df["geography code"] = temp_df["geography code"]

    # Save the updated census_df to a new CSV file
    census_df.to_csv('data/SC/census2022-uv205/updated_census2022-uv205-ca.csv', index=False)

    print("The 'geography code' column has been copied successfully.")


The 'geography code' column has been added to the census2022-uv205-ca table in the remote database successfully.


In [None]:
import pandas as pd
from sqlalchemy import create_engine

import os

# Database connection parameters.
# The URL carries the database user and password, so it is read from the
# environment instead of being stored in the notebook.
# NOTE(review): any password that was ever saved in this notebook should be
# treated as leaked and rotated.
REMOTE_DATABASE_URL = os.environ.get("REMOTE_DATABASE_URL")
if not REMOTE_DATABASE_URL:
    raise RuntimeError(
        "Set the REMOTE_DATABASE_URL environment variable to the PostgreSQL connection URL."
    )

# Load the temp.csv file
temp_df = pd.read_csv('data/SC/census2022-uv205/temp.csv')

# Ensure the "geography code" column exists
if 'geography code' not in temp_df.columns:
    raise ValueError("The column 'geography code' does not exist in temp.csv")

# Connect to the remote PostgreSQL database; the with-block closes the
# connection even when one of the steps below raises.
engine = create_engine(REMOTE_DATABASE_URL)
with engine.connect() as connection:
    # Load the census2022-uv205-ca table from the database
    census_df = pd.read_sql_table('census2022-uv205-ca', connection)

    # Ensure the dataframes have the same number of rows
    if len(temp_df) != len(census_df):
        raise ValueError("The number of rows in temp.csv does not match the number of rows in census2022-uv205-ca")

    # Add the "geography code" column to the census dataframe.
    # NOTE(review): this pairs rows by position, which assumes the table comes
    # back in the same row order as temp.csv -- confirm, or join on a shared
    # key column instead.
    census_df['geography code'] = temp_df['geography code']

    # Update the census2022-uv205-ca table in the database.
    # if_exists='replace' drops and recreates the table from the dataframe.
    census_df.to_sql('census2022-uv205-ca', connection, if_exists='replace', index=False)

print("The 'geography code' column has been added to the census2022-uv205-ca table in the remote database successfully.")


In [5]:
import pandas as pd
from sqlalchemy import create_engine

import os

# Database connection parameters.
# The URL carries the database user and password, so it is read from the
# environment instead of being stored in the notebook.
# NOTE(review): any password that was ever saved in this notebook should be
# treated as leaked and rotated.
REMOTE_DATABASE_URL = os.environ.get("REMOTE_DATABASE_URL")
if not REMOTE_DATABASE_URL:
    raise RuntimeError(
        "Set the REMOTE_DATABASE_URL environment variable to the PostgreSQL connection URL."
    )

# Load the OA_TO_HIGHER_AREAS.csv file
oa_to_higher_areas_df = pd.read_csv('data/SC/OA_TO_HIGHER_AREAS.csv')

# Ensure the columns "OA2022" and "CA2019" exist
if 'OA2022' not in oa_to_higher_areas_df.columns or 'CA2019' not in oa_to_higher_areas_df.columns:
    raise ValueError("The required columns 'OA2022' and 'CA2019' do not exist in OA_TO_HIGHER_AREAS.csv")

# Rename the columns to match the desired names
oa_to_higher_areas_df = oa_to_higher_areas_df.rename(columns={'OA2022': 'oa', 'CA2019': 'ca'})
print(oa_to_higher_areas_df.head())

# Only these two columns are appended to the mapping table
new_rows_df = oa_to_higher_areas_df[['oa', 'ca']]

# Connect to the remote PostgreSQL database; the with-block closes the
# connection even when one of the steps below raises.
engine = create_engine(REMOTE_DATABASE_URL)
with engine.connect() as connection:
    # Load the geography_mappings table from the database
    geography_mappings_df = pd.read_sql_table('geography_mappings', connection)

    # Skip output areas that are already in the table, so running this cell
    # again does not append the same rows a second time.
    # NOTE(review): assumes an 'oa' value already present means the row was
    # added by an earlier run of this cell -- confirm.
    if 'oa' in geography_mappings_df.columns:
        already_present = new_rows_df['oa'].isin(geography_mappings_df['oa'])
        new_rows_df = new_rows_df[~already_present]

    # Append the new rows to geography_mappings_df
    appended_df = pd.concat([geography_mappings_df, new_rows_df], ignore_index=True)

    # Update the geography_mappings table in the database with the new rows appended
    appended_df.to_sql('geography_mappings', connection, if_exists='replace', index=False)

print("The new rows from 'oa' and 'ca' have been added to the geography_mappings table in the remote database successfully.")


          oa         ca     EW2022     DZ2011     CP1930    SPC2021  \
0  S00135307  S12000033  S13002835  S01006755  S35000272  S16000075   
1  S00135308  S12000033  S13002835  S01006755  S35000272  S16000075   
2  S00135309  S12000033  S13002836  S01006788  S35000669  S16000075   
3  S00135310  S12000033  S13002835  S01006755  S35000272  S16000075   
4  S00135311  S12000033  S13002835  S01006762  S35000669  S16000075   

     HBA2019  CSETT2022   CLOC2022   UKPC2005  ... NP2010     IZ2011 ITL2_21  \
0  S08000020        NaN        NaN  S14000037  ...    NaN  S02001278    TLM5   
1  S08000020        NaN        NaN  S14000037  ...    NaN  S02001278    TLM5   
2  S08000020        NaN        NaN  S14000037  ...    NaN  S02001284    TLM5   
3  S08000020  S53000002  S52000220  S14000037  ...    NaN  S02001278    TLM5   
4  S08000020        NaN        NaN  S14000037  ...    NaN  S02001280    TLM5   

  ITL3_21 SIMD2020   MasterPC Easting  Northing     Hect       SqKM  
0   TLM50     4563   A

In [6]:
import pandas as pd
from sqlalchemy import create_engine

import os

# Database connection parameters.
# The URL carries the database user and password, so it is read from the
# environment instead of being stored in the notebook.
# NOTE(review): any password that was ever saved in this notebook should be
# treated as leaked and rotated.
REMOTE_DATABASE_URL = os.environ.get("REMOTE_DATABASE_URL")
if not REMOTE_DATABASE_URL:
    raise RuntimeError(
        "Set the REMOTE_DATABASE_URL environment variable to the PostgreSQL connection URL."
    )

# Connect to the remote PostgreSQL database; the with-block closes the
# connection even when one of the steps below raises.
engine = create_engine(REMOTE_DATABASE_URL)
with engine.connect() as connection:
    # Load the census2022-uv205-ca table from the database
    census_df = pd.read_sql_table('census2022-uv205-ca', connection)

    # Load the geography_mappings table from the database
    geography_mappings_df = pd.read_sql_table('geography_mappings', connection)

    # Keep one name per non-null code so the left join below can only add a
    # column and never multiply rows (pandas also matches NaN keys to NaN).
    ca_names_df = (
        census_df[['geography code', 'geography']]
        .dropna(subset=['geography code'])
        .drop_duplicates(subset='geography code')
    )

    # Remove a 'ca_nm' column left by an earlier run, so re-running the cell
    # does not produce two columns with the same name.
    # NOTE(review): assumes 'ca_nm' is only ever written by this cell.
    geography_mappings_df = geography_mappings_df.drop(columns=['ca_nm'], errors='ignore')

    # Merge on "ca" = "geography code", rename "geography" to "ca_nm" and drop
    # the now-redundant "geography code" column.
    merged_df = (
        geography_mappings_df
        .merge(ca_names_df, how='left', left_on='ca', right_on='geography code')
        .rename(columns={'geography': 'ca_nm'})
        .drop(columns=['geography code'])
    )

    # Update the geography_mappings table in the database
    merged_df.to_sql('geography_mappings', connection, if_exists='replace', index=False)

print("The 'ca_nm' column has been added to the geography_mappings table in the remote database successfully.")


The 'ca_nm' column has been added to the geography_mappings table in the remote database successfully.


In [11]:
import pandas as pd
from sqlalchemy import create_engine

import os

# Database connection parameters.
# The URL carries the database user and password, so it is read from the
# environment instead of being stored in the notebook.
# NOTE(review): any password that was ever saved in this notebook should be
# treated as leaked and rotated.
REMOTE_DATABASE_URL = os.environ.get("REMOTE_DATABASE_URL")
if not REMOTE_DATABASE_URL:
    raise RuntimeError(
        "Set the REMOTE_DATABASE_URL environment variable to the PostgreSQL connection URL."
    )

# Load the local files
data_zone_df = pd.read_csv('data/SC/DataZone2011lookup.csv', encoding='ISO-8859-1')
oa_to_higher_areas_df = pd.read_csv('data/SC/OA_TO_HIGHER_AREAS.csv')

# One (code, name) row per data zone, so the joins below cannot multiply rows
dz_lookup_df = (
    data_zone_df[['DZ2011_Code', 'DZ2011_Name']]
    .dropna(subset=['DZ2011_Code'])
    .drop_duplicates(subset='DZ2011_Code')
)

# Merge the local files to create a mapping with one row per output area
mapping_df = (
    oa_to_higher_areas_df[['OA2022', 'DZ2011']]
    .merge(dz_lookup_df, how='left', left_on='DZ2011', right_on='DZ2011_Code')
    [['OA2022', 'DZ2011_Code', 'DZ2011_Name']]
    .dropna(subset=['OA2022'])
    .drop_duplicates(subset='OA2022')
)

# Connect to the remote PostgreSQL database; the with-block closes the
# connection even when one of the steps below raises.
engine = create_engine(REMOTE_DATABASE_URL)
with engine.connect() as connection:
    # Load the geography_mappings table from the database
    geography_mappings_df = pd.read_sql_table('geography_mappings', connection)

    # Remove 'dz'/'dz_nm' columns left by an earlier run, so re-running the
    # cell does not produce duplicate column names.
    # NOTE(review): assumes these two columns are only ever written by this cell.
    geography_mappings_df = geography_mappings_df.drop(columns=['dz', 'dz_nm'], errors='ignore')

    # Merge on 'oa', rename the new columns and drop the helper 'OA2022' column
    geography_mappings_df = (
        geography_mappings_df
        .merge(mapping_df, how='left', left_on='oa', right_on='OA2022')
        .rename(columns={'DZ2011_Code': 'dz', 'DZ2011_Name': 'dz_nm'})
        .drop(columns=['OA2022'])
    )

    # Update the geography_mappings table in the database
    geography_mappings_df.to_sql('geography_mappings', connection, if_exists='replace', index=False)

print("The 'dz' and 'dz_nm' columns have been added to the geography_mappings table in the remote database successfully.")


The 'dz' and 'dz_nm' columns have been added to the geography_mappings table in the remote database successfully.


In [2]:
import pandas as pd
from sqlalchemy import create_engine, text

import os

# Database connection parameters.
# The URL carries the database user and password, so it is read from the
# environment instead of being stored in the notebook.
# NOTE(review): any password that was ever saved in this notebook should be
# treated as leaked and rotated.
REMOTE_DATABASE_URL = os.environ.get("REMOTE_DATABASE_URL")
if not REMOTE_DATABASE_URL:
    raise RuntimeError(
        "Set the REMOTE_DATABASE_URL environment variable to the PostgreSQL connection URL."
    )

# Load the local files with specified encoding
data_zone_df = pd.read_csv('data/SC/DataZone2011lookup.csv', encoding='ISO-8859-1')
oa_to_higher_areas_df = pd.read_csv('data/SC/OA_TO_HIGHER_AREAS.csv')

# Reduce the lookup to one (code, name) row per intermediate zone.
# The lookup file presumably has one row per data zone, so the same
# IZ2011_Code would appear on several rows; joining on it without this step
# repeats every output area once per data zone and, in turn, multiplies the
# rows of geography_mappings.
iz_lookup_df = (
    data_zone_df[['IZ2011_Code', 'IZ2011_Name']]
    .dropna(subset=['IZ2011_Code'])
    .drop_duplicates(subset='IZ2011_Code')
)

# Merge the local files to create a mapping with one row per output area,
# keeping only the necessary columns
mapping_df = (
    oa_to_higher_areas_df[['OA2022', 'IZ2011']]
    .merge(iz_lookup_df, how='left', left_on='IZ2011', right_on='IZ2011_Code')
    [['OA2022', 'IZ2011_Code', 'IZ2011_Name']]
    .dropna(subset=['OA2022'])
    .drop_duplicates(subset='OA2022')
)

# Connect to the remote PostgreSQL database; the with-block closes the
# connection even when one of the steps below raises.
engine = create_engine(REMOTE_DATABASE_URL)
with engine.connect() as connection:
    # Load the geography_mappings table from the database
    geography_mappings_df = pd.read_sql_table('geography_mappings', connection)

    # Remove 'iz'/'iz_nm' columns left by an earlier run, so re-running the
    # cell does not produce duplicate column names.
    # NOTE(review): assumes these two columns are only ever written by this cell.
    geography_mappings_df = geography_mappings_df.drop(columns=['iz', 'iz_nm'], errors='ignore')

    # Merge on 'oa', rename the new columns and drop the helper 'OA2022' column
    geography_mappings_df = (
        geography_mappings_df
        .merge(mapping_df, how='left', left_on='oa', right_on='OA2022')
        .rename(columns={'IZ2011_Code': 'iz', 'IZ2011_Name': 'iz_nm'})
        .drop(columns=['OA2022'])
    )

    # Update the geography_mappings table in the database
    geography_mappings_df.to_sql('geography_mappings', connection, if_exists='replace', index=False)

print("The 'iz' and 'iz_nm' columns have been added to the geography_mappings table in the remote database successfully.")


The 'iz' and 'iz_nm' columns have been added to the geography_mappings table in the remote database successfully.
