# ETL - Azure to Stage

Mit dem Skript werden die aktuellsten Daten aus der Azure-kuzu-DB geladen und in eine lokale Datenbank geschrieben. Es findet noch keine Transformation oder Filterung der Daten statt.

In [22]:
import pyodbc
import pandas as pd
from sqlalchemy import create_engine, text, inspect, types
from sqlalchemy_utils import database_exists, create_database
import sqlalchemy
import os
from dotenv import load_dotenv

In [3]:
# Load environment variables from the .env file (the relative path is resolved
# against the current working directory).
load_dotenv('../config/.env')

True

## Load data from Azure DB

In [4]:
# Read the Azure DB connection settings from the environment.
# Every setting falls back to the placeholder string "default" when it is not set.
SERVER_AZURE, DATABASE_AZURE, USERNAME_AZURE, PASSWORD_AZURE, DRIVER_AZURE = (
    os.getenv(env_key, "default")
    for env_key in (
        'SERVER_AZURE',
        'DATABASE_AZURE',
        'USERNAME_AZURE',
        'PASSWORD_AZURE',
        'DRIVER_AZURE',
    )
)

In [5]:
# Open the ODBC connection to the Azure database and create the cursor
# that is used to list the available tables.
cnxn = pyodbc.connect(
    f'DRIVER={DRIVER_AZURE};SERVER={SERVER_AZURE};DATABASE={DATABASE_AZURE}'
    f';UID={USERNAME_AZURE};PWD={PASSWORD_AZURE}'
)
cursor = cnxn.cursor()

In [6]:
# List the tables available in the Azure database
# (index 2 of each row returned by cursor.tables() is the table name).
table_names = [table_row[2] for table_row in cursor.tables(tableType='TABLE')]
print(table_names)

['Antwort', 'Bahnhof', 'BahnhofUnique', 'BahnhofZuBahnhofUnique', 'Frage', 'Frageversion', 'FrageZuGruppe', 'Gruppe', 'Ortsgewicht', 'Ortskonzept', 'Region', 'Teilnehmer', 'Umfrage', 'WegPunkt', 'WegStrecke', 'trace_xe_action_map', 'trace_xe_event_map']


### Get each table from Azure DB

In [7]:
# Fetch the complete contents of the tables that are written to the local database.
# Tables that are not exported right now remain commented out.
#Bahnhof = pd.read_sql("SELECT * FROM Bahnhof",cnxn)
#BahnhofUnique = pd.read_sql("SELECT * FROM BahnhofUnique",cnxn)
#BahnhofZuBahnhofUnique = pd.read_sql("SELECT * FROM BahnhofZuBahnhofUnique",cnxn)
Frage = pd.read_sql(sql="SELECT * FROM Frage", con=cnxn)
Frageversion = pd.read_sql(sql="SELECT * FROM Frageversion", con=cnxn)
FrageZuGruppe = pd.read_sql(sql="SELECT * FROM FrageZuGruppe", con=cnxn)
Gruppe = pd.read_sql(sql="SELECT * FROM Gruppe", con=cnxn)
#Ortsgewicht = pd.read_sql("SELECT * FROM Ortsgewicht",cnxn)
#Ortskonzept = pd.read_sql("SELECT * FROM Ortskonzept",cnxn)
#Region = pd.read_sql("SELECT * FROM Region",cnxn)
#Umfrage = pd.read_sql("SELECT * FROM Umfrage",cnxn)
#Wegpunkt = pd.read_sql("SELECT * FROM WegPunkt",cnxn)
#WegStrecke = pd.read_sql("SELECT * FROM WegStrecke",cnxn)

### Select relevant data to export

#### Kuzu Digital

In [20]:
#UmfrageName = 'kuzu_digital'

In [21]:
#q = f"""SELECT FrageCode FROM Frage WHERE UmfrageName LIKE '{UmfrageName}';""" # We want all FrageCodes here
#cols =  pd.read_sql(q, con=cnxn)
#col_list =  cols.FrageCode.values.tolist()
## add manual cols of interest
#col_list.insert(0, "file_name")
#col_list.insert(0, "UmfrageName")
#col_list.insert(0, "participant")
#col_list.insert(0, "time")

In [22]:
#cols = ', '.join(col_list)

In [23]:
#query = f"""SELECT {cols} FROM Teilnehmer WHERE UmfrageName LIKE '{UmfrageName}';"""

In [24]:
#kuzu_digital =  pd.read_sql(query , con=cnxn)

#### Kuzu Zug

In [8]:
# Survey name used to filter the Frage and Teilnehmer queries in this section.
UmfrageName = 'kuzu_zug'

In [9]:
# Look up every FrageCode that belongs to the selected survey.
q = f"""SELECT FrageCode FROM Frage WHERE UmfrageName LIKE '{UmfrageName}';"""
cols = pd.read_sql(q, con=cnxn)
# The fixed columns of interest come first, followed by all FrageCodes.
col_list = [
    "time",
    "participant",
    "UmfrageName",
    "file_name",
    *cols["FrageCode"].values.tolist(),
]

In [10]:
# Join the column names into a comma-separated string for the SELECT clause.
# Note: this rebinds `cols`, which held the FrageCode DataFrame before.
cols = ', '.join(col_list)

In [11]:
# Select the collected columns from Teilnehmer for the chosen survey.
# NOTE(review): column names and the survey name are interpolated into the SQL
# text unquoted/unescaped — confirm FrageCode values are always valid identifiers.
query = f"""SELECT {cols} FROM Teilnehmer WHERE UmfrageName LIKE '{UmfrageName}';"""

In [12]:
# Run the query over the Azure connection and keep the result as a DataFrame.
kuzu_zug =  pd.read_sql(query , con=cnxn)

#### Kuzu Bahnhof

In [48]:
#UmfrageName = 'kuzu_bahnhof'

In [49]:
#q = f"""SELECT FrageCode FROM Frage WHERE UmfrageName LIKE '{UmfrageName}';""" # We want all FrageCodes here
#cols =  pd.read_sql(q, con=cnxn)
#col_list =  cols.FrageCode.values.tolist()
## add manual cols of interest
#col_list.insert(0, "file_name")
#col_list.insert(0, "UmfrageName")
#col_list.insert(0, "participant")
#col_list.insert(0, "time")

In [50]:
#cols = ', '.join(col_list)

In [51]:
#query = f"""SELECT {cols} FROM Teilnehmer WHERE UmfrageName LIKE '{UmfrageName}';"""

In [52]:
#kuzu_bahnhof =  pd.read_sql(query , con=cnxn)

## Write data into local database

In [23]:
# Read the local DB connection settings from the environment.
# Every setting falls back to the placeholder string "default" when it is not set.
USERNAME_LOCAL, PASSWORD_LOCAL, ENDPOINT_LOCAL, DATABASE_LOCAL = (
    os.getenv(env_key, "default")
    for env_key in (
        'USERNAME_LOCAL',
        'PASSWORD_LOCAL',
        'ENDPOINT_LOCAL',
        'DATABASE_LOCAL',
    )
)

In [24]:
# Set up the SQLAlchemy engine for the local PostgreSQL database.
# No database is created here; this only builds the URL and the engine object.
conn_string = (
    f'postgresql://{USERNAME_LOCAL}:{PASSWORD_LOCAL}'
    f'@{ENDPOINT_LOCAL}/{DATABASE_LOCAL}'
)
dbEngine = sqlalchemy.create_engine(
    conn_string,
    echo=False,
    # Passed through to the DB driver; presumably a timeout in seconds — confirm.
    connect_args={'connect_timeout': 10},
)

In [25]:
# Smoke-test the engine by running a trivial query against the local database.
try:
    with dbEngine.connect() as con:
        # Wrap the statement in text(): SQLAlchemy 2.x no longer accepts a plain
        # string in Connection.execute(); text() works on 1.x as well.
        con.execute(text("SELECT 1"))
    print('engine is valid')
except Exception as e:
    # Failures are only reported (not re-raised) so the cause can be read
    # from the cell output.
    print(f'Engine invalid: {e}')

engine is valid


### Write each table in local database

In [26]:
def sqlinsert(df, tablename, db_engine, chunksize):
    """Write a DataFrame to a SQL table, replacing the table if it already exists.

    Args:
        df: DataFrame whose rows are written (the index is not stored).
        tablename: Name of the target table.
        db_engine: Engine or connection handed to ``DataFrame.to_sql``.
        chunksize: Number of rows written per batch.
    """
    df.to_sql(
        tablename,
        # Use the engine supplied by the caller; the previous version ignored
        # this parameter and always wrote through the global ``dbEngine``.
        db_engine,
        if_exists='replace',
        index=False,
        # 'multi' sends several rows per INSERT statement.
        method='multi',
        chunksize=chunksize,
    )

In [27]:
# Write the kuzu_zug export and the Frage/Frageversion/FrageZuGruppe/Gruppe
# tables to the local database: (DataFrame, target table name, rows per batch).
staging_tables = [
    (kuzu_zug, 'kuzu_zug', 1000),
    (Frage, 'Frage', 5000),
    (Frageversion, 'Frageversion', 5000),
    (FrageZuGruppe, 'FrageZuGruppe', 5000),
    (Gruppe, 'Gruppe', 5000),
]
for frame, target_table, rows_per_batch in staging_tables:
    sqlinsert(df=frame, tablename=target_table, db_engine=dbEngine, chunksize=rows_per_batch)