### libs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

In [None]:
import sqlalchemy as alc
import pandas as pd
import os


# add run method to engine
def run(self: alc.engine.Engine, sql: str) -> pd.DataFrame | None:
    """Execute a raw SQL string on this engine.

    The statement is wrapped with ``alc.text`` and executed on the connection
    yielded by ``self.begin()``.

    Args:
        sql: SQL text to execute.

    Returns:
        A DataFrame built from all fetched rows, with the result's keys as
        column names, when the statement returns rows; otherwise ``None``.
    """
    with self.begin() as connection:
        result = connection.execute(alc.text(sql))
        # Statements without a result set (DDL, DML) have nothing to fetch.
        if not result.returns_rows:
            return None
        return pd.DataFrame(result.all(), columns=result.keys())


# Attach the helper so every Engine instance exposes ``engine.run(sql)``.
alc.engine.Engine.run = run

In [209]:
def tera_random_normal(mean: float, stddev: float) -> str:
    """Build a SQL expression that draws a normally distributed value.

    The expression follows the Box-Muller form
    ``mean + stddev * sqrt(-2 * ln(U1)) * cos(2 * pi * U2)``, where each
    uniform draw is ``RANDOM(1, 2147483647) / 2147483647.0``.

    Args:
        mean: Mean of the target distribution, interpolated into the SQL.
        stddev: Standard deviation of the target distribution, interpolated
            into the SQL.

    Returns:
        The SQL expression as a string, wrapped in parentheses so it can be
        embedded next to any operator (e.g. ``2 * <expr>`` or ``- <expr>``)
        without the surrounding operator binding only to ``mean``.
    """
    # The lower bound of 1 keeps the ratio strictly positive, so ln() never
    # sees zero. Parenthesised so it stays one operand wherever it is spliced.
    uniform_01 = "(CAST(RANDOM(1, 2147483647) AS FLOAT) / 2147483647.0)"
    # NOTE(review): the same text is embedded twice; Box-Muller needs two
    # independent draws, which presumably holds because each RANDOM call is
    # evaluated separately - confirm against the database documentation.
    box_muller_z1 = (
        f"{mean} + {stddev} * sqrt(-2 * ln({uniform_01})) "
        f"* cos(2 * 3.14159 * {uniform_01})"
    )
    return f"({box_muller_z1})"

def make_bin_expression(col: str, bins: list[float], tabs: int = 0) -> str:
    """Build the ``when ... then ...`` arms of a SQL CASE that bins ``col``.

    Intervals are left-closed and right-open. Besides one arm per pair of
    consecutive edges, an open-ended arm is emitted below the first edge
    (``[-Inf, first)``) and at or above the last edge (``[last,Inf)``).
    The caller supplies the surrounding ``case`` / ``end`` keywords.

    Args:
        col: Column name or SQL expression to bin.
        bins: Bin edges, in ascending order. A single edge yields just the
            two open-ended arms.
        tabs: Number of four-space indents put in front of every arm except
            the first; the first arm is left unindented because the caller's
            template already positions it.

    Returns:
        The arms joined by newlines, with no trailing newline.

    Raises:
        ValueError: If ``bins`` is empty.
    """
    if len(bins) == 0:
        raise ValueError("bins must contain at least one edge")

    indent = "    " * tabs
    first_edge, last_edge = bins[0], bins[-1]

    arms = [f"when {col} < {first_edge} then '[-Inf, {first_edge})'"]
    arms.extend(
        f"{indent}when {col} >= {lower} and {col} < {upper} then '[{lower},{upper})'"
        for lower, upper in zip(bins, bins[1:])
    )
    arms.append(f"{indent}when {col} >= {last_edge} then '[{last_edge},Inf)'")
    # No ELSE arm is emitted, so values matching no arm (e.g. NULL) are left
    # to the CASE expression's default.
    return "\n".join(arms)

### spec (database connection and version)

In [None]:
# Build the engine URL; the password is read from the `password` environment
# variable rather than being written into the notebook.
# NOTE(review): a password containing URL-reserved characters such as '@' or
# '/' would likely need escaping before being embedded here - confirm.
connection_string = f"teradatasql://demo_user:{os.environ['password']}@test-l36lujzkc0420a7n.env.clearscape.teradata.com"
eng = alc.create_engine(connection_string)
# Smoke test: fetch the database version row to prove the connection works.
eng.run("select * from dbc.dbcinfo where infokey = 'Version'")

Unnamed: 0,InfoKey,InfoData
0,VERSION,20.00.24.60


### example analytical task

In [210]:
# Source table name; the derived table built below is named `{tbl}_2`.
tbl = 'titanic'
# Bin edges shared by the `fare` and `fare_adj` binning CASE expressions.
bins = [0, 10, 20, 30, 100]

# CREATE TABLE ... AS statement assembled from two CTEs:
#   add_id   - appends a row_number() `id`; the window orders by a constant
#              sub-select, so the numbering order is presumably arbitrary.
#   add_vars - appends the average age per `who`, a running row count per
#              `who` ordered by fare then id, the fare bin, a noisy fare
#              (`fare` plus the term from tera_random_normal(0,10), floored at
#              0 via greatest) and the bin of that noisy fare.
# The `where fare > 5` filter sits in the outer select, i.e. after the window
# columns have been computed in add_vars.
qry = f"""
create table {tbl}_2 as (
    with add_id as (
        select
            a.*,
            row_number() over (order by (select null)) as id
        from {tbl} a
    ),add_vars as (
        select
            a.*,
            avg(age) over (partition by who) as avg_age_in_who,
            count(1) over (partition by who order by fare, id asc rows between unbounded preceding and current row) as cumcount_in_who_by_fare,
            case
                {make_bin_expression('fare', bins = bins, tabs = 4)}
                end as fare_bin,
            greatest(
                fare + {tera_random_normal(0,10)},
                0
                ) as fare_adj,
            case
                {make_bin_expression('fare_adj', bins=bins, tabs = 4)}
                end as fare_adj_bin
        from add_id a
    )
    select
        id,
        who,
        survived,
        age,
        fare,
        fare_bin,
        fare_adj,
        fare_adj_bin,
        avg_age_in_who,
        cumcount_in_who_by_fare
    from add_vars a
    where fare > 5
) with data;
"""
# Show the fully rendered SQL so the generated CASE arms can be inspected.
print(qry)


create table titanic_2 as (
    with add_id as (
        select
            a.*,
            row_number() over (order by (select null)) as id
        from titanic a
    ),add_vars as (
        select
            a.*,
            avg(age) over (partition by who) as avg_age_in_who,
            count(1) over (partition by who order by fare, id asc rows between unbounded preceding and current row) as cumcount_in_who_by_fare,
            case
                when fare < 0 then '[-Inf, 0)'
                when fare >= 0 and fare < 10 then '[0,10)'
                when fare >= 10 and fare < 20 then '[10,20)'
                when fare >= 20 and fare < 30 then '[20,30)'
                when fare >= 30 and fare < 100 then '[30,100)'
                when fare >= 100 then '[100,Inf)'
                end as fare_bin,
            greatest(
                fare + 0 + 10 * sqrt(-2 * ln(CAST(RANDOM(1, 2147483647) AS FLOAT) / 2147483647.0)) * cos(2 * 3.14159 * CAST(RANDOM(1, 2147483647) AS FLOAT) / 214

In [207]:
# Drop any previous copy of the derived table so the CREATE below can run.
try:
    _ = eng.run(f"drop table {tbl}_2")
except Exception as exc:
    # Best effort: the drop is expected to fail when the table does not exist
    # yet. Catching Exception (not a bare except) lets KeyboardInterrupt and
    # SystemExit propagate, and the message keeps the skip visible.
    print(f"drop table {tbl}_2 skipped ({type(exc).__name__})")

# Build the derived table from the SQL assembled in `qry`.
_ = eng.run(qry)

In [208]:
# Read the derived table back into pandas and print its first rows as plain
# text (to_string keeps every column visible).
df = eng.run(f"select * from {tbl}_2")
print(df.head().to_string())

    id    who  survived   age     fare fare_bin   fare_adj fare_adj_bin  avg_age_in_who  cumcount_in_who_by_fare
0  767  child         1  15.0   7.2250   [0,10)   9.247702       [0,10)        6.369518                        1
1  102    man         0  38.0   7.8958   [0,10)  13.323215      [10,20)       33.173123                      148
2  553    man         0  28.0  13.5000  [10,20)   5.557223       [0,10)       33.173123                      325
3  555  woman         0   NaN   8.0500   [0,10)  11.587004      [10,20)       32.000000                       47
4  344  child         0  15.0   7.2292   [0,10)   4.881860       [0,10)        6.369518                        2


In [203]:
# Sanity check for `avg_age_in_who`: per `who`, compare the stored window
# average with a plain GROUP BY average over the derived table.
# The comparison uses abs() so the tolerance is two-sided; without it any
# group whose recomputed average exceeds the stored one would be flagged
# 'yes' regardless of the size of the gap.
qry = f"""
with x as (
    select
        who,
        count(1) as n,
        avg(age) as age,
        max(avg_age_in_who) as max_avg_age_in_who
    from {tbl}_2
    group by 1
)
select
    x.*,
    case when abs(max_avg_age_in_who - age) < 0.01 then 'yes' else 'no' end as equal
from x
"""

df = eng.run(qry)
df

Unnamed: 0,who,n,age,max_avg_age_in_who,equal
0,woman,271,32.0,32.0,yes
1,child,83,6.369518,6.369518,yes
2,man,520,33.17203,33.173123,yes


In [204]:
# Sanity check for `cumcount_in_who_by_fare`: list the rows with
# who = 'child' ordered by fare then id, so the stored counter can be read
# top to bottom.
qry = f"""
with x as (
    select
        id,
        who,
        fare,
        cumcount_in_who_by_fare
    from {tbl}_2
    where who = 'child'
)
select
    x.*
from x
order by fare asc, id asc
"""

df = eng.run(qry)
df

Unnamed: 0,id,who,fare,cumcount_in_who_by_fare
0,832,child,7.2250,1
1,344,child,7.2292,2
2,737,child,7.2292,3
3,11,child,7.8542,4
4,18,child,8.0292,5
...,...,...,...,...
78,427,child,120.0000,79
79,759,child,120.0000,80
80,289,child,151.5500,81
81,297,child,151.5500,82


In [205]:
# Sanity check for the binning: list the rows whose bin label differs between
# the original fare and the noisy fare (`fare_adj`).
qry = f"""
with x as (
    select
        fare,
        fare_bin,
        fare_adj,
        fare_adj_bin
    from {tbl}_2
    where fare_bin <> fare_adj_bin
)
select
    x.*
from x
"""

df = eng.run(qry)
df

Unnamed: 0,fare,fare_bin,fare_adj,fare_adj_bin
0,7.2250,"[0,10)",25.352366,"[20,30)"
1,8.0500,"[0,10)",11.722661,"[10,20)"
2,24.0000,"[20,30)",4.823176,"[0,10)"
3,13.0000,"[10,20)",21.102933,"[20,30)"
4,7.8542,"[0,10)",10.131328,"[10,20)"
...,...,...,...,...
361,7.9250,"[0,10)",15.808632,"[10,20)"
362,7.9250,"[0,10)",11.440799,"[10,20)"
363,7.9250,"[0,10)",28.404336,"[20,30)"
364,8.0500,"[0,10)",13.362860,"[10,20)"


### make table

In [None]:
# import seaborn as sns

# df = sns.load_dataset('titanic')
# df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# qry = f"""
# create table titanic(
#     survived INT,
#     pclass INT,
#     sex VARCHAR(10),
#     age FLOAT,
#     sibsp INT,
#     parch INT,
#     fare FLOAT,
#     embarked VARCHAR(10),
#     class_ VARCHAR(10),
#     who VARCHAR(10),
#     adult_male BYTEINT,
#     deck VARCHAR(10),
#     embark_town VARCHAR(20),
#     alive VARCHAR(10),
#     alone BYTEINT
# );
# """

# _ = eng.run(qry)

In [None]:
# df.rename(columns={'class': 'class_'}).to_sql('titanic', eng, if_exists='append', index=False)

1

In [None]:
# eng.run("select count(*) as n from titanic").iloc[0,0]

np.int64(891)

In [None]:
# res = eng.run("select top 5 * from titanic")
# res.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class_,who,adult_male,deck,embark_town,alive,alone
0,1,3,female,26.0,0,0,7.925,S,Third,woman,0,,Southampton,yes,1
1,0,3,male,35.0,0,0,8.05,S,Third,man,1,,Southampton,no,1
2,1,1,female,35.0,1,0,53.1,S,First,woman,0,C,Southampton,yes,0
3,1,1,female,38.0,1,0,71.2833,C,First,woman,0,C,Cherbourg,yes,0
4,0,3,male,22.0,1,0,7.25,S,Third,man,1,,Southampton,no,0


In [None]:
# # vertical info view
# _ = pd.concat([res.head(1).T, res.dtypes, res.isna().sum()], axis=1)
# _.columns = ['example_value', 'dtypes', 'n_null']
# _