# 06. 여러 개의 값에 대한 조작

In [5]:
from sqlalchemy import create_engine
import pandas as pd

In [6]:
db = create_engine("postgresql+psycopg2://postgres:skt1faker@localhost:5432/PostgreSQL", 
     echo=False) 
def query(query_state, db=db):
    return pd.read_sql(query_state, db)

# 
# 
# 
### 6-1 문자열 연결
### ```CONCAT( A, B)``` : 문자열 A와 B를 연결

In [7]:
query("SELECT * FROM mst_user_location")

Unnamed: 0,user_id,pref_name,city_name
0,U001,서울특별시,강서구
1,U002,경기도수원시,장안구
2,U003,제주특별자치도,서귀포시


In [8]:
query("""
SELECT user_id, 
    CONCAT(pref_name, city_name)
FROM mst_user_location;""")

Unnamed: 0,user_id,concat
0,U001,서울특별시강서구
1,U002,경기도수원시장안구
2,U003,제주특별자치도서귀포시


# 
# 
# 
### 6-2 여러 개의 값 비교

In [9]:
query("SELECT * FROM quarterly_sales;")

Unnamed: 0,year,q1,q2,q3,q4
0,2015,82000,83000,78000.0,83000.0
1,2016,85000,85000,80000.0,81000.0
2,2017,92000,81000,,


### 분기별 매출 증감 판정
### ```SIGN( 매개변수 )``` : 매개변수가 양수 라면 1 / 0이라면 0 / 음수라면 -1


- 매출이 증가한 경우 '+' / 감소한 경우 '-'

In [10]:
query("""
SELECT year, q1, q2,
    CASE 
        WHEN q1 < q2 THEN '+'
        WHEN q1 = q2 THEN ' '
        ELSE '-' 
    END AS judge_q1_q2,
    q2 - q1 AS diff_q1_q2,
    SIGN(q2-q1) AS sign_q2_q1 
FROM quarterly_sales
ORDER BY year; """)

Unnamed: 0,year,q1,q2,judge_q1_q2,diff_q1_q2,sign_q2_q1
0,2015,82000,83000,+,1000,1.0
1,2016,85000,85000,,0,0.0
2,2017,92000,81000,-,-11000,-1.0


# 
### 연간 최대/최소 4분기 매출 찾기

### ```greatest()``` : 최대값
### ```least()``` : 최소값

In [11]:
query("""
SELECT year,
    greatest(q1, q2, q3, q4) AS greatest_sales,
    least(q1, q2, q3, q4) AS least_sales
FROm quarterly_sales ORDER BY year;""")

Unnamed: 0,year,greatest_sales,least_sales
0,2015,83000,78000
1,2016,85000,80000
2,2017,92000,81000


# 
### 연간 평균 4분기 매출 계산

In [13]:
query("""
SELECT year, 
    (COALESCE(q1, 0) + COALESCE(q2, 0) + COALESCE(q3, 0) + COALESCE(q4, 0))/4 AS average
FROM quarterly_sales ORDER BY year;""")

Unnamed: 0,year,average
0,2015,81500
1,2016,82750
2,2017,43250


In [14]:
# NULL이 아닌 컬럼많을 사용하여 평균계산
query(""" 
SELECT year,
    (COALESCE(q1, 0) + COALESCE(q2, 0) + COALESCE(q3, 0) + COALESCE(q4, 0)) /  
    (SIGN(COALESCE(q1, 0)) + SIGN(COALESCE(q2, 0)) + SIGN(COALESCE(q3, 0)) + SIGN(COALESCE(q4, 0))) AS average
FROM quarterly_sales ORDER BY year;""")

Unnamed: 0,year,average
0,2015,81500.0
1,2016,82750.0
2,2017,86500.0


# 
# 
# 
### 6-3 2개의 값 비율 계산

In [15]:
query("SELECT * FROM advertising_stats;")

Unnamed: 0,dt,ad_id,impressions,clicks
0,2017-04-01,1,100000,3000
1,2017-04-01,2,120000,1200
2,2017-04-01,3,500000,10000
3,2017-04-02,1,0,0
4,2017-04-02,2,130000,1400
5,2017-04-02,3,620000,15000


# 
### 정수 자료형의 데이터 나누기
#### **CTR** : 클릭률 (클릭 / 노출)

- PostgreSQL의 경우 정수를 나누면 소수점이 잘리므로, 명시적인 자료형 변환 필요

In [26]:
query(""" 
SELECT dt, ad_id, 
    CAST(clicks AS double precision) / impressions AS CTR,
    100.0 * clicks / impressions AS ctr_as_percent
FROM advertising_stats 
    WHERE dt = '2017-04-01' ORDER BY dt, ad_id;""")

# BigQuery : 
# clicks / impressions as CTR

Unnamed: 0,dt,ad_id,ctr,ctr_as_percent
0,2017-04-01,1,0.03,3.0
1,2017-04-01,2,0.01,1.0
2,2017-04-01,3,0.02,2.0


# 
### 0으로 나누는 것 피하기

### ```NULLIF( 변수, 조건 )``` : 변수가 조건에 해당되면 결측값으로 인식

In [28]:
query("""
SELECT dt, ad_id,
    CASE
        WHEN impressions > 0 THEN 100.0 * clicks / impressions 
        END AS ctr_as_percent_by_case,
        100.0 * clicks / NULLIF(impressions, 0) AS ctr_as_percent_by_null
FROM advertising_stats ORDER BY dt, ad_id""")

Unnamed: 0,dt,ad_id,ctr_as_percent_by_case,ctr_as_percent_by_null
0,2017-04-01,1,3.0,3.0
1,2017-04-01,2,1.0,1.0
2,2017-04-01,3,2.0,2.0
3,2017-04-02,1,,
4,2017-04-02,2,1.076923,1.076923
5,2017-04-02,3,2.419355,2.419355


# 
# 
# 
### 6-4 두 값의 거리 계산

### ```ABS()``` : 절댓값
### ```POWER()``` : 제곱
### ```SQRT()``` : 제곱근

In [37]:
query("SELECT * FROM location_2d;")

Unnamed: 0,x1,y1,x2,y2
0,0,0,2,2
1,3,5,1,2
2,5,3,2,1


In [38]:
query("""
SELECT sqrt(power(x1-x2, 2) + power(y1-y2, 2)) AS dist
FROM location_2d;""")

Unnamed: 0,dist
0,2.828427
1,3.605551
2,3.605551


# 
### ```POINT( x, y, ...)``` : point자료형 (**PostgreSQL만 가능**)
### ```POINT( x1, y1, ...) <-> POINT( x2, y2, ...)``` : 두 point간 거리 계산

In [40]:
query("""
SELECT
    point(x1, y1) <-> point(x2, y2) AS dist
FROM location_2d;""")

Unnamed: 0,dist
0,2.828427
1,3.605551
2,3.605551


# 
# 
# 
### 6-5 날짜 / 시간 계산

In [43]:
query("SELECT * FROM mst_users_with_dates;")

Unnamed: 0,user_id,register_stamp,birth_date
0,U001,2016-02-28 10:00:00,2000-02-29
1,U002,2016-02-29 10:00:00,2000-02-29
2,U003,2016-03-01 10:00:00,2000-02-29


# 
### PostgreSQL의 경우 interval 자료형의 데이터에 사칙 연산 사용
### ```::timestamp``` / ```::interval``` / ```::date``` : timestamp, interval, date형으로 인식

- 1시간 후 / 30분 전의 시간 / 다음날 / 1달 전의 날짜

In [49]:
query(""" 
SELECT user,
    register_stamp::timestamp AS register_stamp,
    (register_stamp::timestamp + '1 hour'::interval) AS after_1hour,
    (register_stamp::timestamp - '30 minutes'::interval) AS before_30_minutes,

    register_stamp::date AS register_date,
    (register_stamp::date + '1 day'::interval) AS after_1_day,
    (register_stamp::date - '1 month'::interval) AS before_1_month
FROM mst_users_with_dates;""")

# BigQuery : timestamp(), interval / timestamp_add, timestamp_sub, date_add, date_sub 사용
# timestamp(register_stamp) AS register_stamp,
# timestamp_add(timestamp(register_stamp), interval 1 hour) AS after_1_hour,
# timestamp_sub(timestamp(register_stamp), interval 30 minute) AS before_30_minutes,

# date(timestamp(register_stamp)) AS register_date,
# date_add(date(timestamp(register_stamp)), interval 1 day) AS after_1_day,
# date_sub(date(timestamp(register_stamp)), interval 1 month) AS beofre_1_month

Unnamed: 0,user,register_stamp,after_1hour,before_30_minutes,register_date,after_1_day,before_1_month
0,postgres,2016-02-28 10:00:00,2016-02-28 11:00:00,2016-02-28 09:30:00,2016-02-28,2016-02-29,2016-01-28
1,postgres,2016-02-29 10:00:00,2016-02-29 11:00:00,2016-02-29 09:30:00,2016-02-29,2016-03-01,2016-01-29
2,postgres,2016-03-01 10:00:00,2016-03-01 11:00:00,2016-03-01 09:30:00,2016-03-01,2016-03-02,2016-02-01


# 
### 날짜 데이터들의 차이 계산

In [52]:
query(""" 
SELECT user_id,
    CURRENT_DATE AS today,
    register_stamp::date AS register_date,
    CURRENT_DATE - register_stamp::date AS diff_days
FROM mst_users_with_dates;""")

# BigQuery :
# date(timestamp(register_stamp)) AS register_date,
# date_diff(CURRENT_DATE, date(timestamp(register_stamp)), day) AS diff_days

Unnamed: 0,user_id,today,register_date,diff_days
0,U001,2022-12-18,2016-02-28,2485
1,U002,2022-12-18,2016-02-29,2484
2,U003,2022-12-18,2016-03-01,2483


# 
### 사용자의 생년월일로 나이 계산
### ```age()``` : 날짜 자료형 데이터로 날짜를 계산 (**PostgreSQL만 가능**)

In [53]:
query(""" 
SELECT user_id,
    CURRENT_DATE AS today,
    register_stamp::date AS register_date,
    birth_date::date AS birth_date,
    EXTRACT(YEAR FROM age(birth_date::date)) AS current_age,
    EXTRACT(YEAR FROM age(register_stamp::date, birth_date::date)) AS register_age
FROM mst_users_with_dates;""")

# BigQuery :
# date(timestamp(register_stamp)) AS register_date,
# date(timestamp(birth_date)) AS birth_date,
# date_diff(CURRENT_DATE, date(timestamp(birth_date)), year) AS current_age
# date_diff(date(timestamp(register_stamp)), date(timestamp(birth_date)), year) AS register_age

Unnamed: 0,user_id,today,register_date,birth_date,current_age,register_age
0,U001,2022-12-18,2016-02-28,2000-02-29,22.0,15.0
1,U002,2022-12-18,2016-02-29,2000-02-29,22.0,16.0
2,U003,2022-12-18,2016-03-01,2000-02-29,22.0,16.0


# 
#### 날짜의 정수 표현
- 생일이 2000년 2월 29일인 사람의 2016년 2월 18일 시점 나이 계산

In [54]:
query("""SELECT floor((20160228 - 20000229) / 10000) AS age;""")

Unnamed: 0,age
0,15.0


# 
- 문자열로 시차 계산 : 날짜/시간 데이터의 계산은 프로그램에 따라 표현에 차이가 큼 -> **따라서 실무에서는 수치 혹은 문자열로 변환해 다루는 경우도 많음**

In [70]:
query(""" 
SELECT user_id,
    substring(register_stamp, 1, 10) AS register_date,
    birth_date,
    floor(
        ( CAST(replace(substring(register_stamp, 1, 10), '-', '') AS integer) - CAST(replace(birth_date, '-', '') AS integer)) / 10000
    ) AS register_age,
    floor(
        ( CAST(replace(CAST(CURRENT_DATE AS text), '-', '') AS integer) - CAST(replace(birth_date, '-', '') AS integer)) / 10000
    ) AS current_age
FROM mst_users_with_dates;""")

# BigQuery : text를 string으로 / integer을 in64로

Unnamed: 0,user_id,register_date,birth_date,register_age,current_age
0,U001,2016-02-28,2000-02-29,15.0,22.0
1,U002,2016-02-29,2000-02-29,16.0,22.0
2,U003,2016-03-01,2000-02-29,16.0,22.0


# 
# 
# 
### 6-6 IP 주소 다루기
- IP주소를 로그로 저장 시, 문자열로 저장

# 

### IP 주소 자료형 활용
- PostgreSQL에만 있는 inet 자료형 활용 (inet : IP주소를 다루기 위한 자료형)

# 

- IP주소 비교

In [74]:
query("""SELECT 
    CAST('127.0.0.1' AS inet) < CAST('127.0.0.2' AS inet) AS lt,
    CAST('127.0.0.1' AS inet) > CAST('127.168.0.1' AS inet) AS gt""")

Unnamed: 0,lt,gt
0,True,False


# 
- 네트워크 범위에 IP주소가 포함되는지 판정

In [76]:
query("""SELECT CAST('127.0.0.1' AS inet) << CAST('127.0.0.0/8' AS inet) AS is_contained""")

Unnamed: 0,is_contained
0,True


# 
### 정수 또는 문자열로 IP 주소 다루기
- IP주소를 4개의 10진수 부분을 추출

In [77]:
query(""" 
SELECT ip,
    CAST(split_part(ip, '.', 1) AS integer) AS ip_part1,
    CAST(split_part(ip, '.', 2) AS integer) AS ip_part2,
    CAST(split_part(ip, '.', 3) AS integer) AS ip_part3,
    CAST(split_part(ip, '.', 4) AS integer) AS ip_part4
FROM (SELECT CAST('192.168.0.1' AS text) AS ip) AS t""")

# BigQuery :
# CAST(split(ip, '.')[SAFE_ORDINAL(1)] AS int64) AS ip_part1 
# ....

Unnamed: 0,ip,ip_part1,ip_part2,ip_part3,ip_part4
0,192.168.0.1,192,168,0,1


# 
- IP주소를 정수 자료형 표기로 변환 : 각 부분을 2^24, 2^16, 2^8, 2^0만큼 곱

In [79]:
query(""" 
SELECT ip,
    CAST(split_part(ip, '.', 1) AS integer) * 2^24 + 
    CAST(split_part(ip, '.', 2) AS integer) * 2^16 + 
    CAST(split_part(ip, '.', 3) AS integer) * 2^8 + 
    CAST(split_part(ip, '.', 4) AS integer) * 2^0 AS ip_integer
FROM (SELECT CAST('192.168.0.1' AS text) AS ip) AS t;""")

# BigQuery :
# CAST(split(ip, '.')[SAFE_ORDINAL(1)] AS int64) * pow(2, 24)
# ...

Unnamed: 0,ip,ip_integer
0,192.168.0.1,3232236000.0


# 
- IP주로슬 0으로 메우기 : 각 10진수 부분을 3자리 숫자가 되게, 앞 부분을 0으로 메움

### ```lpad( 문자, n, 대체문자)``` : 문자의 길이가 n이 되게, 대체문자로 메움

In [82]:
query("""
SELECT ip,
    lpad(split_part(ip, '.', 1), 3, '0') || 
    lpad(split_part(ip, '.', 2), 3, '0') || 
    lpad(split_part(ip, '.', 3), 3, '0') ||
    lpad(split_part(ip, '.', 4), 3, '0') AS ip_padding 
FROM (SELECT CAST('192.168.0.1' AS text) AS ip) AS t;""")

# BigQuery :
# CONCAT(lpad(split(ip, '.')[SAFE_ORDINAL(1)], 3, '0'), ... )

Unnamed: 0,ip,ip_padding
0,192.168.0.1,192168000001
