# 05. 하나의 값 조작하기
# 

## Python, PostgreSQL 연결

### 1. psycopg2 

### ```pip install psycopg2-binary```
### ```psycopg2.connect(host = "DB주소", dbname = "DB이름", user = "사용자계정", password = "비밀번호", port = 포트번호)```

In [19]:
import psycopg2
import pandas as pd

In [7]:
# Open a psycopg2 connection to the PostgreSQL server at localhost:5432.
# NOTE(review): the password is hardcoded here in plain text. Also, the name
# `db` is rebound to a SQLAlchemy engine further down, and no close() call on
# this connection appears in the visible code.
db = psycopg2.connect(host='localhost', dbname='PostgreSQL',
                      user='postgres', password='skt1faker', port=5432)

# 
### 커서 생성

In [8]:
# Create a cursor on the psycopg2 connection. No later code in this section
# uses `cur`; the queries below go through pandas and the SQLAlchemy engine.
cur = db.cursor()

# 
### 쿼리 실행
### ```cur.execute()```
### ```db.commit()```

# 

### 결과 조회
### ```cur.fetchone() : 단일 결과 반환 (Tuple)```
### ```cur.fetchmany( n ) : n개 결과 반환 (List of Tuple)```
### ```cur.fetchall() : 모든 결과 반환 (List of Tuple)```

# 
### 연결 종료
### ```cur.close()``` : 커서 종료
### ```db.close()``` : 연결 종료

# 
# 
# 
### 2. sqlalchemy 
### ```pip install sqlalchemy```


In [21]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

In [30]:
# Create the SQLAlchemy engine used by the query cells below. This rebinds the
# name `db`, which until now referred to the raw psycopg2 connection.
# NOTE(review): the database password is embedded in this URL in plain text.
db = create_engine("postgresql+psycopg2://postgres:skt1faker@localhost:5432/PostgreSQL", 
     echo=False) # with echo=True the engine's log output is printed

In [47]:
def query(query_state, db=db):
    """Run a SQL statement and return the result via ``pandas.read_sql``.

    Args:
        query_state: SQL text passed to ``pandas.read_sql`` unchanged.
        db: Connection or engine to run the statement against. The default
            is whatever the module-level ``db`` was bound to at the moment
            this function was defined; rebinding ``db`` afterwards does not
            change the default.

    Returns:
        The object produced by ``pandas.read_sql`` for the statement.
    """
    result_frame = pd.read_sql(sql=query_state, con=db)
    return result_frame

# 
### 5-1 코드 값을 레이블로 변경
- 로그 데이터 또는 업무 데이터로 저장된 코드 값을 그대로 집계에 사용하면, 리포트 가독성이 감소
- 리포트를 작성할 때, 변환하는 작업이 필요

(회원 등록 때 사용한 장치를 저장하는 컬럼 (**register_device**)이 코드 값 (1: 데스크톱, 2: 스마트폰, 3: 애플리케이션)으로 저장)

In [48]:
# Show every row and column of mst_users so the raw register_device codes are visible.
query("""SELECT * FROM mst_users""")

Unnamed: 0,user_id,register_date,register_device
0,U001,2016-08-26,1
1,U002,2016-08-26,2
2,U003,2016-08-27,3


# 
#### CASE 문 :
### ```CASE WHEN <조건문> THEN <조건을 만족할 때의 값> [ELSE <그 외의 값>] END```

In [49]:
# Translate the numeric register_device code into a readable label with CASE.
# There is no ELSE branch, so a code other than 1, 2 or 3 produces NULL.
query("""
SELECT user_id,
    CASE
        WHEN register_device = 1 THEN '데스크톱'
        WHEN register_device = 2 THEN '모바일'
        WHEN register_device = 3 THEN '애플리케이션'
    END AS device_name
FROM mst_users;""")

Unnamed: 0,user_id,device_name
0,U001,데스크톱
1,U002,모바일
2,U003,애플리케이션


# 
# 
# 
### 5-2 URL에서 요소 추출

# 

### 레퍼러로 어떤 웹 페이지를 거쳐 넘어왔는지 판별

- **레퍼러** : 어떤 웹 페이지를 거쳐 넘어왔는지 판별 시 사용 (호스트 단위로 집계)

In [54]:
# Show the raw access_log rows (stamp, referrer, url) used by the URL examples below.
query("""SELECT * FROM access_log;""")

Unnamed: 0,stamp,referrer,url
0,2016-08-26 12:02:00,http://www.other.com/path1/index.php?k1=v1&k2=...,http://www.example.com/video/detail?id=001
1,2016-08-26 12:02:01,http://www.other.net/path1/index.php?k1=v1&k2=...,http://www.example.com/video#ref
2,2016-08-26 12:02:01,https://www.other.com/,http://www.example.com/book/detail?id=002


# 
### ```substring()``` : 문자열 일부 추출


In [60]:
# Extract the host from each referrer: the regex captures everything between
# the "http://" or "https://" prefix and the next "/".
query("""
SELECT stamp,
    substring(referrer from 'https?://([^/]*)') AS referrer_host
FROM access_log;""")

# BigQuery : host(referrer) AS referrer_host
# NOTE(review): host() looks like the legacy-SQL spelling; standard SQL
# presumably needs NET.HOST(referrer) - confirm.

Unnamed: 0,stamp,referrer_host
0,2016-08-26 12:02:00,www.other.com
1,2016-08-26 12:02:01,www.other.net
2,2016-08-26 12:02:01,www.other.com


# 
### URL에서 경로와 요청 매개변수 값 추출

In [65]:
# Split each url into two pieces:
#   path - the text after the host, up to but not including the first "?" or "#"
#   id   - the text following "id=", up to the next "&" (NULL when the url has no "id=")
query("""
SELECT stamp, url,
    substring(url from '//[^/]+([^?#]+)') AS path,
    substring(url from 'id=([^&]*)') AS id
FROM access_log""")

# BigQuery equivalent:
# regexp_extract(url, '//[^/]+([^?#]+)') AS path,
# regexp_extract(url, 'id=([^&]*)') AS id

Unnamed: 0,stamp,url,path,id
0,2016-08-26 12:02:00,http://www.example.com/video/detail?id=001,/video/detail,1.0
1,2016-08-26 12:02:01,http://www.example.com/video#ref,/video,
2,2016-08-26 12:02:01,http://www.example.com/book/detail?id=002,/book/detail,2.0


# 
# 
# 
### 5-3 문자열을 배열로 분해

### ```split_part()```

In [71]:
# Break the extracted path into its "/"-separated segments with split_part().
# The path begins with "/", so field 1 is the empty text before that slash;
# fields 2 and 3 are therefore the first and second path segments.
# A url with only one segment gets an empty path2.
query("""
SELECT stamp, url,
    split_part(substring(url from '//[^/]+([^?#]+)'), '/', 2) AS path1,
    split_part(substring(url from '//[^/]+([^?#]+)'), '/', 3) AS path2
FROM access_log;""")

# BigQuery equivalent:
# split(regexp_extract(url, '//[^/]+([^?#]+)'), '/')[SAFE_ORDINAL(2)] AS path1,
# split(regexp_extract(url, '//[^/]+([^?#]+)'), '/')[SAFE_ORDINAL(3)] AS path2

Unnamed: 0,stamp,url,path1,path2
0,2016-08-26 12:02:00,http://www.example.com/video/detail?id=001,video,detail
1,2016-08-26 12:02:01,http://www.example.com/video#ref,video,
2,2016-08-26 12:02:01,http://www.example.com/book/detail?id=002,book,detail


# 
# 
# 
### 5-4 시계열

# 

### 현재 날짜와 타임스탬프 추출

### ```CURRENT_DATE``` : 현재 날짜
### ```CURRENT_TIMESTAMP``` : 현재 날짜와 시간 (타임스탬프)

In [76]:
# Ask the server for today's date (dt) and the current timestamp (stamp).
query("""SELECT CURRENT_DATE AS dt,
    CURRENT_TIMESTAMP AS stamp;""")

# BigQuery equivalent:
# CURRENT_DATE() AS dt,
# CURRENT_TIMESTAMP() AS stamp

Unnamed: 0,dt,stamp
0,2022-12-17,2022-12-17 14:31:50.728361+00:00


# 
### 지정한 값의 날짜/시간 데이터 추출
### ```CAST()``` : 데이터 형 변환

In [81]:
# Convert access_log.stamp to a date and to a timestamp with CAST,
# returned next to the original column for comparison.
query("""
SELECT stamp, 
    CAST(stamp AS date) AS cast_date,
    CAST(stamp AS timestamp) AS cast_timestamp
FROM access_log;""")

# BigQuery equivalent (shown with literal values):
# date('2016-01-30') AS dt,
# timestamp('2016-01-30 12:00:00') AS stamp

Unnamed: 0,stamp,cast_date,cast_timestamp
0,2016-08-26 12:02:00,2016-08-26,2016-08-26 12:02:00
1,2016-08-26 12:02:01,2016-08-26,2016-08-26 12:02:01
2,2016-08-26 12:02:01,2016-08-26,2016-08-26 12:02:01


# 
### ```EXTRACT()``` : 날짜/시간에서 특정 필드 (연/월/일) 추출 (date/timestamp 등 날짜·시간 자료형에만 사용 가능, 문자열은 먼저 CAST 필요)
- ```YEAR``` : 연
- ```MONTH``` : 월
- ```DAY``` : 일
- ```HOUR``` : 시
- ```MINUTE``` : 분
- ```SECOND``` : 초

In [104]:
# Pull the individual year/month/day/hour/minute/second fields out of a
# timestamp with EXTRACT. The subquery builds a one-row table whose stamp
# column is the literal '2016-01-30 12:00:00' cast to timestamp.
query("""
SELECT stamp,
    EXTRACT(YEAR FROM stamp) AS year,
    EXTRACT(MONTH FROM stamp) AS month,
    EXTRACT(DAY FROM stamp) AS day,
    EXTRACT(HOUR FROM stamp) AS hour,
    EXTRACT(MINUTE FROM stamp) AS minute,
    EXTRACT(SECOND FROM stamp) AS second
FROM 
    (SELECT CAST('2016-01-30 12:00:00' AS timestamp) AS stamp) As t;""")

Unnamed: 0,stamp,year,month,day,hour,minute,second
0,2016-01-30 12:00:00,2016.0,1.0,30.0,12.0,0.0,0.0


# 

In [107]:
# Same idea as EXTRACT, but on a text value: slice the fields out by position
# with substring(text, start, length), where start is 1-based.
# year_month is simply the first seven characters ('YYYY-MM').
query("""
SELECT stamp,
    substring(stamp, 1, 4) AS year,
    substring(stamp, 6, 2) AS month,
    substring(stamp, 9, 2) AS day,
    substring(stamp, 12, 2) AS hour,
    substring(stamp, 1, 7) AS year_month
FROM (SELECT CAST('2016-01-30 12:00:00' AS text) AS stamp) AS t;""") # in BigQuery, cast to string instead of text

Unnamed: 0,stamp,year,month,day,hour,year_month
0,2016-01-30 12:00:00,2016,1,30,12,2016-01


# 
# 
# 
### 5-5 결측값 대체
### ```COALESCE( 열, 대체값)``` : 해당 열의 결측값을 대체

In [105]:
# Show the raw purchase rows; the output below has an empty coupon for some purchases.
query("SELECT * FROM purchase_log_with_coupon;")

Unnamed: 0,purchase_id,amount,coupon
0,10001,3280,
1,10002,4650,500.0
2,10003,3870,


In [106]:
# Compare two ways of computing the discounted amount:
#   discount_amount1 - plain subtraction, which comes out NULL when coupon is NULL
#   discount_amount2 - COALESCE(coupon, 0) substitutes 0 for a missing coupon,
#                      so the result falls back to the full amount
query("""
SELECT purchase_id, amount, coupon,
    amount - coupon AS discount_amount1,
    amount - COALESCE(coupon, 0) AS discount_amount2
FROM purchase_log_with_coupon;""")

Unnamed: 0,purchase_id,amount,coupon,discount_amount1,discount_amount2
0,10001,3280,,,3280
1,10002,4650,500.0,4150.0,4150
2,10003,3870,,,3870
