#### DB 연결

In [1]:
import psycopg2

conn = psycopg2.connect(
  'host=localhost port=5432 dbname=source user=postgres password=postgres'
)
conn.set_session(autocommit=True)
cur = conn.cursor()

#### 데이터 읽기

In [2]:
import pandas as pd

cur.execute(
  f'''
  select *
  from lot_public_price
  where substr(pnu,1,2) = '11'
  '''
)
lpp_dt = pd.DataFrame(
  cur.fetchall(),
  columns=[
    col.name
    for col
    in cur.description
  ]
)

In [3]:
lpp_dt

Unnamed: 0,year,stdmt,pnu,land_seqno,sgg_cd,land_loc_cd,land_gbn,bobn,bubn,adm_umd_cd,pnilp,jimok,parea,spfc1,spfc2,land_use,geo_hl,geo_form,road_side,upload_at
0,2022,01,1111017100102160001,999999,11110,17100,1,0216,0001,913,8140000,08,80,14,00,210,02,04,08,2022-09-13 17:28:38.492385
1,2022,01,1111016600101180003,999999,11110,16600,1,0118,0003,912,3985000,08,148.8,14,00,130,02,04,10,2022-09-13 17:28:38.492385
2,2022,01,1111017300100520004,999999,11110,17300,1,0052,0004,914,4535000,08,112.1,14,00,110,02,04,09,2022-09-13 17:28:38.492385
3,2022,01,1111017400100250006,999999,11110,17400,1,0025,0006,915,12450000,08,74,15,00,210,02,04,03,2022-09-13 17:28:38.492385
4,2022,01,1111017400100230673,999999,11110,17400,1,0023,0673,915,2760000,08,138.8,14,00,110,04,04,10,2022-09-13 17:28:38.492385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903772,2022,01,1111016600100280021,22256,11110,16600,1,0028,0021,912,5495000,08,77044.2,14,00,160,02,05,02,2022-09-13 17:28:38.492385
903773,2022,01,1111016500100280012,21969,11110,16500,1,0028,0012,912,3845000,08,66.2,14,00,130,03,05,08,2022-09-13 17:28:38.492385
903774,2022,01,1111016500100280017,21971,11110,16500,1,0028,0017,912,4587000,08,185.3,14,00,130,03,02,07,2022-09-13 17:28:38.492385
903775,2022,01,1111016500100280018,21972,11110,16500,1,0028,0018,912,6408000,08,174.2,14,00,310,02,04,07,2022-09-13 17:28:38.492385


### 항목별 특성 확인

In [7]:
lpp_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903777 entries, 0 to 903776
Data columns (total 20 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   year         903777 non-null  object        
 1   stdmt        903777 non-null  object        
 2   pnu          903777 non-null  object        
 3   land_seqno   903777 non-null  object        
 4   sgg_cd       903777 non-null  object        
 5   land_loc_cd  903777 non-null  object        
 6   land_gbn     903777 non-null  object        
 7   bobn         903777 non-null  object        
 8   bubn         903777 non-null  object        
 9   adm_umd_cd   903777 non-null  object        
 10  pnilp        903777 non-null  object        
 11  jimok        903777 non-null  object        
 12  parea        903777 non-null  object        
 13  spfc1        903777 non-null  object        
 14  spfc2        903777 non-null  object        
 15  land_use     903777 non-null  obje

숫자형 데이터의 확인을 위해 형변환

In [11]:
lpp_dt.land_seqno = lpp_dt.land_seqno.astype('int')
lpp_dt.pnilp = lpp_dt.pnilp.astype('int')
lpp_dt.parea = lpp_dt.parea.astype('float')

In [12]:
lpp_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903777 entries, 0 to 903776
Data columns (total 20 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   year         903777 non-null  object        
 1   stdmt        903777 non-null  object        
 2   pnu          903777 non-null  object        
 3   land_seqno   903777 non-null  int32         
 4   sgg_cd       903777 non-null  object        
 5   land_loc_cd  903777 non-null  object        
 6   land_gbn     903777 non-null  object        
 7   bobn         903777 non-null  object        
 8   bubn         903777 non-null  object        
 9   adm_umd_cd   903777 non-null  object        
 10  pnilp        903777 non-null  int32         
 11  jimok        903777 non-null  object        
 12  parea        903777 non-null  float64       
 13  spfc1        903777 non-null  object        
 14  spfc2        903777 non-null  object        
 15  land_use     903777 non-null  obje

#### 기초통계량 확인

In [24]:
import numpy as np

lpp_dt.describe(include=np.number).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
land_seqno,903777.0,51650.61,177393.9,1.0,9016.0,18065.0,28652.0,999999.0
pnilp,903777.0,4943092.0,5347416.0,4950.0,2537000.0,3806000.0,5730000.0,189000000.0
parea,903777.0,664.7184,11887.06,0.1,75.4,140.4,240.7,4416995.0


- land_seqno(토지일련번호)는 고유값이 아니며 활용성이 없는 항목
- pnilp(공시지가)는 테이블 내 종속변수의 특성이 강하며 단위가 크고 범위가 넓은 편
- parea(토지면적)는 대체로 값이 작으나 최대값이 이상치 수준으로 큼

In [25]:
lpp_dt.describe(include=np.object).transpose()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  lpp_dt.describe(include=np.object).transpose()


Unnamed: 0,count,unique,top,freq
year,903777,1,2022,903777
stdmt,903777,1,1,903777
pnu,903777,903777,1111017100102160001,1
sgg_cd,903777,25,11290,54837
land_loc_cd,903777,87,10100,115836
land_gbn,903777,9,1,885434
bobn,903777,3797,1,14270
bubn,903777,3834,0,53972
adm_umd_cd,903777,186,0,215937
jimok,903777,25,8,675581


- year(년도)는 '2022' 유일값
- stdmt(기준월)는 '01' 유일값
- pnu(필지고유번호)는 'count == unique'인 primary key로 분석 내 ID로 사용
- sgg_cd(시군구코드)는 'unique:25'로 서울 자치구 25개의 코드
- land_loc_cd(토지소재지코드)는 'unique:87'로 서울 자치구 내 법정동 코드. 단 sgg_cd의 하위 분류코드로서 동일한 값이어도 다른 법정동을 의미할 수 있음
- land_gbn(토지구분), bobn(본번), bubn(부번)은 토지 주소의 부분 항목
- adm_umd_cd(행정읍면동코드)는 land_loc_cd가 나타내는 법정동이 아닌 행정동 기준의 코드
- jimok(지목)는 25개 지목 구분 코드
- spfc1(용도지역1), spfc2(용도지역2)는 용도지역 15개 구분 코드. spfc2는 공란에 해당되는 '00'이 추가되어 unique값이 1 높음
- land_use(토지이용상황)는 토지의 이용상황 44개 분류 코드
- geo_hl(지형고저)은 토지의 지형고저 6개 분류 코드
- geo_form(지형형상)은 토지의 지형형상 7개 분류 코드
- road_side(도로접면)은 토지에 접한 도로에 대한 13개 분류 코드