# 라이브러리와 데이터 로딩

In [None]:
import pandas as pd 
import numpy as np

In [None]:
!pip install pydataset #매번 설치 필요

In [None]:
from pydataset import data #데이터셋 

In [None]:
house = data('Housing')  # load the 'Housing' dataset and bind it to `house`

## 그래프 사이즈 

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Global defaults applied to every figure drawn afterwards:
# figure size [7, 5] and a resolution of 100 dpi.
plt.rcParams.update({
    'figure.figsize': [7, 5],
    'figure.dpi': 100,
})

## Pandas 그래프 종류

- ‘line’ : line plot (default)

- ‘bar’ : vertical bar plot

- ‘barh’ : horizontal bar plot

- ‘hist’ : histogram

- ‘box’ : boxplot

- ‘kde’ : Kernel Density Estimation plot

- ‘density’ : ‘kde’와 동일

- ‘area’ : area plot

- ‘pie’ : pie plot

- ‘scatter’ : scatter plot (DataFrame only)

- ‘hexbin’ : hexbin plot (DataFrame only)

In [None]:
# Show the dtype of every column in `house`.
house.dtypes

In [None]:
house.plot(figsize=(12,7))  # kind is not given, so the default 'line' plot is drawn

In [None]:
# Explicit kind='line'; subplots=True requests a separate subplot per column.
house.plot(kind = 'line', 
           subplots = True,
           figsize = (12, 7))

In [None]:
# Accessor form of the same request; compare with plot(kind='line', ...).
house.plot.line(subplots = True, figsize = (12, 7))

## line plot

<img src = "https://pandas.pydata.org/pandas-docs/version/0.23.0/_images/pandas-DataFrame-plot-line-1.png">

In [None]:
# Line plot of price (y) against bedrooms (x), using the rows of `house`
# in their current order (no sorting is applied here).
# The title names the two plotted variables, like the sorted version of this plot.
house.plot(y = 'price',
           x =  'bedrooms',
           kind = 'line', 
           title = 'price * bedrooms')

# What is the problem with this plot?

In [None]:
# Sort the rows by bedrooms in descending order.

house_bedrooms_desc = house.sort_values('bedrooms', ascending=False)

In [None]:
# Same price-vs-bedrooms line plot, now drawn from the frame sorted by bedrooms.
house_bedrooms_desc.plot(y = 'price',
                         x =  'bedrooms',
                         kind = 'line', 
                         title = 'price * bedrooms',
                         figsize=(10, 4))

## bar plot

- 명목 변수가 갖는 값을 사각형 막대로 표시한 것
- 개별 변수의 빈도를 보여주는 histogram 과의 차이점 주의

<img src = "https://pandas.pydata.org/docs/_images/pandas-DataFrame-plot-bar-2.png">

In [None]:
# Preview the first rows of the dataset.
print(house.head())

In [None]:
# Express the relationship between bathrms and bedrooms as a bar plot.
# Goal: visualize the combination that occurs most often.

house.plot(y = 'bathrms',
           x =  'bedrooms',
           kind = 'bar', 
           title = 'the combination of bathrms * bedrooms')

# What is the problem here?

In [None]:
# Approach (1) - mode()
# NOTE(review): DataFrame.mode() is computed for each column separately, so this
# yields the most frequent bedrooms value and the most frequent bathrms value,
# which is not necessarily the most frequent (bedrooms, bathrms) pair.
house[['bedrooms', 'bathrms']].mode()

In [None]:
# Keep only the rows with bedrooms == 3 and bathrms == 1
# (presumably the values reported by mode() -- verify against its output).
house_mode = house[(house.bedrooms == 3) & (house.bathrms == 1)]
print(house_mode)

In [None]:
# Bar plot of the filtered rows (one bar per remaining row).
house_mode.plot(y = 'bathrms',
            x =  'bedrooms',
            kind = 'bar',
            title = 'bathrms * bedrooms')

# However, the visualization is still problematic.

In [None]:
# Turn the (bedrooms, bathrms) pairs into categories with groupby() + size().
# GroupBy.size() returns the number of rows in each group.

house_group = (house.groupby(['bedrooms', 'bathrms'])).size()
print(house_group)
print(type(house_group))  # check what type the result actually is

In [None]:
# house_group is the result of groupby(['bedrooms', 'bathrms']).size().
# Series.plot ignores x= and y=, so those arguments are omitted: the bars are
# labelled by the group index and their heights are the group sizes.
house_group.plot(kind = 'bar',
                 title = 'bathrms * bedrooms')

In [None]:
# Turn the (bedrooms, bathrms) pairs into categories with groupby() + size().
# GroupBy.size() returns the number of rows in each group.

house_group = (house.groupby(['bedrooms', 'bathrms'])).size()
print(house_group)
print(type(house_group)) 

# The printed output looks like a DataFrame, but after every new
# manipulation it is important to check which data type you actually have.

In [None]:
# Convert the group sizes into a DataFrame with to_frame() and check its type.
house_group2 = (house.groupby(['bedrooms', 'bathrms'])).size().to_frame()
print(house_group2)
print(type(house_group2))

# What is the index of this DataFrame?

In [None]:
# Inspect the index values of house_group2.
house_group2.index.values

# The index needs to be reset.

In [None]:
# Count rows per (bedrooms, bathrms) pair, name the count column 'size',
# and move the group keys out of the index into ordinary columns.
house_group3 = (
    house.groupby(['bedrooms', 'bathrms'])
    .size()
    .to_frame(name='size')
    .reset_index()
)
print(house_group3)  # check the reset index again

In [None]:
house_group3.index.values  # check the reset index again

## histogram - hist( )

- 빈도를 사용해 분포를 파악하기 위한 목적

<img src = "https://pandas.pydata.org/pandas-docs/version/0.23.0/_images/pandas-DataFrame-plot-hist-1.png">

In [None]:
# Histograms for the columns of `house` (one subplot per column).
house.hist()

In [None]:
# Histogram of a single column, called on the Series.
house.bathrms.hist()

In [None]:
# Histogram of the same column, selected through DataFrame.hist(column=...).
house.hist(column='bathrms')

In [None]:
# Frequency of bedrooms, drawn separately for each bathrms value.
# bins sets how many intervals each histogram uses; a larger value gives
# more, narrower bars.
house.hist(column='bedrooms', by='bathrms', bins = 2)

## boxplot - plot.box( ) 혹은 boxplot( )

<img src = "https://miro.medium.com/max/1400/1*2c21SkzJMf3frPXPAR_gZA.png" width = 700>

In [None]:
np.min(house.lotsize)  # minimum

In [None]:
np.percentile(house.lotsize, 25)  # Q1 (25th percentile, first quartile)

In [None]:
np.percentile(house.lotsize, 50)  # median (50th percentile)

In [None]:
np.percentile(house.lotsize, 75)  # Q3 (75th percentile, third quartile)

In [None]:
np.max(house.lotsize)  # maximum

In [None]:
# min, Q1, median, Q3 and max of lotsize in a single call.
np.percentile(house.lotsize, [0, 25, 50, 75, 100]) 

In [None]:
# Box plots for the columns of `house`.
house.boxplot()

In [None]:
# Deliberate error demo: calling .boxplot() on a single column (a Series)
# raises AttributeError because Series has no such method.
# The error is caught and printed so that the remaining cells still run.
try:
    house['lotsize'].boxplot()
except AttributeError as err:
    print('AttributeError:', err)

In [None]:
house.lotsize.plot.box()  # works: a Series draws a box plot through the .plot accessor

In [None]:
house.boxplot(column="lotsize")  # works: DataFrame.boxplot limited to one column

In [None]:
# Box plot of price, with one box per bedrooms value.
house.boxplot(column="price", by="bedrooms")

In [None]:
# Same plot with the tick labels rotated by 45 degrees.
house.boxplot(column="price", by="bedrooms", rot = 45)

In [None]:
# price and lotsize, each grouped by bedrooms.
house.boxplot(column=["price", 'lotsize'], by="bedrooms")

In [None]:
# price and lotsize grouped by bedrooms, arranged as 2 rows x 1 column.
house.boxplot(
    column=['price', 'lotsize'],
    by='bedrooms',
    layout=(2, 1),
)

## KDE (Kernel Density Estimate)

- 연속 확률 변수의 분포를 시각화

<img src = "https://miro.medium.com/max/1278/1*bHlmeUZ0eaugeBdl4oXDqA.png" width = 500>

In [None]:
# Kernel density estimate curves for the columns of `house`.
house.plot.kde()

In [None]:
# Kernel density estimate of lotsize only.
house.lotsize.plot.kde()

## area plot

In [None]:
# Area plot of `house` using the accessor form.
house.plot.area()

In [None]:
# Area plot via kind='area', with the areas explicitly stacked.
house.plot(kind = 'area', stacked = True)

In [None]:
# Unstacked variant: the areas are drawn over each other instead of on top of each other.
house.plot(kind = 'area', stacked = False)

In [None]:
# Area plot of the price column only (Series accessor form).
house.price.plot.area()

In [None]:
# Same request for the price column, written with kind='area'.
house.price.plot(kind = 'area')

In [None]:
# Area plot of price, selecting the column through y= on the DataFrame.
house.plot.area(y='price')

## pie plot

In [None]:
# Display house_group3. It was built with reset_index(), so bedrooms and
# bathrms are ordinary columns here rather than a MultiIndex.
house_group3

In [None]:
# Replace the index of house_group3 with single-letter labels ('A', 'B', ...).
# The labels are generated from the actual number of rows instead of a fixed
# 15-item list, so the assignment cannot fail with a length mismatch when the
# number of groups differs. (Past 26 rows the labels continue with the
# characters that follow 'Z'; they stay unique.)
newindex = [chr(ord('A') + i) for i in range(len(house_group3))]
house_group3.index = newindex
print(house_group3)

In [None]:
# Pie chart of the 'size' column; the wedges are labelled by the index.
house_group3.plot.pie(y = 'size')

In [None]:
# With subplots=True a separate pie is drawn for each column of house_group3.
house_group3.plot.pie(subplots = True, figsize = (20,10))

## scatter plot

- 두 변수의 상관 관계를 점(point)의 산포로 표현

In [None]:
# Scatter plot of price (y) against lotsize (x).
house.plot(x='lotsize', y ='price', kind = 'scatter')

## hexbin plot
- 두 변수 값이 갖는 분포를 6각형 bin 의 짙음으로 표현

In [None]:
# Hexbin plot of price against lotsize; gridsize sets the number of
# hexagons in the x direction.
house.plot.hexbin(x = 'lotsize', y = 'price', gridsize = 25)

In [None]:
# Number of rows for each distinct bedrooms value.
house.bedrooms.value_counts()

In [None]:
# Hexbin plot where each bin is coloured by a value derived from 'bedrooms':
# C supplies the per-point value and reduce_C_function (np.min here) reduces
# the values falling into one bin to a single number.
house.plot.hexbin(x = 'lotsize', 
                  y = 'price', gridsize = 25, 
                  C='bedrooms', 
                  reduce_C_function=np.min)

In [None]:
# Export the plot to a file.
ax = house.plot.hexbin(x = 'lotsize', y = 'price', gridsize = 25)
ax.figure.savefig('demo-file2.png')  # save the figure that owns the returned axes