In [15]:
## 필요한 라이브러리 로딩
import pandas as pd
from datetime import datetime, timedelta
from pandas.api.types import CategoricalDtype
import plotly.graph_objects as go

## 1. covid19 원본 데이터 셋 로딩
## covid19 데이터 로딩(파일을 다운로드 받은 경우)
df_covid19 = pd.read_csv("D:/R/data/Rnpy/owid-covid-data.csv")

## covid19 데이터 로딩(온라인에서 바로 로딩할 경우)
##df_covid19 = pd.read_csv("https://covid.ourworldindata.org/data/owid-covid-data.csv")

df_covid19['date'] = pd.to_datetime(df_covid19['date'], format="%Y-%m-%d")

df_covid19_100 = df_covid19[(df_covid19['iso_code'].isin(['KOR', 'OWID_ASI', 'OWID_EUR', 'OWID_OCE', 'OWID_NAM', 'OWID_SAM', 'OWID_AFR'])) & (df_covid19['date'] >= (max(df_covid19['date']) - timedelta(days = 100)))]


df_covid19_100.loc[df_covid19_100['location'] == 'South Korea', "location"] = '한국'
df_covid19_100.loc[df_covid19_100['location'] == 'Asia', "location"] = '아시아'
df_covid19_100.loc[df_covid19_100['location'] == 'Europe', "location"] = '유럽'
df_covid19_100.loc[df_covid19_100['location'] == 'Oceania', "location"] = '오세아니아'
df_covid19_100.loc[df_covid19_100['location'] == 'North America', "location"] = '북미'
df_covid19_100.loc[df_covid19_100['location'] == 'South America', "location"] = '남미'
df_covid19_100.loc[df_covid19_100['location'] == 'Africa', "location"] = '아프리카'

ord = CategoricalDtype(categories = ['한국', '아시아', '유럽', '북미', '남미', '아프리카', '오세아니아'], ordered = True)

df_covid19_100['location'] = df_covid19_100['location'].astype(ord)

df_covid19_100 = df_covid19_100.sort_values(by = 'date')

df_covid19_100_wide = df_covid19_100.loc[:,['date', 'location', 'new_cases', 'people_fully_vaccinated_per_hundred']].rename(columns={'new_cases':'확진자', 'people_fully_vaccinated_per_hundred':'백신접종완료자'})

df_covid19_100_wide = df_covid19_100_wide.pivot(index='date', columns='location', values=['확진자', '백신접종완료자']).sort_values(by = 'date')

df_covid19_100_wide.columns = ['확진자_한국', '확진자_아시아', '확진자_유럽', '확진자_북미', '확진자_남미', '확진자_아프리카','확진자_오세아니아',
                              '백신접종완료자_한국', '백신접종완료자_아시아', '백신접종완료자_유럽', '백신접종완료자_북미', '백신접종완료자_남미', '백신접종완료자_아프리카','백신접종완료자_오세아니아']
                              
df_covid19_stat = df_covid19.groupby(['iso_code', 'continent', 'location'], dropna=True).agg(
    인구수 = ('population', 'max'),
    전체사망자수 = ('new_deaths', 'sum'), 
    백신접종자완료자수 = ('people_fully_vaccinated', 'max'),
    인구백명당백신접종완료율 = ('people_fully_vaccinated_per_hundred', 'max'),
    인구백명당부스터접종자수 = ('total_boosters_per_hundred', 'max')

).reset_index()

df_covid19_stat['십만명당사망자수'] = round(df_covid19_stat['전체사망자수'] / df_covid19_stat['인구수'] *100000, 5)

df_covid19_stat['백신접종완료율'] = df_covid19_stat['백신접종자완료자수'] / df_covid19_stat['인구수']

## 여백 설정을 위한 변수 설정
margins_P = {'t' : 50, 'b' : 25, 'l' : 25, 'r' : 25}

######################################   
## python 코드
## 대학 학과 취업률 데이터 셋

df_취업률 = pd.read_excel("d:/R/data/Rnpy/2021년 학과별 고등교육기관 취업통계.xlsx", 
                           sheet_name = '학과별',
                           skiprows=(13), 
                           header = 0)

df_취업률 = pd.concat([df_취업률.iloc[:, 0:8], 
                    df_취업률.loc[:, df_취업률.columns.str.endswith('계')], 
                    df_취업률.loc[:, '입대자']], 
                   axis = 1
                   )

df_취업률_500 = df_취업률.loc[(df_취업률['졸업자_계'] < 500)]

df_취업률_500 = df_취업률_500.iloc[range(0, len(df_취업률_500.index) , 4)]

df_취업률_500 = df_취업률_500.rename(columns = {'졸업자_계':'졸업자수', '취업률_계':'취업률', '취업자_합계_계':'취업자수'})





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [16]:
fig = go.Figure()

colors = {'의약계열': 0, '인문계열': 1, '사회계열': 2, '교육계열': 3, '공학계열': 4, '자연계열': 5, '예체능계열': 6}

for cat, group in df_취업률_500.groupby('대계열'):
    fig.add_trace(go.Scatter(
        mode = 'markers',
        x = group['졸업자수'], y = group['취업자수'],
        name = cat,
        marker = dict(color = colors[cat]), 
        showlegend = True
    ))

fig.update_layout(title = dict(text = '<b>졸업자 대비 취업자수</b>', x = 0.5, font = dict(color = 'white')), 
         margin = dict(t = 50, b = 25, l = 25, r = 25), 
         paper_bgcolor = 'black', plot_bgcolor = 'black', 
         xaxis = dict(color = 'white', ticksuffix = '명', showgrid = False), 
         yaxis = dict(color = 'white', gridcolor = 'gray', ticksuffix = '명', dtick = 100), 
         legend = dict(font = dict(color = 'white')),
                  colorway = ("#EFF3FF", "#C6DBEF", "#9ECAE1", "#6BAED6", "#4292C6", "#2171B5", "#084594")
                 )
fig.show()

fig.write_image('D:\\R\\git\\datavisualization\\plotly\\RnPy\\chap3\\fig\\vector\\3-1.pdf')

In [17]:
import numpy as np
import statsmodels.api as sm # to build a LOWESS model
from sklearn.linear_model import LinearRegression

############## 선형회귀 모델
linear_regr = LinearRegression()

X = df_취업률_500['졸업자수'].values.reshape(-1,1)  # 독립변수（NumPy의 배열）
Y = df_취업률_500['취업자수'].values         # 종속변수（Numpy의 배열）

linear_regr.fit(X, Y)                         # 선형 모델의 가중치를 학습
linear_fit = linear_regr.predict(X)

############## Loess 모델
lowess_fit = sm.nonparametric.lowess(df_취업률_500['취업자수'], df_취업률_500['졸업자수'])

fig = go.Figure()

fig.add_trace(go.Scatter(
    mode = 'markers',
    x = df_취업률_500['졸업자수'], y = df_취업률_500['취업자수'], 
    showlegend = False))

## 선형 회귀 추세선 추가
fig.add_trace(go.Scatter(
    mode = 'lines', 
    x = df_취업률_500['졸업자수'], y = linear_fit, 
    name = '선형추세선', 
    line = dict(dash = 'dot', color = 'darkblue')))

## 국소 회귀 추세선 추가
fig.add_trace(go.Scatter(
    mode = 'lines', 
    x = lowess_fit[:,0], y = lowess_fit[:,1], 
    name = 'loess', 
    line = dict(color = 'skyblue')))

fig.show()

fig.write_image('D:\\R\\git\\datavisualization\\plotly\\RnPy\\chap3\\fig\\vector\\3-4.pdf')

In [31]:
import plotly.express as px
## plotly.express.scatter()로 선형 회귀 추세선 추가
fig = px.scatter(df_취업률_500, x= '졸업자수', y="취업자수", 
                 color = "대계열", trendline = 'ols', color_discrete_sequence = px.colors.sequential.Blues)
                 
## plotly.express.scatter()로 국소 회귀 추세선 추가
#fig = px.scatter(df_취업률_500, x= '졸업자수', y="취업자수", 
#                 color = "대계열", trendline = 'lowess')
                 
fig.show()

fig.write_image('D:\\R\\git\\datavisualization\\plotly\\RnPy\\chap3\\fig\\vector\\3-5.pdf')

In [19]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    mode = 'markers', 
    x = df_covid19_stat['백신접종완료율'], 
    y = df_covid19_stat['인구백명당부스터접종자수'], 
    ## marker의 사이즈를 사용해 버블 차트 구현
    marker = dict(size = df_covid19_stat['십만명당사망자수'], opacity = 0.5, sizemode = 'area')
))

fig.show()

fig.write_image('D:\\R\\git\\datavisualization\\plotly\\RnPy\\chap3\\fig\\vector\\3-6.pdf')

In [20]:
fig = go.Figure()

fig.add_trace(go.Histogram(x = df_취업률_500.loc[df_취업률_500['과정구분'] == '전문대학과정', '취업률'], 
                           name = "전문대학과정", xbins = dict(size = 5), opacity=0.4
                          )
             )

fig.add_trace(go.Histogram(x = df_취업률_500.loc[df_취업률_500['과정구분'] == '대학과정', '취업률'], 
                           name = '대학과정', xbins = dict(size = 5), opacity=0.4
                          )
             )

fig.add_trace(go.Histogram(x = df_취업률_500.loc[df_취업률_500['과정구분'] == '대학원과정', '취업률'], 
                           name = '대학원과정', xbins = dict(size = 5), opacity=0.4
                          )
             )
fig.update_layout(barmode='overlay', colorway = ("#2171B5", "#08519C", "#08306B"), title = dict(text = '취업률 Histogram', x = 0.5))

fig.show()

fig.write_image('D:\\R\\git\\datavisualization\\plotly\\RnPy\\chap3\\fig\\vector\\3-8.pdf')

In [21]:
from plotly.subplots import make_subplots
fig = make_subplots(rows=2, cols=2, 
                    subplot_titles = ("histfuct = 'count'", "histfuct = 'sum'", "histfuct = 'avg'", "histfuct = 'max'"))

fig.add_trace(go.Histogram(x = df_취업률_500['대계열'], y = df_취업률_500['취업률'],
                           histfunc = 'count', showlegend = False), 
             row = 1, col = 1)
fig.add_trace(go.Histogram(x = df_취업률_500['대계열'], y = df_취업률_500['취업률'],
                           histfunc = 'sum', showlegend = False, 
                          marker = dict(color = '#636EFA')), 
             row = 1, col = 2)
fig.add_trace(go.Histogram(x = df_취업률_500['대계열'], y = df_취업률_500['취업률'],
                           histfunc = 'avg', showlegend = False, 
                          marker = dict(color = '#636EFA')), 
             row = 2, col = 1)
fig.add_trace(go.Histogram(x = df_취업률_500['대계열'], y = df_취업률_500['취업률'],
                           histfunc = 'max', showlegend = False, 
                          marker = dict(color = '#636EFA')), 
             row = 2, col = 2)
fig.update_layout(title = dict(text = '취업률 Histogram', x = 0.5))

fig.show()

fig.write_image('D:\\R\\git\\datavisualization\\plotly\\RnPy\\chap3\\fig\\vector\\3-10.pdf')

In [22]:
fig = go.Figure()
fig.add_trace(go.Box(
    x = df_취업률['대계열'], y = df_취업률['취업률_계'], 
    ## boxmean과 notched 설정
    boxmean = 'sd', notched = True))

fig.update_layout(title = dict(text = '대학 계열별 취업률 분포', x = 0.5)) 

fig.show()    

fig.write_image('D:\\R\\git\\datavisualization\\plotly\\RnPy\\chap3\\fig\\vector\\3-12.pdf')

In [23]:
fig = go.Figure()
fig.add_trace(go.Box(
  y = df_covid19_100_wide['확진자_한국'], name = '한국',
  boxpoints = "all", jitter = 0.3, pointpos = -1.8))
  
fig.add_trace(go.Box(
  y = df_covid19_100_wide['확진자_아시아'], name = '아시아',
  boxpoints = "all", jitter = 0.3, pointpos = -1.8))
  
fig.add_trace(go.Box(
  y = df_covid19_100_wide['확진자_유럽'], name = '유럽',
  boxpoints = "all", jitter = 0.3, pointpos = -1.8))
  
fig.add_trace(go.Box(
  y = df_covid19_100_wide['확진자_북미'], name = '북미',
  boxpoints = "all", jitter = 0.3, pointpos = -1.8))
  
fig.add_trace(go.Box(
  y = df_covid19_100_wide['확진자_남미'], name = '남미',
  boxpoints = "all", jitter = 0.3, pointpos = -1.8))
  
fig.add_trace(go.Box(
  y = df_covid19_100_wide['확진자_아프리카'], name = '아프리카',
  boxpoints = "all", jitter = 0.3, pointpos = -1.8))
  
fig.add_trace(go.Box(
  y = df_covid19_100_wide['확진자_오세아니아'], name = '오세아니아',
  boxpoints = "all", jitter = 0.3, pointpos = -1.8))
  
fig.update_layout(title = dict(text = '한국 및 대륙별 일별 확진자 분포', x = 0.5),
                  xaxis = dict(title = '대륙명'),
                  yaxis = dict(title = '확진자수(명)'),
                  margin = dict(t = 50, b = 25, l = 25, r = 25), 
                  paper_bgcolor='lightgray', plot_bgcolor='lightgray', 
                 colorway = ("#08306B", "#08519C", "#2171B5", "#4292C6", "#6BAED6", "#9ECAE1", "#C6DBEF"))

fig.show()

fig.write_image('D:\\R\\git\\datavisualization\\plotly\\RnPy\\chap3\\fig\\vector\\3-14.pdf')

In [24]:
fig = go.Figure()

## 바이올린 trace 추가
fig.add_trace(go.Violin(
    x = df_취업률['대계열'], y = df_취업률['취업률_계'], 
    ## 바이올린 내부 박스 표시
    box = dict(visible = True),
    ## 평균 선 표시
    meanline = dict(visible = True)))

fig.update_layout(title = dict(text = '한국 및 대륙별 일별 확진자 분포', x = 0.5))

fig.show()

fig.write_image('D:\\R\\git\\datavisualization\\plotly\\RnPy\\chap3\\fig\\vector\\3-16.pdf')