# Hepatitis C data

### 1. data setting

In [1]:
# load modules
# basic
import numpy as np
import pandas as pd
from tqdm import tqdm
import itertools

## graph
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## sklearn
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

## statistics
from scipy.stats import pearsonr
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import bartlett
from scipy.stats import levene, ttest_ind
from scipy.stats import chi2_contingency

In [2]:
# load data
data = pd.read_csv('../data/hepatitisCdata.csv')

In [7]:
# use data
use_data = data.iloc[:, 1:]
use_data

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.80,74.0,15.6,76.5
2,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.20,86.0,33.2,79.3
3,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,3=Cirrhosis,62,f,32.0,416.6,5.9,110.3,50.0,5.57,6.30,55.7,650.9,68.5
611,3=Cirrhosis,64,f,24.0,102.8,2.9,44.4,20.0,1.54,3.02,63.0,35.9,71.3
612,3=Cirrhosis,64,f,29.0,87.3,3.5,99.0,48.0,1.66,3.63,66.7,64.2,82.0
613,3=Cirrhosis,46,f,33.0,,39.0,62.0,20.0,3.56,4.20,52.0,50.0,71.0


### 2. demographic data

In [9]:
# 연속형 변수만 가져오기
data_continuous = use_data[list(use_data.dtypes[use_data.dtypes != 'object'].index)]
data_continuous.columns

Index(['Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT',
       'PROT'],
      dtype='object')

In [10]:
# 범주형 변수만 가져오기
data_categorical = use_data[list(use_data.dtypes[use_data.dtypes == 'object'].index)]
data_categorical.columns

Index(['Category', 'Sex'], dtype='object')

In [11]:
# 기준점인 stage 추가
data_continuous['Category'] = data['Category']
data_continuous.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_continuous['Category'] = data['Category']


Index(['Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT',
       'PROT', 'Category'],
      dtype='object')

In [12]:
# 연속형 변수 중앙값(표준오차)
data_continuous.groupby('Category').median()

Unnamed: 0_level_0,Age,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0=Blood Donor,47.0,42.2,66.7,23.1,24.8,6.9,8.35,5.405,78.0,21.4,72.2
0s=suspect Blood Donor,55.0,21.6,106.0,49.2,46.7,4.9,5.33,4.3,52.0,83.0,47.8
1=Hepatitis,37.0,43.5,34.6,15.2,47.2,13.0,9.51,5.06,72.25,45.55,73.65
2=Fibrosis,51.0,41.0,39.55,34.0,70.0,13.0,8.59,4.58,71.4,72.2,76.1
3=Cirrhosis,56.0,33.0,80.05,5.65,92.9,34.0,3.425,3.87,68.5,96.35,70.0


In [13]:
round(data_continuous.groupby('Category').std(), 2)

Unnamed: 0_level_0,Age,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0=Blood Donor,9.62,5.03,18.23,14.5,10.62,6.09,1.88,1.06,14.51,24.74,4.55
0s=suspect Blood Donor,11.07,10.56,52.82,119.84,53.32,3.11,4.64,1.87,52.85,133.77,11.71
1=Hepatitis,11.35,3.51,23.52,22.59,68.78,13.47,2.51,1.45,19.77,116.67,6.08
2=Fibrosis,11.44,3.74,8.31,66.7,42.26,5.33,1.67,0.73,12.92,47.76,5.05
3=Cirrhosis,8.91,5.82,80.05,36.36,74.64,69.37,2.24,0.99,209.56,138.04,7.93


In [14]:
# 범주변수 count
for i in data_categorical.columns:
    if i != 'Category':
        print(f'{i} 변수 결과')
        print(pd.crosstab(data[i], data['Category']))

Sex 변수 결과
Category  0=Blood Donor  0s=suspect Blood Donor  1=Hepatitis  2=Fibrosis  \
Sex                                                                        
f                   215                       1            4           8   
m                   318                       6           20          13   

Category  3=Cirrhosis  
Sex                    
f                  10  
m                  20  


In [15]:
for i in data_categorical.columns:
    if i != 'Category':
        print(f'{i} 변수 결과')
        tmp = pd.crosstab(data['Category'], data[i])
        chi2, p, dof, expected = chi2_contingency(tmp)
        msg = 'Test Statistic: {}\np-value: {}\nDegree of Freedom: {}'
        print(msg.format(round(chi2, 2), round(p, 4), dof))
        print(expected)

Sex 변수 결과
Test Statistic: 7.64
p-value: 0.1057
Degree of Freedom: 4
[[206.26666667 326.73333333]
 [  2.70894309   4.29105691]
 [  9.28780488  14.71219512]
 [  8.12682927  12.87317073]
 [ 11.6097561   18.3902439 ]]


In [24]:
# 연속형 변수
for i in data_continuous.columns:
    if i != 'Category':
        # extract data with not null values
        data2 = data_continuous[~data_continuous[i].isna()]

        # Extract data for different 'Status' categories
        lv_1 = data2[i][(data2['Category'] == '0=Blood Donor')]
        lv_2 = data2[i][(data2['Category'] == '3=Cirrhosis')]
        lv_3 = data2[i][(data2['Category'] == '1=Hepatitis')]
        lv_4 = data2[i][(data2['Category'] == '2=Fibrosis')]
        lv_5 = data2[i][(data2['Category'] == '0s=suspect Blood Donor')]

        # Perform ANOVA
        f_statistic, p_value = stats.f_oneway(lv_1, lv_2, lv_3, lv_4, lv_5)
        print(f'value : {i}')
        print(f'ANOVA f-statistic: {f_statistic:.2f}')
        print(f'ANOVA p-value: {p_value:.4f}')
        print()

value : Age
ANOVA f-statistic: 11.04
ANOVA p-value: 0.0000

value : ALB
ANOVA f-statistic: 46.83
ANOVA p-value: 0.0000

value : ALP
ANOVA f-statistic: 21.44
ANOVA p-value: 0.0000

value : ALT
ANOVA f-statistic: 27.63
ANOVA p-value: 0.0000

value : AST
ANOVA f-statistic: 115.11
ANOVA p-value: 0.0000

value : BIL
ANOVA f-statistic: 68.40
ANOVA p-value: 0.0000

value : CHE
ANOVA f-statistic: 40.68
ANOVA p-value: 0.0000

value : CHOL
ANOVA f-statistic: 17.19
ANOVA p-value: 0.0000

value : CREA
ANOVA f-statistic: 11.36
ANOVA p-value: 0.0000

value : GGT
ANOVA f-statistic: 55.60
ANOVA p-value: 0.0000

value : PROT
ANOVA f-statistic: 29.85
ANOVA p-value: 0.0000

