In [1]:
# BASIC TOOLS
import numpy as np
import pandas as pd
import datetime as dt

# STATISTIC TOOLS
import matplotlib.pyplot as plt 
import seaborn as sns
import scipy.stats as stats
import pingouin as pg 
import statsmodels.formula.api as smf
from lifelines import KaplanMeierFitter, NelsonAalenFitter, CoxPHFitter

# PRE - PROCESSING TOOLS 
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# SUPERVISED LEARNING TOOLS
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score

# UNSUPERVISED LEARNING TOOLS
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.manifold import TSNE 
from sklearn.decomposition import PCA, NMF

# DEEP LEARNING TOOLS 
from sklearn.neural_network import MLPRegressor, MLPClassifier

  return warn(


# IMPORT DATA

In [2]:
xls = pd.ExcelFile('D:\\Users\\Desktop\\โครงการ\\CommunityProject\\CommunityProject.xlsx')
print(xls.sheet_names)

['dependent', 'diagnosis', 'Example']


In [3]:
comdiag = xls.parse('diagnosis')
comdiag = comdiag.iloc[:, 0:]
comdiag

Unnamed: 0,ID,fatigue,resistance,ambulation,multicomorbd,weightloss,prefrail,frail,gender,age,...,bmi,previousfall,alone,exercise,polypharmacy,smoking,alcohol,underlying,knowfrail,frailimpact
0,0,0,1,1,0,0,1,0,m,75,...,21.48,1,0,1,0,no,no,"[ht,dlp]",1,extreme
1,1,0,0,0,0,0,0,0,m,75,...,21.3,0,0,0,0,current,current,"[ht,dlp]",0,high
2,2,0,1,0,0,1,1,0,f,66,...,27.56,0,0,0,0,no,no,"[dm,ht,dlp]",0,extreme
3,3,0,1,0,0,0,1,0,m,63,...,31.53,0,0,0,1,no,previous,"[dm,ht,dlp,gout]",0,high
4,4,0,0,0,0,0,0,0,f,84,...,28.8,0,0,0,0,no,no,[ht],0,extreme
5,5,0,0,0,0,0,0,0,f,71,...,30.1,0,0,0,0,no,no,[ht],1,moderate
6,6,0,0,0,0,0,0,0,f,67,...,19.83,0,0,0,0,no,no,[no],1,moderate
7,7,0,0,0,0,0,0,0,m,67,...,20.0,1,0,1,0,current,current,[no],1,moderate
8,8,0,1,0,0,0,1,0,m,64,...,29.4,0,0,0,0,previous,previous,"[dm,ht,dlp,gad]",1,moderate
9,9,0,0,0,0,0,0,0,f,66,...,26.56,0,0,0,0,no,no,"[ht,dlp]",0,extreme


# UNDERSTAND THE DATA

In [4]:
comdiag.info()
comdiag.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             54 non-null     int64  
 1   fatigue        54 non-null     int64  
 2   resistance     54 non-null     int64  
 3   ambulation     54 non-null     int64  
 4   multicomorbd   54 non-null     int64  
 5   weightloss     54 non-null     int64  
 6   prefrail       54 non-null     int64  
 7   frail          54 non-null     int64  
 8   gender         54 non-null     object 
 9   age            54 non-null     int64  
 10  problem        54 non-null     object 
 11  problemimpact  54 non-null     object 
 12  bw             54 non-null     float64
 13  ht             54 non-null     int64  
 14  bmi            54 non-null     float64
 15  previousfall   54 non-null     int64  
 16  alone          54 non-null     int64  
 17  exercise       54 non-null     int64  
 18  polypharmacy

ID               0
fatigue          0
resistance       0
ambulation       0
multicomorbd     0
weightloss       0
prefrail         0
frail            0
gender           0
age              0
problem          0
problemimpact    0
bw               0
ht               0
bmi              0
previousfall     0
alone            0
exercise         0
polypharmacy     0
smoking          0
alcohol          0
underlying       0
knowfrail        0
frailimpact      0
dtype: int64

# DATA PREPARATION

find common comorbidities

In [5]:
commonunderlying = comdiag['underlying'].explode().value_counts(normalize = True)
commonunderlying

[dm,ht,dlp]                                0.240741
[ht,dlp]                                   0.129630
[dm,ht]                                    0.092593
[ht]                                       0.074074
[no]                                       0.074074
[ht,dlp,gad]                               0.055556
[ht,dlp,osteoporosis,gad,schizophrenia]    0.018519
[dm,dlp,ckd]                               0.018519
[dm,dlp]                                   0.018519
[dm.dlp]                                   0.018519
[dm,ht,dementia,depression]                0.018519
[dm,ht,dlp,ckd]                            0.018519
[ht,dlp,ckd]                               0.018519
[ht,dlp,ckd,dementia,parkinsonism]         0.018519
[dm,ht,dlp,dyspepsia]                      0.018519
[dm]                                       0.018519
[dm,ht,dlp,stroke]                         0.018519
[dm,ht,dlp,osteoporosis]                   0.018519
[ht,dlp,osteoporosis,gad]                  0.018519
[dm,ht,dlp,h

feature engineering

In [6]:
conditions = ['dm','dlp','ht','ckd','stroke','dementia','depression','anxiety','oa','osteoporosis'] # เลือก conditions ที่เราสนใจที่จะนำมาวิเคราะห์ หรือสร้างโมเดล
for i in conditions :
    comdiag[i] = np.where(comdiag['underlying'].str.contains(i), 1, 0)
comdiag.loc[:, ['dm','dlp','ht','ckd','stroke','dementia','depression','anxiety','oa','osteoporosis']].head(10)

Unnamed: 0,dm,dlp,ht,ckd,stroke,dementia,depression,anxiety,oa,osteoporosis
0,0,1,1,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0
8,1,1,1,0,0,0,0,0,0,0
9,0,1,1,0,0,0,0,0,0,0


# EXPLORATORY DATA ANALYSIS

นับปัญหาที่พบบ่อยในชุมชน จาก frailty screening & comorbidities

In [7]:
problems = comdiag.loc[:, ['frail', 'prefrail','dm','dlp','ht','ckd','stroke','dementia','depression','anxiety','oa','osteoporosis']].sum()
print('ดังนั้น จากการสำรวจปัญหาที่พบในชุมชน พบว่าภาวะที่พบบ่อยที่สุด 4 อันดับได้แก่')
problems.sort_values(ascending = False) * 100 / len(comdiag) 


ดังนั้น จากการสำรวจปัญหาที่พบในชุมชน พบว่าภาวะที่พบบ่อยที่สุด 4 อันดับได้แก่


ht              85.185185
dlp             72.222222
dm              59.259259
prefrail        50.000000
frail           11.111111
ckd              7.407407
osteoporosis     5.555556
dementia         3.703704
stroke           1.851852
depression       1.851852
oa               1.851852
anxiety          0.000000
dtype: float64

ความรุนแรงของปัญหา : columns "problemimpact" & "frailimpact"

In [8]:
# เลือก problem ที่ตรงกับ 4 อันดับของ underlying
topNCDs = comdiag[comdiag['problem'].isin(['ht','dlp','dm'])]
topNCDs.groupby('problem')['problemimpact'].value_counts()

problem  problemimpact
dm       very             8
         moderate         7
         extreme          3
         low              3
ht       moderate         4
         very             4
         low              2
Name: problemimpact, dtype: int64