In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings

In [2]:
data = pd.read_csv('../data/diabetes_012_health_indicators_BRFSS2015.csv')

## 1. Предобработка данных.

In [3]:
data["Diabetes_012"] = data["Diabetes_012"].astype(int)
data["HighBP"] = data["HighBP"].astype(int)
data["HighChol"] = data["HighChol"].astype(int)
data["CholCheck"] = data["CholCheck"].astype(int)
data["BMI"] = data["BMI"].astype(int)
data["Smoker"] = data["Smoker"].astype(int)
data["Stroke"] = data["Stroke"].astype(int)
data["HeartDiseaseorAttack"] = data["HeartDiseaseorAttack"].astype(int)
data["PhysActivity"] = data["PhysActivity"].astype(int)
data["Fruits"] = data["Fruits"].astype(int) 
data["Veggies"] = data["Veggies"].astype(int)
data["HvyAlcoholConsump"] = data["HvyAlcoholConsump"].astype(int)
data["AnyHealthcare"] = data["AnyHealthcare"].astype(int)
data["NoDocbcCost"] = data["NoDocbcCost"].astype(int)
data["GenHlth"] = data["GenHlth"].astype(int)
data["PhysHlth"] = data["PhysHlth"].astype(int)
data["DiffWalk"] = data["DiffWalk"].astype(int)
data["Sex"] = data["Sex"].astype(int)
data["Age"] = data["Age"].astype(int)
data["Education"] = data["Education"].astype(int)
data["Income"] =data["Income"].astype(int)

In [4]:
data.isna().sum()

Diabetes_012            0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64

In [5]:
duplicates = data[data.duplicated()]
duplicates.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23899 entries, 1242 to 253638
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Diabetes_012          23899 non-null  int64  
 1   HighBP                23899 non-null  int64  
 2   HighChol              23899 non-null  int64  
 3   CholCheck             23899 non-null  int64  
 4   BMI                   23899 non-null  int64  
 5   Smoker                23899 non-null  int64  
 6   Stroke                23899 non-null  int64  
 7   HeartDiseaseorAttack  23899 non-null  int64  
 8   PhysActivity          23899 non-null  int64  
 9   Fruits                23899 non-null  int64  
 10  Veggies               23899 non-null  int64  
 11  HvyAlcoholConsump     23899 non-null  int64  
 12  AnyHealthcare         23899 non-null  int64  
 13  NoDocbcCost           23899 non-null  int64  
 14  GenHlth               23899 non-null  int64  
 15  MentHlth            

In [6]:
data.drop_duplicates(inplace = True)

## 2. Разбиение на обучающую и тестовую выборки.

In [7]:
y = data["Diabetes_012"]
X = data.drop(["Diabetes_012"], axis=1)

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [11]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [12]:
X[y==0].shape

(190055, 21)

In [13]:
X[y==1].shape

(4629, 21)

In [14]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()
X_res, y_res = rus.fit_resample(X, y)

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_res,y_res,test_size=0.2,random_state=0)

## 3. Обучение моделей.

### ML1. Naive Bayes.

In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, classification_report

In [23]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
predictions_gnb= gnb.predict(X_test)
accuracy_score(y_test, predictions_gnb)

0.476601871850252

In [24]:
predictions_gnb= gnb.predict(X_test)
print(classification_report(y_test, predictions_gnb))
print('Predicted labels: ', predictions_gnb)

              precision    recall  f1-score   support

           0       0.49      0.66      0.57       878
           1       0.41      0.13      0.20       956
           2       0.48      0.65      0.55       944

    accuracy                           0.48      2778
   macro avg       0.46      0.48      0.44      2778
weighted avg       0.46      0.48      0.43      2778

Predicted labels:  [0 0 2 ... 2 2 1]


### ML2. KMeans.

In [25]:
from sklearn.cluster import KMeans

In [72]:
kmeans = KMeans(n_clusters=3, random_state=100)

In [76]:
data_c = scaler.fit_transform(data)
kmeans_fit = kmeans.fit(data_c)

  super()._check_params_vs_input(X, default_n_init=10)


In [85]:
kmeans_fit.cluster_centers_

array([[ 0.56993087,  0.4654769 ,  0.33931985,  0.096404  ,  0.40802286,
         0.29719638,  0.51650313,  0.56337381, -0.6430796 , -0.16061844,
        -0.24210088, -0.11384848, -0.07811914,  0.45785618,  1.25605876,
         0.85368465,  1.46650668,  1.49470391, -0.15791768,  0.30116078,
        -0.55987115, -0.8775252 ],
       [ 0.20791379,  0.712017  ,  0.43093571,  0.17213617,  0.09713792,
         0.10156407, -0.01948432,  0.11178221,  0.03093302,  0.02205819,
        -0.00455621, -0.02860949,  0.14969538, -0.20143794,  0.00423124,
        -0.27404566, -0.30471903, -0.2221605 ,  0.15267294,  0.55020885,
        -0.03848094,  0.0517552 ],
       [-0.38887673, -0.7520662 , -0.47763873, -0.17552791, -0.23696827,
        -0.19729125, -0.18582979, -0.30929111,  0.22602246,  0.04496826,
         0.09807294,  0.06732642, -0.08948052, -0.01716104, -0.49326782,
        -0.11335961, -0.32778932, -0.40493905, -0.06074403, -0.55832741,
         0.24919021,  0.30077456]])

In [79]:
pr = kmeans_fit.predict(data_c)

In [86]:
kmeans_fit.inertia_

4284406.7390978895

In [83]:
from sklearn.metrics import silhouette_score, silhouette_samples

In [87]:
silhouette_score(data_c, kmeans.labels_)

0.06966353131034572

### ML3. Gradient Boosting.

In [92]:
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier

In [93]:
base_class = DecisionTreeClassifier()
gradient_classifier = GradientBoostingClassifier(learning_rate=0.1)

In [95]:
gradient_classifier.fit(X_train, y_train)

In [96]:
y_pred_gc = gradient_classifier.predict(X_test)

In [97]:
print(classification_report(y_test, y_pred_gc))

              precision    recall  f1-score   support

           0       0.57      0.57      0.57       878
           1       0.39      0.32      0.35       956
           2       0.50      0.59      0.54       944

    accuracy                           0.49      2778
   macro avg       0.49      0.49      0.49      2778
weighted avg       0.49      0.49      0.49      2778



### ML4. Bagging.

In [98]:
bagging_classifier = BaggingClassifier(base_class, n_estimators = 10)

In [99]:
bagging_classifier.fit(X_train, y_train)

In [100]:
y_pred_bc = bagging_classifier.predict(X_test)

In [101]:
print(classification_report(y_test, y_pred_bc))

              precision    recall  f1-score   support

           0       0.49      0.54      0.51       878
           1       0.40      0.39      0.39       956
           2       0.48      0.45      0.46       944

    accuracy                           0.46      2778
   macro avg       0.46      0.46      0.46      2778
weighted avg       0.46      0.46      0.46      2778



### ML5. Stacking.

In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

In [106]:
base_models_class = [
    ('gnb', GaussianNB()),
    ('LogisticRegression', LogisticRegression(C=0.1, penalty='l1', solver='liblinear', max_iter=200, class_weight='balanced'))
]

In [107]:
stacking_classifier = StackingClassifier(estimators=base_models_class, final_estimator=DecisionTreeClassifier())

In [108]:
stacking_classifier.fit(X_train, y_train)

In [109]:
y_pred_stacking_classifier = stacking_classifier.predict(X_test)

In [110]:
print(classification_report(y_test, y_pred_stacking_classifier))

              precision    recall  f1-score   support

           0       0.47      0.51      0.49       878
           1       0.35      0.33      0.34       956
           2       0.42      0.41      0.41       944

    accuracy                           0.41      2778
   macro avg       0.41      0.42      0.41      2778
weighted avg       0.41      0.41      0.41      2778

