In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import scale, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import warnings
warnings.simplefilter(action='ignore')
sns.set()
plt.style.use("ggplot")
%matplotlib inline


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import scale, StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold
import warnings
warnings.simplefilter(action='ignore')
sns.set()
plt.style.use("ggplot")
%matplotlib inline


In [2]:
df = pd.read_csv("diabetes.csv")

In [3]:
df.shape

(744, 9)

In [4]:
df['Outcome'].value_counts() * 100 / len(df)

Outcome
0    65.456989
1    34.543011
Name: count, dtype: float64

In [5]:
df.groupby("Outcome").agg({'Pregnancies' : 'max'})

Unnamed: 0_level_0,Pregnancies
Outcome,Unnamed: 1_level_1
0,13
1,17


In [6]:
df.groupby("Outcome").agg({'Glucose':'mean'})

Unnamed: 0_level_0,Glucose
Outcome,Unnamed: 1_level_1
0,110.0
1,140.673152


In [7]:
df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']] = df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']].replace(0, np.nan)

In [8]:
df.isnull().sum()

Pregnancies                 109
Glucose                       5
BloodPressure                35
SkinThickness               220
Insulin                     360
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [9]:
def median_target(var):
    temp = df[df[var].notnull()]
    temp = temp[[var, 'Outcome']].groupby(['Outcome'])[[var]].median().reset_index()
    return temp

In [10]:
columns = df.columns
columns = columns.drop('Outcome')
for i in columns:
    median_target(i)
    df.loc[(df['Outcome'] == 0) & (df[i].isnull()), i] = median_target(i)[i][0]
    df.loc[(df['Outcome'] == 1) & (df[i].isnull()), i] = median_target(i)[i][1]

In [11]:
for feature in df:
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    if df[(df[feature] > upper)].any(axis=None):
        print(feature, 'yes')
    else:
        print(feature, 'no')
    


Pregnancies yes
Glucose no
BloodPressure yes
SkinThickness yes
Insulin yes
BMI yes
DiabetesPedigreeFunction yes
Age yes
Outcome no


In [12]:
Q1 = df['Insulin'].quantile(0.25)
Q3 = df['Insulin'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
df.loc[df['Insulin'] > upper, 'Insulin'] = upper


In [13]:
from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor(n_neighbors=10)
lof.fit_predict(df)

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1, -1,  1,  1

In [14]:
df_scores = lof.negative_outlier_factor_
np.sort(df_scores)[0:30]

array([-3.14573666, -2.39058304, -2.15582442, -2.10411477, -2.09315831,
       -2.01664962, -1.7933143 , -1.74927535, -1.73844651, -1.69372755,
       -1.68568328, -1.67782369, -1.67501727, -1.67270974, -1.63844674,
       -1.63592629, -1.62767323, -1.60057181, -1.59686998, -1.59018231,
       -1.58657753, -1.58113114, -1.55774169, -1.5188314 , -1.5098611 ,
       -1.50336433, -1.50117181, -1.49886673, -1.48491038, -1.48043912])

In [15]:
thresold = np.sort(df_scores)[7]

In [16]:
outlier = df_scores > thresold

In [17]:
df = df[outlier]

In [18]:
NewBMI = pd.Series(['Underweight', 'Nomarl', 'Overweight', 'Obesity 1', 'Obesity 2', 'Obesity 3'], dtype='category')


In [19]:
df['NewBMI'] = NewBMI
df.loc[df['BMI'] < 18.5, 'NewBMI'] = NewBMI[0]
df.loc[(df['BMI'] > 18.5) & df['BMI'] <= 24.9, 'NewBMI'] = NewBMI[1]
df.loc[(df['BMI'] > 24.9) & df['BMI'] <= 29.9, 'NewBMI'] = NewBMI[2]
df.loc[(df['BMI'] > 29.9) & df['BMI'] <= 34.9, 'NewBMI'] = NewBMI[3]
df.loc[(df['BMI'] > 34.9) & df['BMI'] <= 39.9, 'NewBMI'] = NewBMI[4]
df.loc[(df['BMI'] > 39.9), 'NewBMI'] = NewBMI[5]

In [20]:
def set_insuline(row):
    if row['Insulin'] >= 16 and row['Insulin'] <= 166:
        return 'Normal'
    else:
        return 'Abnormal'

In [21]:
df = df.assign(NewInsulinScore = df.apply(set_insuline, axis=1))

In [22]:
NewGlucose = pd.Series(['Low', 'Normal', 'Overweight', 'Secret', 'High'], dtype='category')
df['NewGlucose'] = NewGlucose
df.loc[df['Glucose'] <= 70, 'NewGlucose'] = NewGlucose[0]
df.loc[(df['Glucose'] > 70) & (df['Glucose'] <= 99), 'NewGlucose'] = NewGlucose[1]
df.loc[(df['Glucose'] > 99) & (df['Glucose'] <= 126), 'NewGlucose'] = NewGlucose[2]
df.loc[(df['Glucose'] > 126) , 'NewGlucose'] = NewGlucose[3]

In [23]:
#one hot encoding
df = pd.get_dummies(df, columns = ['NewBMI', 'NewInsulinScore','NewGlucose'], drop_first=True)


In [24]:
categotical_df = df[['NewBMI_Obesity 1',
                     'NewBMI_Obesity 2', 'NewBMI_Obesity 3', 'NewBMI_Overweight',
                     'NewBMI_Underweight', 'NewInsulinScore_Normal', 'NewGlucose_Low',
                     'NewGlucose_Normal', 'NewGlucose_Overweight', 'NewGlucose_Secret'
                     ]]

In [25]:
y = df['Outcome']
X = df.drop(['Outcome', 'NewBMI_Obesity 1', 
             'NewBMI_Obesity 2', 'NewBMI_Obesity 3', 'NewBMI_Overweight',
             'NewBMI_Underweight', 'NewInsulinScore_Normal', 'NewGlucose_Low',
             'NewGlucose_Normal', 'NewGlucose_Overweight', 'NewGlucose_Secret'], axis= 1)

In [26]:
cols = X.columns
index = X.index

In [27]:
from sklearn.preprocessing import RobustScaler
transformer = RobustScaler().fit(X)
X = transformer.transform(X)
X = pd.DataFrame(X, columns= cols, index = index)

In [28]:
X = pd.concat([X, categotical_df], axis= 1)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [30]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

start train

In [31]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Khởi tạo mô hình Gradient Boosting
gbc = GradientBoostingClassifier()

# Siêu tham số để tối ưu hóa
parameters = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3],
    'subsample': [0.8, 0.9, 1.0],  # Điều chỉnh tỷ lệ mẫu
    'max_features': ['sqrt', 'log2', None]  # Cải thiện sự phân bố của tính năng
}

# Tạo đối tượng GridSearchCV với cross-validation
grid_search_gbc = GridSearchCV(gbc, parameters, cv=5, n_jobs=-1, verbose=2)

# Huấn luyện mô hình
grid_search_gbc.fit(X_train, y_train)

# In kết quả siêu tham số tốt nhất
print("Best parameters found: ", grid_search_gbc.best_params_)

# Đánh giá mô hình tốt nhất
best_gbc = grid_search_gbc.best_estimator_

# Dự đoán và tính độ chính xác
train_accuracy = best_gbc.score(X_train, y_train)
test_accuracy = best_gbc.score(X_test, y_test)

print(f"Training accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")


Fitting 5 folds for each of 10368 candidates, totalling 51840 fits
Best parameters found:  {'learning_rate': 0.01, 'loss': 'exponential', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150, 'subsample': 0.9}
Training accuracy: 0.9081632653061225
Test accuracy: 0.8783783783783784


In [None]:
grid_search_gbc.best_params_

{'learning_rate': 0.1,
 'loss': 'exponential',
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'min_samples_split': 5,
 'n_estimators': 150,
 'subsample': 0.8}

In [None]:
grid_search_gbc.best_score_

np.float64(0.8979948516461184)

In [32]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Khởi tạo mô hình GradientBoostingClassifier với các siêu tham số đã chọn
gbc = GradientBoostingClassifier(
    learning_rate=0.1, 
    loss='exponential', 
    n_estimators=150,
    max_depth=4, 
    max_features='sqrt', 
    min_samples_leaf=3, 
    min_samples_split=5, 
    subsample=0.8
)

# Huấn luyện mô hình với dữ liệu huấn luyện
gbc.fit(X_train, y_train)

# Dự đoán kết quả trên bộ kiểm tra
y_pred = gbc.predict(X_test)

# Tính độ chính xác trên dữ liệu kiểm tra
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Test accuracy: {test_accuracy}")


Test accuracy: 0.8648648648648649


In [33]:
import pickle
model = gbc
pickle.dump(model, open("diabetes_final_2.pkl", 'wb'))

end train

In [None]:
gbc = grid_search_gbc.best_estimator_
y_pred = gbc.predict(X_test)
print(accuracy_score(y_train, gbc.predict(X_train)))

gbc_acc = accuracy_score(y_test, gbc.predict(X_test))

print(accuracy_score(y_test, gbc.predict(X_test)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9778911564625851
0.8581081081081081
[[81 10]
 [11 46]]
              precision    recall  f1-score   support

           0       0.88      0.89      0.89        91
           1       0.82      0.81      0.81        57

    accuracy                           0.86       148
   macro avg       0.85      0.85      0.85       148
weighted avg       0.86      0.86      0.86       148



In [34]:

gbc = GradientBoostingClassifier()

parameters = {
    'loss' : ['deviance', 'exponential'],
    'learning_rate' : [0.001, 0.1, 1, 10],
    'n_estimators' : [100, 150, 180, 200]
}
grid_search_gbc = GridSearchCV(gbc, parameters, cv = 10, n_jobs = -1, verbose=1)
grid_search_gbc.fit(X_train, y_train)


Fitting 10 folds for each of 32 candidates, totalling 320 fits


In [35]:
gbc = GradientBoostingClassifier(learning_rate= 0.1, loss = 'exponential', n_estimators=150)
gbc.fit(X_train, y_train)

In [None]:
#gbc = GradientBoostingClassifier(learning_rate= 0.1, loss = 'exponential', n_estimators=150)
#gbc.fit(X_train, y_train)

In [36]:
import pickle
model = gbc
pickle.dump(model, open("diabetes_final_3.pkl", 'wb'))