In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [2]:
diabetes_dataset = pd.read_csv("diabetes.csv")

In [3]:
X = diabetes_dataset.drop(columns = 'Outcome', axis=1)
Y = diabetes_dataset['Outcome']

In [4]:
scaler = StandardScaler()
scaler.fit(X)
standarized_data = scaler.transform(X)


In [5]:
X = standarized_data
Y = diabetes_dataset['Outcome']

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [7]:
classifier = svm.SVC(kernel='linear')

In [8]:
classifier.fit(X_train, Y_train)

In [9]:
import pickle
filename = 'diabetes_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [10]:
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [11]:
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [14]:
input_data = (10,108,66,0,0,32.4,0.272,42)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = scaler.transform(input_data_reshaped)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[[ 1.82781311 -0.40356202 -0.16054575 -1.28821221 -0.69289057  0.05170968
  -0.60365015  0.74529338]]
[0]
The person is not diabetic




In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# loading the diabetes dataset to a pandas DataFrame
diabetes_dataset = pd.read_csv("diabetes.csv")

# separating the data and labels
X = diabetes_dataset.drop(columns = 'Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# standardizing the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# defining the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

# creating a GridSearchCV object
grid = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=2)
grid.fit(X_train, Y_train)

# best parameters
print("Best parameters found: ", grid.best_params_)

# using the best estimator
best_classifier = grid.best_estimator_

# accuracy score on the training data
X_train_prediction = best_classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

# accuracy score on the test data
X_test_prediction = best_classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

# saving the best model
filename = 'best_diabetes_model.sav'
pickle.dump(best_classifier, open(filename, 'wb'))

# loading the saved model
loaded_model = pickle.load(open('best_diabetes_model.sav', 'rb'))

# predicting with the loaded model
input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
input_data_reshaped = scaler.transform(input_data_reshaped)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
    print('The person is not diabetic')
else:
    print('The person is diabetic')


Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1



In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# loading the diabetes dataset to a pandas DataFrame
diabetes_dataset = pd.read_csv("diabetes.csv")

# separating the data and labels
X = diabetes_dataset.drop(columns = 'Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# standardizing the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# defining the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# creating a GridSearchCV object
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, verbose=2)
grid.fit(X_train, Y_train)

# best parameters
print("Best parameters found: ", grid.best_params_)

# using the best estimator
best_classifier = grid.best_estimator_

# accuracy score on the training data
X_train_prediction = best_classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

# accuracy score on the test data
X_test_prediction = best_classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

# saving the best model
filename = 'best_diabetes_model_1.sav'
pickle.dump(best_classifier, open(filename, 'wb'))

# loading the saved model
loaded_model = pickle.load(open('best_diabetes_model_1.sav', 'rb'))

# predicting with the loaded model
input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
input_data_reshaped = scaler.transform(input_data_reshaped)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
    print('The person is not diabetic')
else:
    print('The person is diabetic')


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END m



In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pickle

# loading the diabetes dataset to a pandas DataFrame
diabetes_dataset = pd.read_csv("diabetes.csv")

# separating the data and labels
X = diabetes_dataset.drop(columns = 'Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# standardizing the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# defining the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'bootstrap': [True, False]
}

# creating a GridSearchCV object with cross-validation
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, refit=True, verbose=2)
grid.fit(X_train, Y_train)

# best parameters
print("Best parameters found: ", grid.best_params_)

# using the best estimator
best_classifier = grid.best_estimator_

# accuracy score on the training data
X_train_prediction = best_classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

# cross-validated accuracy score on the test data
cv_scores = cross_val_score(best_classifier, X_test, Y_test, cv=5)
print('Cross-validated accuracy score of the test data : ', cv_scores.mean())

# saving the best model
filename = 'best_diabetes_model_2.sav'
pickle.dump(best_classifier, open(filename, 'wb'))

# loading the saved model
loaded_model = pickle.load(open('best_diabetes_model_2.sav', 'rb'))

# predicting with the loaded model
input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
input_data_reshaped = scaler.transform(input_data_reshaped)
prediction = loaded_model.predict(input_data_reshaped)

print(prediction)
if (prediction[0] == 0):
    print('The person is not diabetic')
else:
    print('The person is diabetic')


Fitting 5 folds for each of 480 candidates, totalling 2400 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tot



In [20]:
loaded_model = pickle.load(open('best_diabetes_model_2.sav', 'rb'))

# predicting with the loaded model
input_data = (0, 114, 80, 34, 285, 44.2, 0.167, 27)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
input_data_reshaped = scaler.transform(input_data_reshaped)
prediction = loaded_model.predict(input_data_reshaped)

print(prediction)
if (prediction[0] == 0):
    print('The person is not diabetic')
else:
    print('The person is diabetic')


[0]
The person is not diabetic




In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pickle
from scipy.stats import randint

# loading the diabetes dataset to a pandas DataFrame
diabetes_dataset = pd.read_csv("diabetes.csv")

# Checking for missing data
print(diabetes_dataset.isnull().sum())

# separating the data and labels
X = diabetes_dataset.drop(columns = 'Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# standardizing the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# defining the parameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2', None]  # Fixing max_features parameter
}

# creating a RandomizedSearchCV object with cross-validation
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

# Fitting the RandomizedSearchCV to training data
random_search.fit(X_train, Y_train)

# best parameters from the random search
print("Best parameters found: ", random_search.best_params_)

# using the best estimator from RandomizedSearchCV
best_classifier = random_search.best_estimator_

# accuracy score on the training data
X_train_prediction = best_classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

# cross-validated accuracy score on the test data
cv_scores = cross_val_score(best_classifier, X_test, Y_test, cv=5)
print('Cross-validated accuracy score of the test data : ', cv_scores.mean())

# saving the best model
filename = 'best_diabetes_model_optimized.sav'
pickle.dump(best_classifier, open(filename, 'wb'))

# loading the saved model
loaded_model = pickle.load(open('best_diabetes_model_optimized.sav', 'rb'))

# predicting with the loaded model
input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
input_data_reshaped = scaler.transform(input_data_reshaped)
prediction = loaded_model.predict(input_data_reshaped)

print(prediction)
if (prediction[0] == 0):
    print('The person is not diabetic')
else:
    print('The person is diabetic')


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'bootstrap': False, 'max_depth': 40, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 19, 'n_estimators': 507}
Accuracy score of the training data :  0.9364820846905537
Cross-validated accuracy score of the test data :  0.727741935483871
[1]
The person is diabetic




In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pickle

# Tải dữ liệu
diabetes_dataset = pd.read_csv("diabetes.csv")

# Tách dữ liệu và nhãn
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Chia dữ liệu thành train và test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Tối ưu hóa siêu tham số với GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'bootstrap': [True, False]
}

grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, refit=True, verbose=2)
grid.fit(X_train, Y_train)

# In ra các tham số tốt nhất
print("Best parameters found: ", grid.best_params_)

# Sử dụng mô hình tốt nhất
best_classifier = grid.best_estimator_

# Tính độ chính xác trên dữ liệu huấn luyện
X_train_prediction = best_classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data: ', training_data_accuracy)

# Tính độ chính xác trên dữ liệu kiểm tra với Cross-validation
cv_scores = cross_val_score(best_classifier, X_test, Y_test, cv=5)
print('Cross-validated accuracy score on test data: ', cv_scores.mean())

# Lưu mô hình
filename = 'best_diabetes_model_rf.sav'
pickle.dump(best_classifier, open(filename, 'wb'))

# Dự đoán với mô hình đã lưu
loaded_model = pickle.load(open('best_diabetes_model_rf.sav', 'rb'))

input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)  # Ví dụ dữ liệu
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
input_data_reshaped = scaler.transform(input_data_reshaped)
prediction = loaded_model.predict(input_data_reshaped)

print(prediction)
if (prediction[0] == 0):
    print('The person is not diabetic')
else:
    print('The person is diabetic')


Fitting 5 folds for each of 480 candidates, totalling 2400 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tot



In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pickle

# Tải dữ liệu
diabetes_dataset = pd.read_csv("diabetes.csv")

# Tách dữ liệu và nhãn
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Chia dữ liệu thành train và test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Khởi tạo mô hình Gradient Boosting
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.05)
gb_classifier.fit(X_train, Y_train)

# Tính độ chính xác trên dữ liệu huấn luyện
Y_train_pred = gb_classifier.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Training Accuracy: {training_accuracy}")

# Tính độ chính xác trên dữ liệu kiểm tra
Y_test_pred = gb_classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Lưu mô hình
filename = 'best_diabetes_model_gb.sav'
pickle.dump(gb_classifier, open(filename, 'wb'))

# Dự đoán với mô hình đã lưu
loaded_model = pickle.load(open('best_diabetes_model_gb.sav', 'rb'))

input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)  # Ví dụ dữ liệu
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
input_data_reshaped = scaler.transform(input_data_reshaped)
prediction = loaded_model.predict(input_data_reshaped)

print(prediction)
if (prediction[0] == 0):
    print('The person is not diabetic')
else:
    print('The person is diabetic')


Training Accuracy: 0.8811074918566775
Test Accuracy: 0.7337662337662337
[1]
The person is diabetic




In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import pickle

# Tải dữ liệu
diabetes_dataset = pd.read_csv("diabetes.csv")

# Tách dữ liệu và nhãn
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Chia dữ liệu thành train và test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Khởi tạo mô hình XGBoost
xgb_classifier = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
xgb_classifier.fit(X_train, Y_train)

# Tính độ chính xác trên dữ liệu huấn luyện
Y_train_pred = xgb_classifier.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Training Accuracy: {training_accuracy}")

# Tính độ chính xác trên dữ liệu kiểm tra
Y_test_pred = xgb_classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Lưu mô hình
filename = 'best_diabetes_model_xgb.sav'
pickle.dump(xgb_classifier, open(filename, 'wb'))

# Dự đoán với mô hình đã lưu
loaded_model = pickle.load(open('best_diabetes_model_xgb.sav', 'rb'))

input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)  # Ví dụ dữ liệu
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
input_data_reshaped = scaler.transform(input_data_reshaped)
prediction = loaded_model.predict(input_data_reshaped)

print(prediction)
if (prediction[0] == 0):
    print('The person is not diabetic')
else:
    print('The person is diabetic')


Training Accuracy: 0.8892508143322475
Test Accuracy: 0.7402597402597403
[1]
The person is diabetic




In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import pickle

# Tải dữ liệu
diabetes_dataset = pd.read_csv("diabetes.csv")

# Tách dữ liệu và nhãn
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Chia dữ liệu thành train và test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Khởi tạo mô hình LightGBM
lgb_classifier = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
lgb_classifier.fit(X_train, Y_train)

# Tính độ chính xác trên dữ liệu huấn luyện
Y_train_pred = lgb_classifier.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Training Accuracy: {training_accuracy}")

# Tính độ chính xác trên dữ liệu kiểm tra
Y_test_pred = lgb_classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Lưu mô hình
filename = 'best_diabetes_model_lgb.sav'
pickle.dump(lgb_classifier, open(filename, 'wb'))

# Dự đoán với mô hình đã lưu
loaded_model = pickle.load(open('best_diabetes_model_lgb.sav', 'rb'))

input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)  # Ví dụ dữ liệu
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
input_data_reshaped = scaler.transform(input_data_reshaped)
prediction = loaded_model.predict(input_data_reshaped)

print(prediction)
if (prediction[0] == 0):
    print('The person is not diabetic')
else:
    print('The person is diabetic')


[LightGBM] [Info] Number of positive: 214, number of negative: 400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 680
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.348534 -> initscore=-0.625489
[LightGBM] [Info] Start training from score -0.625489
Training Accuracy: 0.8957654723127035
Test Accuracy: 0.7597402597402597
[1]
The person is diabetic




In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pickle

# Tải dữ liệu
diabetes_dataset = pd.read_csv("diabetes.csv")

# Tách dữ liệu và nhãn
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Chia dữ liệu thành train và test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Khởi tạo và huấn luyện mô hình Logistic Regression
logreg_classifier = LogisticRegression(random_state=2, solver='liblinear')
logreg_classifier.fit(X_train, Y_train)

# Dự đoán và tính độ chính xác
Y_train_pred = logreg_classifier.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Training Accuracy: {training_accuracy}")

Y_test_pred = logreg_classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Lưu mô hình
filename = 'best_diabetes_model_logreg.sav'
pickle.dump(logreg_classifier, open(filename, 'wb'))


Training Accuracy: 0.7833876221498371
Test Accuracy: 0.7597402597402597


In [4]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pickle

# Tải dữ liệu
diabetes_dataset = pd.read_csv("diabetes.csv")

# Tách dữ liệu và nhãn
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Chia dữ liệu thành train và test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Khởi tạo và huấn luyện mô hình SVM
svm_classifier = SVC(kernel='rbf', gamma='scale', random_state=2)
svm_classifier.fit(X_train, Y_train)

# Dự đoán và tính độ chính xác
Y_train_pred = svm_classifier.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Training Accuracy: {training_accuracy}")

Y_test_pred = svm_classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Lưu mô hình
filename = 'best_diabetes_model_svm.sav'
pickle.dump(svm_classifier, open(filename, 'wb'))


Training Accuracy: 0.8289902280130294
Test Accuracy: 0.7272727272727273


In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pickle

# Tải dữ liệu
diabetes_dataset = pd.read_csv("diabetes.csv")

# Tách dữ liệu và nhãn
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Chia dữ liệu thành train và test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Khởi tạo và huấn luyện mô hình KNN
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, Y_train)

# Dự đoán và tính độ chính xác
Y_train_pred = knn_classifier.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Training Accuracy: {training_accuracy}")

Y_test_pred = knn_classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Lưu mô hình
filename = 'best_diabetes_model_knn.sav'
pickle.dump(knn_classifier, open(filename, 'wb'))


Training Accuracy: 0.8289902280130294
Test Accuracy: 0.7207792207792207


In [6]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pickle

# Tải dữ liệu
diabetes_dataset = pd.read_csv("diabetes.csv")

# Tách dữ liệu và nhãn
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Chia dữ liệu thành train và test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Khởi tạo và huấn luyện mô hình MLP
mlp_classifier = MLPClassifier(hidden_layer_sizes=(50, 30), max_iter=1000, random_state=2)
mlp_classifier.fit(X_train, Y_train)

# Dự đoán và tính độ chính xác
Y_train_pred = mlp_classifier.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Training Accuracy: {training_accuracy}")

Y_test_pred = mlp_classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Lưu mô hình
filename = 'best_diabetes_model_mlp.sav'
pickle.dump(mlp_classifier, open(filename, 'wb'))


Training Accuracy: 0.9690553745928339
Test Accuracy: 0.6623376623376623


In [7]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pickle

# Tải dữ liệu
diabetes_dataset = pd.read_csv("diabetes.csv")

# Tách dữ liệu và nhãn
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Chia dữ liệu thành train và test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Khởi tạo và huấn luyện mô hình AdaBoost
adaboost_classifier = AdaBoostClassifier(n_estimators=100, random_state=2)
adaboost_classifier.fit(X_train, Y_train)

# Dự đoán và tính độ chính xác
Y_train_pred = adaboost_classifier.predict(X_train)
training_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Training Accuracy: {training_accuracy}")

Y_test_pred = adaboost_classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Lưu mô hình
filename = 'best_diabetes_model_adaboost.sav'
pickle.dump(adaboost_classifier, open(filename, 'wb'))




Training Accuracy: 0.8566775244299675
Test Accuracy: 0.7402597402597403


In [4]:
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler

# Bước 1: Tải mô hình đã lưu
filename = 'best_diabetes_model_2.sav'  # Đảm bảo tên file đúng
loaded_model = pickle.load(open(filename, 'rb'))

# Bước 2: Nhập dữ liệu từ người dùng (thủ công)
# Giả sử các đặc trưng của bệnh tiểu đường: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age
print("Vui lòng nhập các giá trị sau:")
pregnancies = float(input("Số lần mang thai: "))
glucose = float(input("Lượng glucose: "))
blood_pressure = float(input("Huyết áp: "))
skin_thickness = float(input("Độ dày da: "))
insulin = float(input("Lượng insulin: "))
bmi = float(input("Chỉ số BMI: "))
diabetes_pedigree = float(input("Diabetes Pedigree Function: "))
age = float(input("Tuổi: "))

# Tạo mảng đầu vào từ dữ liệu đã nhập
input_data = np.array([pregnancies, glucose, blood_pressure, skin_thickness, insulin, bmi, diabetes_pedigree, age])

# Bước 3: Tiền xử lý (chuẩn hóa dữ liệu nếu cần thiết)
scaler = StandardScaler()
input_data_reshaped = input_data.reshape(1, -1)  # Chuyển thành mảng 2 chiều (1 mẫu, nhiều đặc trưng)
input_data_scaled = scaler.fit_transform(input_data_reshaped)  # Chuẩn hóa dữ liệu

# Bước 4: Dự đoán với mô hình đã tải
prediction = loaded_model.predict(input_data_scaled)

# Bước 5: Hiển thị kết quả
if prediction[0] == 0:
    print("Người này không bị tiểu đường.")
else:
    print("Người này bị tiểu đường.")


NameError: name 'scaler' is not defined