In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import joblib
import numpy as np

import json

In [2]:
# Load the dataset
df = pd.read_csv('data/diabetes.csv')

# Separate features and target variable
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with StandardScaler and SVC classifier
pipeline = make_pipeline(StandardScaler(), LogisticRegression())

# Perform cross-validation on the training data
# Here, we're using accuracy as the scoring metric
# You can change the scoring metric as per your requirements
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))

# Now, you can fit the pipeline on the whole training data
pipeline.fit(X_train, y_train)

# Evaluate the model on the test set
test_score = pipeline.score(X_test, y_test)
print("Test set accuracy:", test_score)

Cross-validation scores: [0.74796748 0.80487805 0.73170732 0.73170732 0.78688525]
Mean CV accuracy: 0.7606290817006529
Test set accuracy: 0.7532467532467533


In [3]:
joblib.dump(pipeline, './model/logistic_regression_model.pkl')
model  = joblib.load('./model/logistic_regression_model.pkl')

In [4]:
test_score = model.score(X_test, y_test)
print("Test set accuracy:", test_score)

Test set accuracy: 0.7532467532467533


In [5]:
data_new = X_test.iloc[[0]]
model.predict_proba(data_new)

array([[0.72263617, 0.27736383]])

In [20]:
json_data = '{"Pregnancies": "1", "Glucose": "100", "BloodPressure": "60", "SkinThickness": "30", "Insulin": "200", "BMI": "30", "DiabetesPedigreeFunction": "0.4", "Age": "40"}'

data = json.loads(json_data)
data_df = pd.DataFrame(data, index=[0])

model = joblib.load('./model/logistic_regression_model.pkl')

prediction = model.predict_proba(data_df)
print(prediction)

[[0.85785548 0.14214452]]


In [22]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, 0 to 0
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Pregnancies               1 non-null      object
 1   Glucose                   1 non-null      object
 2   BloodPressure             1 non-null      object
 3   SkinThickness             1 non-null      object
 4   Insulin                   1 non-null      object
 5   BMI                       1 non-null      object
 6   DiabetesPedigreeFunction  1 non-null      object
 7   Age                       1 non-null      object
dtypes: object(8)
memory usage: 72.0+ bytes


In [15]:
data

{'Pregnancies': 1,
 'Glucose': 100,
 'BloodPressure': 60,
 'SkinThickness': 30,
 'Insulin': 200,
 'BMI': 30,
 'DiabetesPedigreeFunction': 0.4,
 'Age': 40}

In [14]:
prediction[0].tolist()

[0.8578554813867604, 0.1421445186132396]

In [None]:
model = joblib.load('./model/logistic_regression_model.pkl')