In [1]:
!unzip diabetes_dataset.zip

Archive:  diabetes_dataset.zip
  inflating: diabetes.csv            


In [183]:
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
import numpy as np

In [3]:
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
scatter_matrix(df)

In [4]:
df['InsulinInUse'] = df['Insulin'] > 0
df['SkinThicknessNoZeros'] = df['SkinThickness'].copy()
df.loc[df['SkinThicknessNoZeros'] == 0, 'SkinThicknessNoZeros'] = None
df['IsSkinThicknessZero'] = df['SkinThickness'] == 0

In [5]:
df['Outcome'].mean()

0.3489583333333333

In [6]:

num_features = ['Pregnancies', 'Glucose', 'SkinThickness', 'BMI', 'Age']
cat_features = []



In [325]:
train_df, test_df = train_test_split(df, test_size=0.2)

In [326]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
])

pipeline = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", OneHotEncoder(), cat_features)
])



X_train = pipeline.fit_transform(train_df)
Y_train = train_df['Outcome'].values

X_test = pipeline.fit_transform(test_df)
Y_test = test_df['Outcome'].values


In [327]:

def get_acc(model, use_kmeans=True, k=10, ):

    if use_kmeans:
        kmeans = KMeans(n_clusters=k, n_init=10)
        X_clusters = kmeans.fit_transform(X_train)
        X_repr_i = np.argmin(X_clusters, axis=0)
    else:
        X_repr_i = np.random.choice(np.arange(len(X_train)), size=k, replace=False)

    X_repr = X_train[X_repr_i]
    Y_repr = Y_train[X_repr_i]
    #try:
    model.fit(X_repr, Y_repr)
    Y_hat = model.predict(X_test)
    #except Exception:
     #   print('failed')
     #   return 0
    return ((Y_hat >= 0.5) == Y_test).mean()

In [328]:

n = 100
scores_kmeans = np.array([get_acc(LinearRegression(), True) for _ in range(n)])
scores_random = np.array([get_acc(LinearRegression(), False) for _ in range(n)])
print('Labeling 10 data points from kmeans accuracy:', scores_kmeans.mean())
print('Labeling 10 data points randomly accuracy:', scores_random.mean())

Labeling 10 data points from kmeans accuracy: 0.7009740259740258
Labeling 10 data points randomly accuracy: 0.6612987012987013


In [329]:
model = LogisticRegression()
model.fit(X_train, Y_train)
f'Accuracy (only LogisticRegression): {model.score(X_test, Y_test)}'

'Accuracy (only LogisticRegression): 0.8051948051948052'

In [331]:
pipeline = Pipeline([
    ("kmeans", KMeans(n_clusters=10)),
    ("log_reg", LogisticRegression(max_iter=1000)),
])
pipeline.fit(X_train, Y_train)
pipeline.score(X_test, Y_test)
f'Accuracy (KMeans + LogisticRegression): {pipeline.score(X_test, Y_test)}'



'Accuracy (KMeans + LogisticRegression): 0.8181818181818182'