In [None]:
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from sklearn.cluster import AgglomerativeClustering
import warnings
# Suppress every warning for the rest of the session. This also hides any
# convergence warnings raised while fitting the models below.
warnings.filterwarnings("ignore")
# Run the preprocessing notebook first. `cluster_df` (used by the scaling/PCA
# cell) is not defined anywhere in this notebook, so it is presumably created
# by that run -- TODO confirm.
%run PRE_PROCESSING_DIABETES_DATA.ipynb

In [None]:
# Transformers are kept as globals because later cells invert them.
scaler = MinMaxScaler()
pca = PCA(n_components=2)

# Step 1: rescale every feature of cluster_df with MinMaxScaler.
scaled_cluster_df = scaler.fit_transform(cluster_df)

# Step 2: project the scaled features onto the first two principal components
# and wrap the result in a labelled DataFrame.
principalComponents = pca.fit_transform(scaled_cluster_df)
pc_column_names = ['principal component 1', 'principal component 2']
principalDf = pd.DataFrame(principalComponents, columns=pc_column_names)

In [None]:
# Cluster the observations into two groups using only the two PCA coordinates.
#
# random_state is pinned (same seed as the elbow-plot cell) so that a
# Restart & Run All reproduces the same partition AND the same 0/1 numbering;
# without it the labels can swap between runs, which would silently flip the
# meaning of the 'Label' column used as the regression target further down.
# n_init is spelled out so the result does not depend on the installed
# scikit-learn version's default.
pca_features = principalDf[['principal component 1', 'principal component 2']]
km = KMeans(n_clusters=2, n_init=10, random_state=42).fit(pca_features)
predictions = km.predict(pca_features)

# Store the cluster id next to the PCA coordinates (column index 2).
principalDf['Label'] = predictions
principalDf

In [None]:
# Cluster centres found by KMeans, expressed in the two PCA coordinates the
# model was fitted on (one row per cluster). Reused by the scatter plot below.
centroid=km.cluster_centers_
centroid

In [None]:
# Split the labelled PCA frame into one frame per KMeans cluster.
is_cluster_0 = principalDf['Label'] == 0
is_cluster_1 = principalDf['Label'] == 1
km_cluster_0 = principalDf[is_cluster_0]
km_cluster_1 = principalDf[is_cluster_1]

# Draw both clusters and the centroids on a single explicit Axes.
fig, ax = plt.subplots()
for members, colour, legend_text in (
    (km_cluster_0, 'yellow', 'class 0'),
    (km_cluster_1, 'orange', 'class 1'),
):
    ax.scatter(
        members['principal component 1'],
        members['principal component 2'],
        color=colour,
        label=legend_text,
    )
ax.scatter(centroid[:, 0], centroid[:, 1], color='black', label='centroid')

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA-KMeans Clustering')
ax.legend(loc='upper center')
plt.show()

In [None]:
# Map the KMeans centroids back to the original units:
# PCA space -> min-max scaled feature space -> original feature scale.
original_centroids_scaled = pca.inverse_transform(km.cluster_centers_)
original_centroid_unscaled = scaler.inverse_transform(original_centroids_scaled)

# Do the same for every observation. Only two principal components were kept,
# so these rows are reconstructions of the original data, not the data itself.
# `.values` already yields a NumPy array, so no `np.array(...)` wrapper is
# needed (this notebook never imports numpy itself).
pca_coordinates = principalDf.iloc[:, :2].values
original_data_scaled = pca.inverse_transform(pca_coordinates)
original_data_unscaled = scaler.inverse_transform(original_data_scaled).tolist()

In [None]:
# Show the KMeans centroids after undoing the PCA projection and the min-max
# scaling, i.e. in the units of the original features (one row per cluster).
original_centroid_unscaled

In [None]:
# Column names for the reconstructed observations, in the positional order the
# original cell assigned them.
# NOTE(review): this assumes cluster_df's columns are in exactly this order --
# confirm against the preprocessing notebook.
feature_columns = [
    'Regular_insulin_dose',
    'NPH_insulin_dose',
    'UltraLente_insulin_dose',
    'Pre_supper_glucose_measurement',
    'Post_supper_glucose_measurement',
    'Pre_breakfast_glucose_measurement',
]

# Build the frame in one step instead of transposing element by element into
# six separate lists; pandas raises a clear error if the number of
# reconstructed columns does not match the six names above.
regression_df = pd.DataFrame(original_data_unscaled, columns=feature_columns)

# Regression target: the KMeans cluster id, selected by name rather than by
# position so an extra column in principalDf cannot be picked up by mistake.
regression_df['Output'] = principalDf['Label'].values
regression_df

In [None]:
# Internal validation metrics for the KMeans partition, computed on the two
# PCA coordinates the model was fitted on.
kmeans_features = principalDf.iloc[:, :2]
kmeans_labels = km.labels_

silhouette = silhouette_score(kmeans_features, kmeans_labels)
db_index = davies_bouldin_score(kmeans_features, kmeans_labels)
ch_index = calinski_harabasz_score(kmeans_features, kmeans_labels)

print(
    f"Silhouette Score: {silhouette:.2f}",
    f"Davies-Bouldin Index: {db_index:.2f}",
    f"Calinski-Harabasz Index: {ch_index:.2f}",
    sep="\n",
)

In [None]:
# Fit KMeans for k = 1..10 on the two PCA coordinates and record the inertia
# of each fit for the elbow plot.
K_range = range(1, 11)
elbow_features = principalDf.iloc[:, :2]

inertia = []
for k in K_range:
    kmeans = KMeans(random_state=42, n_clusters=k).fit(elbow_features)
    inertia.append(kmeans.inertia_)

# Inertia versus number of clusters.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(K_range, inertia, marker='o', linestyle='-')
ax.set_title('Elbow Plot for KMeans Clustering')
ax.set_xlabel('Number of clusters ')
ax.set_ylabel('Inertia')
plt.show()

In [None]:
# Average-linkage agglomerative clustering into two groups, fitted on the same
# two PCA coordinates used for KMeans so the two partitions are comparable.
agglomerative_features = principalDf.iloc[:, :2]
clusterer = AgglomerativeClustering(linkage='average', n_clusters=2)
clusterer.fit(agglomerative_features)
cluster_labels = clusterer.labels_

In [None]:
# Scatter of the PCA coordinates coloured by agglomerative cluster id.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    principalDf.iloc[:, 0],
    principalDf.iloc[:, 1],
    c=cluster_labels,
    cmap='viridis',
    s=50,
)
ax.set_title('Agglomerative Clustering')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()

In [None]:
# Internal validation metrics for the agglomerative partition, computed on the
# same two PCA coordinates. Note that this rebinds silhouette / db_index /
# ch_index, replacing the KMeans values computed earlier.
agglomerative_eval_features = principalDf.iloc[:, :2]
agglomerative_labels = clusterer.labels_

silhouette = silhouette_score(agglomerative_eval_features, agglomerative_labels)
db_index = davies_bouldin_score(agglomerative_eval_features, agglomerative_labels)
ch_index = calinski_harabasz_score(agglomerative_eval_features, agglomerative_labels)

print(
    f"Silhouette Score: {silhouette:.2f}",
    f"Davies-Bouldin Index: {db_index:.2f}",
    f"Calinski-Harabasz Index: {ch_index:.2f}",
    sep="\n",
)

Note: Silhouette scores range from -1 to +1; the closer the value is to +1, the better defined the clusters are.
    
Note: A lower Davies-Bouldin Index indicates better clustering.
    
Note: A higher Calinski-Harabasz Index indicates better-defined clusters.

The silhouette score and the Davies-Bouldin Index are the same for both KMeans ("normal") clustering and agglomerative clustering. When we compare the Calinski-Harabasz Index, it is higher for KMeans clustering than for agglomerative clustering. Therefore, the regression dataframe is created using the results of KMeans clustering.

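Using domain knowledge, we can conclude that cluster 0 indicates diabetes negative (because of the low insulin dose) and cluster 1 indicates diabetes positive (because of the high insulin dose and high glucose levels, for example in the pre-breakfast measurement). Note that KMeans numbers its clusters arbitrarily, so check the back-transformed centroids to confirm which label carries the higher doses before relying on this mapping.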

In [None]:
# Features are every column except the last; the target is the last column
# ('Output', the KMeans cluster id).
X = regression_df.iloc[:, :-1]
Y = regression_df.iloc[:, -1]

# Hyper-parameter candidates.
degree = range(1, 11)
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

poly_feature = PolynomialFeatures()
# Deliberately NOT named `scaler`: that global is the scaler fitted on
# cluster_df and is still needed by the inverse-transform cell. Rebinding it
# to a fresh, unfitted MinMaxScaler made that cell fail when re-run.
pipeline_scaler = MinMaxScaler()
reg = LogisticRegression(solver='liblinear')
# Step name fixed from the typo "sca0.4ler" to "scaler".
steps = [("poly_features", poly_feature), ("scaler", pipeline_scaler), ("reg", reg)]
pipeline = Pipeline(steps)

param_grid = {"poly_features__degree": list(degree), "reg__C": c_values}

# 10-fold cross-validated grid search, selecting on F1.
gs = GridSearchCV(pipeline, param_grid, cv=10, scoring='f1')
gs.fit(X, Y)

opt_degree = gs.best_params_["poly_features__degree"]
opt_c = gs.best_params_["reg__C"]
print("Best polynomial degree:", opt_degree)
print("Best C:", opt_c)

gscv_scores = gs.cv_results_['mean_test_score']

# Build the results table from the parameter settings GridSearchCV actually
# evaluated, so each score is paired with its own (degree, C) regardless of
# grid size or iteration order (previously hard-coded as 7 x 10).
evaluated_params = gs.cv_results_['params']
degree_df = pd.DataFrame({
    'Degree': [setting["poly_features__degree"] for setting in evaluated_params],
    'C': [setting["reg__C"] for setting in evaluated_params],
    'Scores': gscv_scores,
})
degree_df

In [None]:
# Refit a logistic model with statsmodels, using the polynomial degree chosen
# by the grid search, to obtain a coefficient summary.
poly_opt = PolynomialFeatures(degree=opt_degree)
X_train_poly = poly_opt.fit_transform(X)

# PolynomialFeatures (include_bias defaults to True) already emits the
# intercept column of ones as its first output column, so no extra constant
# is added; the design matrix is used as-is.
X_train_const = X_train_poly

logreg_opt = sm.Logit(Y, X_train_const)

# scikit-learn's C is the INVERSE of the regularisation strength, whereas
# statsmodels' alpha multiplies the penalty directly. Passing C straight
# through would penalise hardest exactly where the grid search asked for the
# weakest penalty, so it is inverted here.
# NOTE(review): the grid-searched pipeline also min-max scaled the features
# and used LogisticRegression's default penalty, so this L1 refit on unscaled
# features approximates that model rather than replicating it.
l1_alpha = 1.0 / opt_c
result = logreg_opt.fit_regularized(method='l1', alpha=l1_alpha)
print(result.summary())

# Pair each non-intercept coefficient with the polynomial term it belongs to.
# Using X.columns here only lined up for degree 1 and raised a length-mismatch
# error for any higher degree.
term_names = poly_opt.get_feature_names_out(X.columns)
list_param = list(result.params)[1:]
DF = pd.DataFrame({'Parameter': term_names[1:], 'Coefficient value': list_param})
DF

In [None]:
# Observed range of each user-supplied feature in regression_df; the input
# cell uses these bounds to accept or reject what the user types.
nph_dose = regression_df['NPH_insulin_dose']
pre_supper_glucose = regression_df['Pre_supper_glucose_measurement']
post_supper_glucose = regression_df['Post_supper_glucose_measurement']

nph_min, nph_max = nph_dose.min(), nph_dose.max()
pre_glu_min, pre_glu_max = pre_supper_glucose.min(), pre_supper_glucose.max()
post_glu_min, post_glu_max = post_supper_glucose.min(), post_supper_glucose.max()

In [None]:
def _read_value_in_range(prompt, low, high):
    """Prompt until the user enters a number within [low, high].

    Parameters
    ----------
    prompt : str
        Message printed once before the first read.
    low, high : float
        Inclusive bounds the entry must satisfy.

    Returns
    -------
    float
        The first entry that parses as a number and lies within the bounds.
    """
    print(prompt)
    while True:
        raw_entry = input()
        try:
            value = float(raw_entry)
        except ValueError:
            # Non-numeric text is treated like an out-of-range entry instead
            # of crashing the cell.
            value = None
        if value is not None and low <= value <= high:
            print('Entry recorded')
            return value
        print(f'Wrong entry, re-enter the details.Utilise this prediction model if your results are between {low} and {high}')


print('Welcome User. Enter the required details as requested')

# Each value is re-requested until it is valid. Previously the second attempt
# was accepted without any range check, so out-of-range values could reach
# the model.
insulin = _read_value_in_range('Enter the NPH insulin dose taken', nph_min, nph_max)
glu_bef = _read_value_in_range('Enter the glucose level before supper', pre_glu_min, pre_glu_max)
glu_af = _read_value_in_range('Enter the glucose level after supper', post_glu_min, post_glu_max)

In [None]:
# One-row frame holding the user's entries, with the same six columns (in the
# same order) as the training features. The three measurements that are not
# asked for are fixed at 0.
rv_df = pd.DataFrame(
    {
        'Regular_insulin_dose': [0],
        'NPH_insulin_dose': [insulin],
        'UltraLente_insulin_dose': [0],
        'Pre_supper_glucose_measurement': [glu_bef],
        'Post_supper_glucose_measurement': [glu_af],
        'Pre_breakfast_glucose_measurement': [0],
    },
    index=[0],
)

# Expand the row with the SAME fitted PolynomialFeatures used to build the
# training design matrix, so the columns (intercept column of ones first,
# then the polynomial terms) line up one-to-one with result.params.
# The previous add_constant + `const = 0` approach zeroed the intercept, did
# not guarantee that column order, and never applied the polynomial expansion.
# Kept as a DataFrame because the prediction cell reads `rv_const.values`.
rv_const = pd.DataFrame(
    poly_opt.transform(rv_df),
    columns=poly_opt.get_feature_names_out(rv_df.columns),
)

In [None]:
# Predicted probability for the single user-supplied row.
pred = result.predict(rv_const.values)
pred_list = pred.tolist()

# Render each probability in fixed-point notation, then classify the first
# (only) one against the 0.5 threshold.
pred_normal = list(map(lambda probability: format(probability, 'f'), pred_list))
is_negative = float(pred_normal[0]) < 0.5

verdict = 'The patient is diabetes negative' if is_negative else 'The patient is diabetes positive'
print(verdict)