In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples
from matplotlib.cm import get_cmap

warnings.filterwarnings("ignore")

In [None]:
# Turn the Google Drive share link into a direct-download link: the file id is
# the second-to-last "/"-separated segment of the share URL.
url = 'https://drive.google.com/file/d/135r0zppiZruDktBfu2cYgeBZ0pe5FTW7/view?usp=sharing'
file_id = url.split("/")[-2]
path = f"https://drive.google.com/uc?export=download&id={file_id}"
Diabetes = pd.read_csv(path)


In [None]:
# Preview the first five rows of the raw dataset.
Diabetes.head(5)

In [None]:
# Work on a copy so the raw frame loaded above stays untouched.
df=Diabetes.copy()
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Change the data type of age from float to integer.
# DataFrame.astype returns a new frame, so the result has to be assigned back;
# the bare call left df unchanged.
# NOTE(review): any fractional ages are truncated by this cast - confirm that is intended.
df = df.astype({"age": int})
df.dtypes

In [None]:
# Count missing (null) values in each column.
df.isnull().sum()

In [None]:
# Report how many rows are exact duplicates, then remove them.
# drop_duplicates returns a new frame, so it must be assigned back for the
# duplicates to actually be dropped from df.
print("Number of Duplicated Values = ",df.duplicated().sum())

df = df.drop_duplicates()
# Per-column non-null counts after de-duplication.
df.count()

In [None]:
# Frequency of each smoking_history category.
df['smoking_history'].value_counts()

In [None]:
# Frequency of each gender category.
df['gender'].value_counts()

In [None]:
# Class balance of the target column.
df['diabetes'].value_counts()

In [None]:
# Share of each diabetes class in the dataset, drawn as a pie chart with
# percentages shown to two decimals.
diabetes_counts = df['diabetes'].value_counts()
diabetes_counts.plot.pie(autopct='%0.2f')
plt.title("Distribution of diabeties in dataset")

In [None]:
# Diabetes class counts split by gender; every bar is annotated with its count.
gen = sns.countplot(data=df, x='diabetes', hue='gender')

for bar_group in gen.containers:
    gen.bar_label(bar_group)

In [None]:
# Correlation between diabetes and the numeric features.
# 'gender' and 'smoking_history' are still string columns at this point (they
# are only mapped to numbers in a later cell), and DataFrame.corr raises on
# non-numeric columns in pandas >= 2.0, so restrict the matrix to numeric
# columns explicitly.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)

In [None]:
# Encode the two string columns as integer codes in new *_num columns, then
# drop the original string columns. Values missing from a mapping become NaN.
gender_codes = {'Female':0,'Male':1,'Other':2}
smoking_codes = {'No Info':-1,'never':0,
                 'former':1,'current':2,
                 'not current':3,'ever':4}
df = df.assign(
    gender_num=df['gender'].map(gender_codes),
    smoking_history_num=df['smoking_history'].map(smoking_codes),
).drop(columns=['smoking_history', 'gender'])

In [None]:
#Re-check correlation
#sns.heatmap(df.corr(),annot=True)

In [None]:
# Box plots of each clinical measurement, grouped by diabetes status, to
# inspect their spread and outliers before transforming skewed features.
boxplot_specs = [
    ("bmi", "BMI Distribution by Diabetes Status"),
    ("HbA1c_level", "HbA1c Level Distribution by Diabetes Status"),
    ("blood_glucose_level", "Blood Glucose Level Distribution by Diabetes Status"),
]

for feature, title in boxplot_specs:
    sns.boxplot(x="diabetes", y=feature, data=df)
    plt.title(title)
    plt.show()

In [None]:
#Check outliers and remove them
#Q1=df["bmi"].quantile(0.25)
#Q3=df["bmi"].quantile(0.75)
#print(Q1,Q3)
#IQR = Q3 - Q1
#print(IQR)
#lowerlimit = Q1 - 1.5*IQR
#upperlimit = Q3 + 1.5*IQR
#print(lowerlimit, upperlimit)
#df1 = df[(df['bmi']<lowerlimit)|(df['bmi']>upperlimit)]
#print(df1)

In [None]:
# Skewness of every column before the bmi transform.
df.skew()

In [None]:
# Square-root transform of bmi; the other candidate columns are left as-is.
# Re-running this cell applies the square root again, so run it once per
# freshly built df.
df = df.assign(bmi=np.sqrt(df["bmi"]))

In [None]:
# Skewness of every column after the bmi transform, for comparison.
df.skew()

In [None]:
# Plot the distribution of every column.
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat="density" is the documented replacement that keeps the
# density-scaled histogram plus KDE curve.
for column in df.columns:
  sns.histplot(df[column], kde=True, stat="density")
  plt.show()

In [None]:
# Summary statistics of the prepared frame.
df.describe()

In [None]:
# Separate the features from the target column.
X = df.drop(columns=['diabetes'])
y = df['diabetes']

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123
)

# Print (features shape, target shape) for the train split, then the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)


In [None]:


# determine the columns that will need to go to the numerical pipeline
# also determine those columns that need to go to the categorical pipeline
#X_num = X.select_dtypes(include=['int64', 'float64']).copy() #columns
#X_cat = X.select_dtypes(include=['object']).copy() #columns

In [None]:
# Numeric preprocessing pipeline: a single imputation step with SimpleImputer's
# default settings. The step name matches the one make_pipeline would generate.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

num_pipe = Pipeline(steps=[("simpleimputer", SimpleImputer())])
num_pipe

In [None]:
from sklearn.compose import ColumnTransformer

# The original referenced X_num, which is only defined in a commented-out line
# of this notebook, so the cell raised NameError on a fresh kernel. Derive the
# numeric column names directly from the feature frame instead.
numeric_columns = X.select_dtypes(include="number").columns.tolist()

# Route every numeric column through the numeric pipeline.
full_processor = ColumnTransformer(transformers=[
                        ("numeric_pipe", num_pipe, numeric_columns),
                       ])
full_processor

In [None]:
# Min-max scale the feature columns. MinMaxScaler is used with its defaults,
# i.e. each column is rescaled to the range [0, 1].
# The slice selects columns by position 0..14; presumably that is meant to
# cover every feature column - confirm against the width of X.
scaler = ColumnTransformer(
    transformers=[('scale', MinMaxScaler(), slice(0, 15))],
)

In [None]:
#scaler = ColumnTransformer([
    #('scale',StandardScaler())])

In [None]:
# 1. Model: logistic regression with an L2 penalty, C=0.001, the newton-cg
# solver and up to 1000 iterations.
lr = LogisticRegression(
    penalty='l2',
    C=0.001,
    solver='newton-cg',
    max_iter=1000,
)

In [None]:
# Chain the column scaler and the classifier so scaling is fitted together with the model.
pipe = make_pipeline(scaler,lr)

In [None]:
# Fit the scaler and the classifier on the training split only.
pipe.fit(X_train,y_train)

In [None]:
# Accuracy of the fitted pipeline on the data it was trained on.
y_pred = pipe.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred)
print("Accuracy of Training data set is : ", train_accuracy)

# Mean accuracy over 5-fold cross-validation on the training split
# (cross_val_score refits a clone of the pipeline on each fold).
train_cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
print("Accuracy after Cross Validation :", train_cv_scores.mean())

In [None]:
# Accuracy of the fitted pipeline on the held-out test split.
y_pred = pipe.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Test data set is : ", test_accuracy)

# Mean accuracy over 5-fold cross-validation run on the test split.
# NOTE(review): cross_val_score refits a clone of the pipeline on folds of the
# test data, so this number does not evaluate the model trained above - confirm
# this is the intended measurement.
test_cv_scores = cross_val_score(pipe, X_test, y_test, cv=5, scoring='accuracy')
print("Accuracy after Cross Validation :", test_cv_scores.mean())

In [None]:
import pickle

# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed; the bare open() call left the handle open.
with open('pipe_pkl1','wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
# Reload the persisted pipeline, closing the file handle afterwards.
# pickle.load can execute arbitrary code, so only load files this notebook wrote.
with open('pipe_pkl1','rb') as model_file:
    pipe = pickle.load(model_file)

In [None]:
#1.Age:(>1)

#2.Hypertension[0,1]: No Hypertension=0, Hypertension=1

#3.Heart_disease[0,1]: No Heart_disease=0,Heart_disease=1

#4.BMI(> 16.0)

#5.HbA1c_level-: (> 4.0 )

#6.Blood_glucose_level(>70)

#7.Gender =:Female=0, Male=1, Other=2

#8.Smoking_history =:No Info = -1, never = 0, former=1, current = 2,not current = 3, ever = 4

#Sequence of variables as in the dataset: ARRAY(1.age, 2.hypertension, 3.heart_disease, 4.bmi,
#5.HbA1c_level, 6.blood_glucose_level, 7.gender, 8.smoking_history)

In [None]:
#new_input=np.array([69.0,0,1,4.6,7.5,160,0,2],dtype=object).reshape(1,8)
#prediction=pipe.predict(new_input)
#print(prediction)
#if (prediction[0]==1):
    #print("The Person is Diabetic")
#else:
    #print("The Person is Not Diabetic")

In [None]:
#new_input=np.array([4.0,0,0,4.0,3.5,160,0,-1],dtype=object).reshape(1,8)
#prediction=pipe.predict(new_input)
#print(prediction)
#if (prediction[0]==1):
    #print("The Person is Diabetic")
#else:
    #print("The Person is Not Diabetic")

In [None]:
#prediction

In [None]:
import pickle
import streamlit as st

# Reload the persisted pipeline; the context manager closes the file handle
# that the bare open() call leaked.
with open('pipe_pkl1','rb') as model_file:
    pipe = pickle.load(model_file)

# One-row frame with the same column names and order as the training features.
# NOTE(review): the training frame stores sqrt(bmi), so 'bmi' here is presumably
# expected on that transformed scale - confirm.
new_input = pd.DataFrame({
                        'age':[4],
                        'hypertension':[0],
                        'heart_disease':[0],
                        'bmi':[4.0],
                        'HbA1c_level':[3.5],
                        'blood_glucose_level':[160],
                        'gender_num':[0],
                        'smoking_history_num':[-1]
})

# The former rename of an "age " column (trailing space) was removed: no such
# column exists in the frame built above, so it never changed anything.

prediction = pipe.predict(new_input)

print(prediction)

# The positive class (1) is reported as diabetic.
if prediction[0] == 1:
    print("The Person is Diabetic")
else:
    print("The Person is Not Diabetic")

In [None]:
# Display the raw prediction array from the previous cell.
prediction

In [None]:
# pickle and streamlit are imported in an earlier cell.
# Reload the persisted pipeline, closing the file handle afterwards.
with open('pipe_pkl1','rb') as model_file:
    pipe = pickle.load(model_file)


In [None]:
import pandas as pd

# Feature names in the column order the pipeline was trained with; each one is
# also used as the label of its Streamlit numeric input.
feature_names = [
    'age',
    'hypertension',
    'heart_disease',
    'bmi',
    'HbA1c_level',
    'blood_glucose_level',
    'gender_num',
    'smoking_history_num',
]

# Create one numeric input per feature, in order, and keep each value under its own name.
(
    age,
    hypertension,
    heart_disease,
    bmi,
    HbA1c_level,
    blood_glucose_level,
    gender_num,
    smoking_history_num,
) = [st.number_input(name) for name in feature_names]

feature_values = (
    age,
    hypertension,
    heart_disease,
    bmi,
    HbA1c_level,
    blood_glucose_level,
    gender_num,
    smoking_history_num,
)

# Single-row frame holding the entered values.
new_input = pd.DataFrame(
    {name: [value] for name, value in zip(feature_names, feature_values)}
)



In [None]:
#new_input

In [None]:

# Predict the class for the values entered in the Streamlit inputs.
prediction = pipe.predict(new_input)

# Show the raw prediction array in the Streamlit app.
st.write("The Person is:", prediction)