In [26]:
# Importing Dependencies

# numpy and pandas for creating dataframes
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
# matplotlib sometimes give an error of module not found, So I ran this command in windows power shell : 
# pip install msvc-runtime

# StandardScalar for standardization of data
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# split data into training and testing 
from sklearn.model_selection import train_test_split

from sklearn import svm 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor


In [27]:
diabetesDataset = pd.read_csv("diabetes_prediction_dataset.csv")

Printing the head of dataset

In [28]:
# printing first five rows of DataFrame 
# .head() automatically brings first 5 rows

diabetesDataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0.0,1.0,never,25.19,6.6,140.0,0
1,Female,54.0,0.0,0.0,No Info,27.32,6.6,80.0,0
2,Male,28.0,0.0,0.0,never,27.32,5.7,158.0,0
3,Female,36.0,0.0,0.0,current,23.45,5.0,155.0,0
4,Male,76.0,1.0,1.0,current,20.14,4.8,155.0,0


Dimensions of Dataset

In [29]:
diabetesDataset.shape

(100513, 9)

Statistics of Dataset

In [30]:
diabetesDataset.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100513.0,100510.0,100512.0,100513.0,100513.0,100512.0,100513.0
mean,41.953753,0.077296,0.040682,27.338273,5.549894,138.844188,0.08967
std,22.491265,0.267062,0.197553,6.641393,1.123745,42.589711,0.28571
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.65,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.63,6.2,159.0,0.0
max,90.0,1.0,1.0,95.69,18.3,642.0,1.0


Counts of instances of classes in various attributes

In [31]:
# Counting Instances of Classes in Label Column ( Output Column )

# diabetesDataset['diabetes'].value_counts()
# diabetesDataset['age'].value_counts()
# diabetesDataset['hypertension'].value_counts()
# diabetesDataset['HbA1c_level'].value_counts()
# diabetesDataset['blood_glucose_level'].value_counts()
# diabetesDataset['bmi'].value_counts()
# diabetesDataset['gender'].value_counts()
# diabetesDataset['heart_disease'].value_counts()
# diabetesDataset['hypertension'].value_counts()
diabetesDataset['smoking_history'].value_counts()

smoking_history
No Info        35816
never          35545
former          9373
current         9326
not current     6447
ever            4004
Name: count, dtype: int64

Plotting Graph

In [32]:
# diabetesDataset["age"].plot()

Dropping null values 

In [33]:
diabetesDataset = diabetesDataset.dropna()
diabetesDataset = diabetesDataset.reset_index(drop=True)

In [34]:
# Again verifying the dimensions of dataframe
diabetesDataset.shape

# with null values (101015, 9)
# without null values (100509, 9)
# 506 rows have been cleaned 

(100509, 9)

In [35]:
diabetesDataset

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0.0,1.0,never,25.19,6.6,140.0,0
1,Female,54.0,0.0,0.0,No Info,27.32,6.6,80.0,0
2,Male,28.0,0.0,0.0,never,27.32,5.7,158.0,0
3,Female,36.0,0.0,0.0,current,23.45,5.0,155.0,0
4,Male,76.0,1.0,1.0,current,20.14,4.8,155.0,0
...,...,...,...,...,...,...,...,...,...
100504,Female,49.0,1.0,0.0,never,38.32,8.5,295.0,1
100505,Male,69.0,1.0,1.0,never,22.76,6.9,140.0,1
100506,Female,65.0,1.0,0.0,never,27.99,7.5,206.0,1
100507,Male,50.0,0.0,1.0,current,36.69,13.6,250.0,1


In [36]:
# Delete rows where the `smoking_history` is `no info` 
# df = df.drop(df[df['city'] == 'No Info'].index) 

Encoding features `gender` and `smoking_history`

In [37]:
# Creating an instance of label Encoder class
label_encoder_instance = LabelEncoder()

# Encoding the `gender` feature ( column ) of `diabetesDataset` using instance of LabelEncoder() class and method fit.transform()
encoded_gender = label_encoder_instance.fit_transform(diabetesDataset['gender'])
encoded_smoking_history = label_encoder_instance.fit_transform(diabetesDataset['smoking_history'])

In [38]:
# an array of encoded gender values
# female = 0
# male = 1
# other = 2

# an array of encoded smoking_history values 
# No Info        0
# current        1
# ever           2
# former         3
# never          4  
# not current    5

print(encoded_gender)
print(encoded_smoking_history)

[0 0 1 ... 0 1 0]
[4 0 4 ... 4 1 4]


Transforming the encoded array into a Dataframe 

In [39]:
encoded_dataframe_gender =  pd.DataFrame(encoded_gender)
encoded_dataframe_smoking_history = pd.DataFrame(encoded_smoking_history)
print(encoded_gender)
print(encoded_dataframe_gender.value_counts())
print("*****************")
print(encoded_smoking_history)
print(encoded_dataframe_smoking_history.value_counts())

[0 0 1 ... 0 1 0]
0
0    58913
1    41578
2       18
Name: count, dtype: int64
*****************
[4 0 4 ... 4 1 4]
0
0    35816
4    35543
3     9373
1     9326
5     6447
2     4004
Name: count, dtype: int64


Appending the encoded gender values into original DataFrame

In [40]:
# Pass the features ( columns ) of original dataframe in the array that you intend to replace
# columns_to_overwrite = ["gender", "smoking_history"]

# Deleting the columns, we intend to replace
diabetesDataset.drop(labels=columns_to_overwrite, axis="columns", inplace=True)

diabetesDataset

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,80.0,0.0,1.0,25.19,6.6,140.0,0
1,54.0,0.0,0.0,27.32,6.6,80.0,0
2,28.0,0.0,0.0,27.32,5.7,158.0,0
3,36.0,0.0,0.0,23.45,5.0,155.0,0
4,76.0,1.0,1.0,20.14,4.8,155.0,0
...,...,...,...,...,...,...,...
100504,49.0,1.0,0.0,38.32,8.5,295.0,1
100505,69.0,1.0,1.0,22.76,6.9,140.0,1
100506,65.0,1.0,0.0,27.99,7.5,206.0,1
100507,50.0,0.0,1.0,36.69,13.6,250.0,1


Re-inserting the features with encoded values into original dataframe

In [41]:
# .insert( index where you want to insert, "Label of Column", value)
diabetesDataset.insert(0, "Encoded Gender", encoded_dataframe_gender)
diabetesDataset


Unnamed: 0,Encoded Gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0.0,1.0,25.19,6.6,140.0,0
1,0,54.0,0.0,0.0,27.32,6.6,80.0,0
2,1,28.0,0.0,0.0,27.32,5.7,158.0,0
3,0,36.0,0.0,0.0,23.45,5.0,155.0,0
4,1,76.0,1.0,1.0,20.14,4.8,155.0,0
...,...,...,...,...,...,...,...,...
100504,0,49.0,1.0,0.0,38.32,8.5,295.0,1
100505,1,69.0,1.0,1.0,22.76,6.9,140.0,1
100506,0,65.0,1.0,0.0,27.99,7.5,206.0,1
100507,1,50.0,0.0,1.0,36.69,13.6,250.0,1


In [42]:
diabetesDataset.insert(4, "Encoded smoking_history", encoded_dataframe_smoking_history)
diabetesDataset

Unnamed: 0,Encoded Gender,age,hypertension,heart_disease,Encoded smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0.0,1.0,4,25.19,6.6,140.0,0
1,0,54.0,0.0,0.0,0,27.32,6.6,80.0,0
2,1,28.0,0.0,0.0,4,27.32,5.7,158.0,0
3,0,36.0,0.0,0.0,1,23.45,5.0,155.0,0
4,1,76.0,1.0,1.0,1,20.14,4.8,155.0,0
...,...,...,...,...,...,...,...,...,...
100504,0,49.0,1.0,0.0,4,38.32,8.5,295.0,1
100505,1,69.0,1.0,1.0,4,22.76,6.9,140.0,1
100506,0,65.0,1.0,0.0,4,27.99,7.5,206.0,1
100507,1,50.0,0.0,1.0,1,36.69,13.6,250.0,1


In [43]:
diabetesDataset = diabetesDataset.astype(float)
diabetesDataset

Unnamed: 0,Encoded Gender,age,hypertension,heart_disease,Encoded smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0.0,80.0,0.0,1.0,4.0,25.19,6.6,140.0,0.0
1,0.0,54.0,0.0,0.0,0.0,27.32,6.6,80.0,0.0
2,1.0,28.0,0.0,0.0,4.0,27.32,5.7,158.0,0.0
3,0.0,36.0,0.0,0.0,1.0,23.45,5.0,155.0,0.0
4,1.0,76.0,1.0,1.0,1.0,20.14,4.8,155.0,0.0
...,...,...,...,...,...,...,...,...,...
100504,0.0,49.0,1.0,0.0,4.0,38.32,8.5,295.0,1.0
100505,1.0,69.0,1.0,1.0,4.0,22.76,6.9,140.0,1.0
100506,0.0,65.0,1.0,0.0,4.0,27.99,7.5,206.0,1.0
100507,1.0,50.0,0.0,1.0,1.0,36.69,13.6,250.0,1.0


In [44]:
output_feature = diabetesDataset['HbA1c_level']
input_features = diabetesDataset.drop(columns = 'HbA1c_level', axis = 1)

In [45]:
output_feature

0          6.6
1          6.6
2          5.7
3          5.0
4          4.8
          ... 
100504     8.5
100505     6.9
100506     7.5
100507    13.6
100508    11.1
Name: HbA1c_level, Length: 100509, dtype: float64

DataFrame to new csv file

In [46]:
csv_dataframe = pd.concat([input_features, output_feature])
csv_dataframe.to_csv("encoded_float_diabetes_prediction_dataset.csv", sep=',', index=False, encoding='utf-8')

In [47]:
input_features

Unnamed: 0,Encoded Gender,age,hypertension,heart_disease,Encoded smoking_history,bmi,blood_glucose_level,diabetes
0,0.0,80.0,0.0,1.0,4.0,25.19,140.0,0.0
1,0.0,54.0,0.0,0.0,0.0,27.32,80.0,0.0
2,1.0,28.0,0.0,0.0,4.0,27.32,158.0,0.0
3,0.0,36.0,0.0,0.0,1.0,23.45,155.0,0.0
4,1.0,76.0,1.0,1.0,1.0,20.14,155.0,0.0
...,...,...,...,...,...,...,...,...
100504,0.0,49.0,1.0,0.0,4.0,38.32,295.0,1.0
100505,1.0,69.0,1.0,1.0,4.0,22.76,140.0,1.0
100506,0.0,65.0,1.0,0.0,4.0,27.99,206.0,1.0
100507,1.0,50.0,0.0,1.0,1.0,36.69,250.0,1.0


In [48]:
output_feature

0          6.6
1          6.6
2          5.7
3          5.0
4          4.8
          ... 
100504     8.5
100505     6.9
100506     7.5
100507    13.6
100508    11.1
Name: HbA1c_level, Length: 100509, dtype: float64

In [49]:
# remove stratify parameter if your output variable is float.
input_features_train, input_features_test, output_feature_train, output_feature_test = train_test_split(input_features, output_feature, test_size=0.2, random_state=1)

In [50]:
print(input_features.shape, input_features_train.shape, input_features_test.shape )

(100509, 8) (80407, 8) (20102, 8)


Model Training ( Decision Tree Regression )

In [51]:
decision_tree = DecisionTreeRegressor()

Training the Logistic Regression Model with Training Data 

In [52]:
decision_tree.fit(input_features_train, output_feature_train)

In [53]:
# Accuracy on Training Data 

input_features_train_prediction = decision_tree.predict(input_features_train)

meanSquaredError = mean_squared_error(output_feature_train, input_features_train_prediction)
meanAbsoluteError = mean_absolute_error(output_feature_train, input_features_train_prediction)
r2 = r2_score(output_feature_train, input_features_train_prediction)

In [54]:
print("Mean Squared Error", meanSquaredError)

Mean Squared Error 0.14304941779839447


In [55]:
print("Mean Absolute Error", meanSquaredError)

Mean Absolute Error 0.14304941779839447


In [56]:
print("R2 Score", r2)

R2 Score 0.8858752871660557


In [67]:
# Making a Predictive System    

# import numpy as np


# inputData = (1.0, 55.0, 0.0, 0.0, 0.0, 27.32, 159.0, 1.0) # computed = 6.8, correct value = 6.8, row # 238
# inputData = (1.0, 57.0, 1.0, 1.0, 5.0, 27.77, 160.0, 1.0) # computed = 5.8, correct value = 6.6, row # 244
# inputData = (0.0, 34.0, 0.0, 0.0, 3.0, 26.55, 158.0, 0.0) # computed = 6.1, correct value = 6.1, row # 2897
inputData = (1.0, 53.0, 0.0, 0.0, 2.0, 26.47, 126.0, 0.0) # computed = 6.5, correct value = 6.5, row # 2944




inputDataNumpyArray = np.asarray(inputData)

# reshape np array as we are predicting for one instance 
# 1, -1 represents that we are predicting for one instance 

inputDataReshaped = inputDataNumpyArray.reshape(1,-1)

prediction = decision_tree.predict(inputDataReshaped)

print(prediction)


[6.]


