# Importing the libraries

In [48]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

# Data Collection and Analysis

In [49]:
data = pd.read_csv('diabetes_prediction_dataset.csv')

In [50]:
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [51]:
data.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [52]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [53]:
data.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [54]:
data['diabetes'].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [55]:
data.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


In [56]:
df_no_diabetes = data[data['diabetes'] == 0]
df_diabetes = data[data['diabetes'] == 1]

In [57]:
df_no_diabetes_sampled = df_no_diabetes.sample(n=8500, random_state=42)

In [58]:
df_balanced = pd.concat([df_no_diabetes_sampled, df_diabetes])

In [59]:
data = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [60]:
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Male,80.0,1,0,former,27.32,8.2,140,1
1,Female,35.0,0,0,No Info,27.32,4.5,158,0
2,Female,55.0,1,0,never,27.32,8.8,220,1
3,Male,68.0,0,0,former,28.19,3.5,100,0
4,Female,26.0,0,0,not current,21.38,5.8,155,0
...,...,...,...,...,...,...,...,...,...
16995,Female,39.0,0,0,never,39.69,6.1,200,1
16996,Male,68.0,0,0,never,28.56,6.8,220,1
16997,Female,39.0,0,0,never,22.44,4.8,145,0
16998,Male,26.0,0,0,No Info,27.32,6.5,80,0


In [61]:
data['diabetes'].value_counts()

diabetes
1    8500
0    8500
Name: count, dtype: int64

In [62]:
data['smoking_history'].value_counts()

smoking_history
never          6289
No Info        4680
former         2317
current        1710
not current    1218
ever            786
Name: count, dtype: int64

In [63]:
data['gender'] = data['gender'].map({'Male': 1, 'Female': 0})

In [64]:
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,1.0,80.0,1,0,former,27.32,8.2,140,1
1,0.0,35.0,0,0,No Info,27.32,4.5,158,0
2,0.0,55.0,1,0,never,27.32,8.8,220,1
3,1.0,68.0,0,0,former,28.19,3.5,100,0
4,0.0,26.0,0,0,not current,21.38,5.8,155,0
...,...,...,...,...,...,...,...,...,...
16995,0.0,39.0,0,0,never,39.69,6.1,200,1
16996,1.0,68.0,0,0,never,28.56,6.8,220,1
16997,0.0,39.0,0,0,never,22.44,4.8,145,0
16998,1.0,26.0,0,0,No Info,27.32,6.5,80,0


In [65]:
data['gender'].dtype

dtype('float64')

In [66]:
data[data['smoking_history']=='ever']

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
21,1.0,47.0,0,0,ever,33.32,6.6,85,0
57,0.0,61.0,0,0,ever,41.47,7.5,280,1
65,0.0,53.0,1,0,ever,32.96,5.0,126,0
120,0.0,75.0,0,1,ever,33.40,6.1,159,1
130,0.0,41.0,0,0,ever,40.56,6.2,145,0
...,...,...,...,...,...,...,...,...,...
16956,0.0,49.0,0,0,ever,33.82,5.8,145,0
16974,1.0,75.0,0,1,ever,27.32,9.0,240,1
16983,0.0,30.0,0,0,ever,26.57,6.2,159,0
16985,0.0,52.0,0,0,ever,20.38,6.6,158,0


In [67]:
data['smoking_history'] = data['smoking_history'].map({
    'former': 1,
    'not current': 1,
    'current': 2,
    'never': 0,
    'ever': 0,
    'No Info': -1
})

In [68]:
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,1.0,80.0,1,0,1,27.32,8.2,140,1
1,0.0,35.0,0,0,-1,27.32,4.5,158,0
2,0.0,55.0,1,0,0,27.32,8.8,220,1
3,1.0,68.0,0,0,1,28.19,3.5,100,0
4,0.0,26.0,0,0,1,21.38,5.8,155,0
...,...,...,...,...,...,...,...,...,...
16995,0.0,39.0,0,0,0,39.69,6.1,200,1
16996,1.0,68.0,0,0,0,28.56,6.8,220,1
16997,0.0,39.0,0,0,0,22.44,4.8,145,0
16998,1.0,26.0,0,0,-1,27.32,6.5,80,0


In [69]:
data = data.dropna()

In [70]:
data.shape

(16998, 9)

In [71]:
x = data.drop(columns='diabetes', axis=1)
y = data['diabetes']

In [72]:
x

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,1.0,80.0,1,0,1,27.32,8.2,140
1,0.0,35.0,0,0,-1,27.32,4.5,158
2,0.0,55.0,1,0,0,27.32,8.8,220
3,1.0,68.0,0,0,1,28.19,3.5,100
4,0.0,26.0,0,0,1,21.38,5.8,155
...,...,...,...,...,...,...,...,...
16995,0.0,39.0,0,0,0,39.69,6.1,200
16996,1.0,68.0,0,0,0,28.56,6.8,220
16997,0.0,39.0,0,0,0,22.44,4.8,145
16998,1.0,26.0,0,0,-1,27.32,6.5,80


In [73]:
y

0        1
1        0
2        1
3        0
4        0
        ..
16995    1
16996    1
16997    0
16998    0
16999    1
Name: diabetes, Length: 16998, dtype: int64

In [74]:
scaler = StandardScaler()

In [75]:
scaler.fit(x)

In [76]:
standardized_data = scaler.transform(x)

In [77]:
print(standardized_data)

[[ 1.12413682  1.36787805  2.36667806 ... -0.28507441  1.58598933
  -0.40801057]
 [-0.88957144 -0.73048151 -0.42253318 ... -0.28507441 -1.29588056
  -0.09206706]
 [-0.88957144  0.20212274  2.36667806 ... -0.28507441  2.05331958
   0.9961828 ]
 ...
 [-0.88957144 -0.54396066 -0.42253318 ... -0.94519828 -1.06221544
  -0.32024849]
 [ 1.12413682 -1.15015342 -0.42253318 ... -0.28507441  0.26188695
  -1.46115561]
 [-0.88957144  1.04146656 -0.42253318 ... -0.28507441 -0.28333168
  -0.05696223]]


In [78]:
x = standardized_data

In [79]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

In [80]:
print(x.shape, x_train.shape, x_test.shape)

(16998, 8) (13598, 8) (3400, 8)


In [81]:
classifier = svm.SVC(kernel='linear')

In [82]:
classifier.fit(x_train, y_train)

# Accuracy Score

In [83]:
x_train_prediction = classifier.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)

In [84]:
print(f'Accuracy Score of the training data: {training_data_accuracy}')

Accuracy Score of the training data: 0.8849095455214002


In [85]:
x_test_prediction = classifier.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)

In [86]:
print(f'Accuracy Score of the test data: {test_data_accuracy}')

Accuracy Score of the test data: 0.8888235294117647


# Getting the inputs

In [87]:
# gend = str(input('Enter your gender (m/f): '))
# if gend=='m':
#     gend = 1
# elif gend=='f':
#     gend = 0
# else:
#     print('Invalid Input')

# age = eval(input('Enter your age: '))

# ht = str(input('Do you have hypertension? (y/n): '))
# if ht=='y':
#     ht = 1
# elif ht=='n':
#     ht = 0
# else:
#     print('Invalid Input')

# hd = str(input('Do you suffer from any form of heart disease? (y/n): '))
# if hd=='y':
#     hd = 1
# elif hd=='n':
#     hd = 0
# else:
#     print('Invalid Input')

# sh = str(input('What is your smoking history? (never smoked/former smoker/current smoker/do not wish to disclose): '))
# if (sh=='never smoked'):
#     sh = 0
# elif (sh=='former smoker'):
#     sh = 1
# elif(sh=='current smoker'):
#     sh = 2
# elif(sh=='do not wish to disclose'):
#     sh = -1
# else:
#     print('Invalid Input')

# bmi = eval(input('Enter your bmi: '))

# hg_level = eval(input('Enter your haemoglobin level: '))

# bgl = eval(input('Enter your blood glucose level: '))

In [88]:
#input = [gend,age,ht,hd,sh,bmi,hg_level,bgl]

# Making the prediction

In [89]:
# input_data_as_np_array = np.asarray(input)

input_data_as_np_array = np.asarray([1.0, 80.0, 1, 0, 1, 27.32, 8.2, 140])
#reshaping the array
input_data_reshaped = input_data_as_np_array.reshape(1,-1)

#standardizing the input data
std_data = scaler.transform(input_data_reshaped)

prediction = classifier.predict(std_data)

if (prediction[0] == 0):
    print('The person is NOT diabetic')
else:
    print('The person is diabetic')

The person is diabetic




In [91]:
import pickle

# Save to file in the current working directory
pkl_filename = "classifier.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(classifier, file)

# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    


In [93]:
input_data_as_np_array = np.asarray([1.0, 80.0, 1, 0, 1, 27.32, 8.2, 140])
#reshaping the array
input_data_reshaped = input_data_as_np_array.reshape(1,-1)

#standardizing the input data
std_data = scaler.transform(input_data_reshaped)
classifier.predict(std_data)



array([1], dtype=int64)