In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("diabetes.csv")
df.shape

(768, 9)

In [3]:
df.isnull().any()

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool

In [4]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
variables = df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])]
vif['Features'] = variables.columns

In [5]:
vif

Unnamed: 0,VIF,Features
0,3.275748,Pregnancies
1,16.725078,Glucose
2,14.619512,BloodPressure
3,4.008696,SkinThickness
4,2.063689,Insulin
5,18.408884,BMI
6,3.195626,DiabetesPedigreeFunction
7,13.492985,Age


In [6]:
X = df.drop(columns = 'Outcome', axis=1)
Y = df['Outcome']

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [8]:
chi2_selector = SelectKBest(chi2,k=8)
X_kbest = chi2_selector.fit_transform(X, Y)

In [9]:
p_values=pd.Series(X_kbest[0])
p_values.index=X.columns
p_values.sort_values(ascending=False)

Glucose                     148.000
BloodPressure                72.000
Age                          50.000
SkinThickness                35.000
BMI                          33.600
Pregnancies                   6.000
DiabetesPedigreeFunction      0.627
Insulin                       0.000
dtype: float64

In [10]:
applicable_features = ["Pregnancies","Glucose","BloodPressure","SkinThickness","Insulin","BMI","DiabetesPedigreeFunction","Age"]
x_new = X[applicable_features]


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(x_new,Y, test_size = 0.2, stratify=Y, random_state=2)

In [12]:
from sklearn.ensemble import RandomForestRegressor


In [13]:
rfg_model = RandomForestRegressor(
    n_estimators=250,
    criterion='squared_error',
    random_state=2004,
)

In [14]:
rfg_model.fit(X_train, Y_train)


In [15]:
preds = rfg_model.predict(X_test)

In [16]:
for i,y in np.ndenumerate(preds):
    if y<0.5:
        preds[i] = 0
    else:
        preds[i] = 1


In [17]:
accuracy=accuracy_score(Y_test,preds.astype(int))
print(str(accuracy*100)+" % accuracy")

74.67532467532467 % accuracy


In [21]:
input_data = (1,189,60,23,846,30.1,0.398,59)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = rfg_model.predict(input_data_reshaped)
print(prediction)

[0.888]




In [20]:
filename = 'diabetes_model.sav'
pickle.dump(rfg_model, open(filename, 'wb'))