In [1]:
## IMPORTS
%matplotlib inline
import os
import easygui
import seaborn as sns
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler
from matplotlib import cm

# Suppress every warning for the rest of the session. NOTE(review): this is a
# blanket filter, so warnings raised while fitting distributions below are hidden too.
warnings.filterwarnings('ignore') 

In [2]:
# Read diabetes.csv (path is relative to the current working directory) into a DataFrame.
df = pd.read_csv("diabetes.csv")

In [3]:
# Keep only the float64 columns, printing the verdict for every column.
rejected = []
for col in df.columns:
    keep = df[col].dtypes == 'float64'
    verdict = "Column {} userfull" if keep else "Column {} is not userfull"
    print(verdict.format(col))
    if not keep:
        rejected.append(col)
# Drop all rejected columns in one step; leave df untouched when there are none.
if rejected:
    df = df.drop(columns=rejected)

Column Pregnancies is not userfull
Column Glucose is not userfull
Column BloodPressure is not userfull
Column SkinThickness is not userfull
Column Insulin is not userfull
Column BMI userfull
Column DiabetesPedigreeFunction userfull
Column Age is not userfull
Column Outcome is not userfull


In [4]:
# Display the frame that remains after the non-float64 columns were dropped.
df

Unnamed: 0,BMI,DiabetesPedigreeFunction
0,33.6,0.627
1,26.6,0.351
2,23.3,0.672
3,28.1,0.167
4,43.1,2.288
...,...,...
763,32.9,0.171
764,36.8,0.340
765,26.2,0.245
766,30.1,0.349


In [5]:
def get_best_distribution(data, dist_names=None):
    """Fit candidate scipy.stats distributions and return the best one.

    Every candidate is fitted to ``data`` by maximum likelihood (``dist.fit``)
    and scored with a one-sample Kolmogorov-Smirnov test against the fitted
    CDF. The candidate with the highest KS p-value wins.

    Candidates that do not exist in the installed SciPy, that raise while
    fitting/testing, or that yield a NaN p-value are skipped instead of
    aborting the whole search; the skipped names are printed once.

    Parameters
    ----------
    data : array-like
        One-dimensional sample. Non-finite values (NaN/inf) are removed
        before fitting.
    dist_names : iterable of str, optional
        Names of ``scipy.stats`` continuous distributions to try. When
        omitted, a built-in list of continuous distributions is used.

    Returns
    -------
    tuple
        ``(best_dist, best_p, best_params)``: the attribute name of the
        winning distribution in ``scipy.stats``, its KS p-value, and the
        tuple returned by ``fit`` for it.

    Raises
    ------
    ValueError
        If ``data`` holds no finite values or no candidate could be fitted.
    """
    if dist_names is None:
        # "frechet_r"/"frechet_l" are intentionally absent: they were aliases of
        # "weibull_min"/"weibull_max" (both listed below) and no longer exist in
        # recent SciPy releases.
        dist_names = ["alpha", "anglit", "arcsine", "argus", "beta", "betaprime", "bradford", "burr", "burr12",
                      "cauchy", "chi", "chi2", "cosine", "crystalball", "dgamma", "dweibull", "erlang", "expon",
                      "exponnorm", "exponweib", "exponpow", "f", "fatiguelife", "fisk", "foldcauchy", "foldnorm",
                      "genlogistic", "gennorm", "genpareto", "genexpon", "genextreme",
                      "gausshyper", "gamma", "gengamma", "genhalflogistic", "gibrat", "gompertz",
                      "gumbel_r", "gumbel_l", "halfcauchy", "halflogistic", "halfnorm", "halfgennorm", "hypsecant",
                      "invgamma", "invweibull", "johnsonsb", "johnsonsu", "kappa4", "kappa3", "ksone",
                      "kstwobign", "laplace", "levy", "levy_l", "logistic", "loggamma", "loglaplace",
                      "lognorm", "loguniform", "lomax", "maxwell", "mielke", "moyal", "nakagami", "ncx2", "ncf",
                      "norm", "pareto", "pearson3", "powerlaw", "powerlognorm", "powernorm", "rdist",
                      "rayleigh", "rice", "semicircular", "skewnorm", "t", "trapezoid", "triang",
                      "truncexpon", "uniform", "vonmises", "vonmises_line", "wald", "weibull_min",
                      "weibull_max", "wrapcauchy"]

    # Current name -> legacy spelling, tried only when the current name is
    # missing so that older SciPy installations keep working.
    legacy_names = {"gibrat": "gilbrat", "trapezoid": "trapz"}

    # Work on a flat float array without NaN/inf.
    sample = np.asarray(data, dtype=float).ravel()
    sample = sample[np.isfinite(sample)]
    if sample.size == 0:
        raise ValueError("data contains no finite values to fit")

    dist_results = []
    params = {}
    skipped = []
    for dist_name in dist_names:
        dist = getattr(st, dist_name, None)
        if dist is None and dist_name in legacy_names:
            dist_name = legacy_names[dist_name]
            dist = getattr(st, dist_name, None)
        if not hasattr(dist, "fit"):
            # Unknown name, or not a fittable distribution object.
            skipped.append(dist_name)
            continue
        try:
            param = dist.fit(sample)
            # Applying the Kolmogorov-Smirnov test against the fitted CDF.
            _, p = st.kstest(sample, dist.cdf, args=param)
        except Exception:
            # Best-effort search: one failing candidate must not abort the rest.
            skipped.append(dist_name)
            continue
        if np.isnan(p):
            # NaN compares false against everything and would corrupt max().
            skipped.append(dist_name)
            continue
        params[dist_name] = param
        dist_results.append((dist_name, p))

    if skipped:
        print("Skipped distributions (unavailable or failed to fit): " + ", ".join(skipped))
    if not dist_results:
        raise ValueError("none of the candidate distributions could be fitted")

    # select the best fitted distribution (highest KS p-value)
    best_dist, best_p = max(dist_results, key=lambda item: item[1])
    print("Best fitting distribution: "+str(best_dist))
    print("Best p value: "+ str(best_p))
    print("Parameters for the best fit: "+ str(params[best_dist]))
    return best_dist, best_p, params[best_dist]

In [6]:
# Find the best-fitting distribution for every remaining column and collect
# the winners' names in column order.
dist_list = []
for col, values in df.items():
    print("-------------- {} -------------".format(col))
    dist = get_best_distribution(values)[0]  # only the distribution name is kept
    dist_list.append(dist)
    print("----------------------------------")

-------------- BMI -------------
Best fitting distribution: johnsonsu
Best p value: 0.6110030131750179
Parameters for the best fit: (-0.07418182840818902, 1.6576717964771226, 31.479612264470276, 10.607474080243495)
----------------------------------
-------------- DiabetesPedigreeFunction -------------
Best fitting distribution: exponnorm
Best p value: 0.6178336919237628
Parameters for the best fit: (8.614506825032835, 0.13600483035238153, 0.03898905641231575)
----------------------------------


In [7]:
# Show the best-fitting distribution names collected for the columns of df.
dist_list

['johnsonsu', 'exponnorm']

In [8]:
# For every column, draw its density histogram with two fitted PDFs on top
# (a normal fit and the column's best-fitting distribution) and save the
# figure as images/<column>.jpg.
os.makedirs("images", exist_ok=True)  # savefig does not create missing directories

for col, dist_ in zip(df.columns, dist_list):
    fig, ax = plt.subplots()
    ax.hist(df[col], bins=25, density=True, alpha=0.6, color='g')

    # Evaluate both PDFs on one grid spanning the histogram's x-range.
    xmin, xmax = ax.get_xlim()
    x = np.linspace(xmin, xmax, len(df))

    # Reference curve: normal distribution fitted to the column.
    mu, std = st.norm.fit(df[col])
    ax.plot(x, st.norm.pdf(x, mu, std), "k:", linewidth=2, label="norm")

    # Best-fitting distribution. fit() returns (*shape_params, loc, scale),
    # which is the positional order pdf() accepts after x.
    dist = getattr(st, dist_)
    param = dist.fit(df[col])
    ax.plot(x, dist.pdf(x, *param), 'r', linewidth=1, label=dist_)

    ax.set(title=col, xlabel=col, ylabel="density")
    ax.legend()
    fig.savefig("images/" + col + ".jpg")
    plt.close(fig)  # free the figure so figures do not accumulate across iterations



<Figure size 432x288 with 0 Axes>

0      33.6
1      26.6
2      23.3
3      28.1
4      43.1
       ... 
763    32.9
764    36.8
765    26.2
766    30.1
767    30.4
Name: BMI, Length: 768, dtype: float64