In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Loading data as data frame
diabetic_df = pd.read_csv(r"D:\SEM5\DAV\DataRepo\diabetes.csv")
diabetic_df = diabetic_df.drop("Outcome", axis=1)

In [3]:
def fill_missing_Values(df, features):
    for feature in features:
        df[feature] = df[feature].replace(0, np.nan)
        print(f"{feature:<25}   :  {df[feature].isnull().sum()}")
        df[feature] = df[feature].replace(
            np.nan, df[feature].mean()
        )
    if (not(df.isnull().sum().any())):
        print("\nMissing Values filled Successfully\n")
    return not(df.isnull().sum().any())

In [4]:
fill_missing_Values(diabetic_df, diabetic_df.columns)

Pregnancies                 :  111
Glucose                     :  5
BloodPressure               :  35
SkinThickness               :  227
Insulin                     :  374
BMI                         :  11
DiabetesPedigreeFunction    :  0
Age                         :  0

Missing Values filled Successfully



True

In [5]:
def outliers(df, feature):
    outliers = []
    mean_ = df[feature].mean()  
    sd_ = df[feature].std()    

    # Detect outliers
    for pt in df[feature]:
        z = (pt - mean_) / sd_
        if abs(z) >= 3:
            outliers.append(pt)
    
    return outliers

In [6]:
for feature in diabetic_df.columns:
    outliers_ = outliers(diabetic_df, feature)
    print(f"{feature:<25} :: Outliers ::   {outliers_}")

Pregnancies               :: Outliers ::   [15.0, 17.0, 14.0, 14.0]
Glucose                   :: Outliers ::   []
BloodPressure             :: Outliers ::   [30.0, 110.0, 122.0, 30.0, 110.0, 110.0, 24.0, 114.0]
SkinThickness             :: Outliers ::   [60.0, 56.0, 63.0, 99.0]
Insulin                   :: Outliers ::   [543.0, 846.0, 495.0, 485.0, 495.0, 478.0, 744.0, 680.0, 545.0, 465.0, 415.0, 579.0, 474.0, 480.0, 600.0, 440.0, 540.0, 480.0, 510.0]
BMI                       :: Outliers ::   [53.2, 55.0, 67.1, 59.4, 57.3]
DiabetesPedigreeFunction  :: Outliers ::   [2.288, 1.893, 1.781, 2.329, 1.476, 2.137, 1.731, 1.6, 2.42, 1.699, 1.698]
Age                       :: Outliers ::   [69, 72, 81, 70, 69]


In [7]:
def IQR_outliers(df, feature):
    q1, q3 = df[feature].quantile(0.25), df[feature].quantile(0.75)
    IQR = q3-q1
    upper_bound = q3 + (1.5 * IQR)
    lower_bound = q1 - (1.5 * IQR)
    outliers = []
    for pt in df[feature]:
        if ((pt <= lower_bound) or (pt >= upper_bound)):
            outliers.append(pt)
    return outliers

In [8]:
for feature in diabetic_df.columns:
    outliers_ = IQR_outliers(diabetic_df, feature)
    print(f"{feature:<25} :: Outliers ::   {outliers_}")

Pregnancies               :: Outliers ::   [13.0, 13.0, 13.0, 15.0, 17.0, 12.0, 12.0, 13.0, 14.0, 13.0, 12.0, 13.0, 12.0, 12.0, 12.0, 14.0, 12.0, 13.0, 12.0, 13.0, 13.0, 13.0, 12.0]
Glucose                   :: Outliers ::   []
BloodPressure             :: Outliers ::   [40.0, 30.0, 110.0, 108.0, 122.0, 30.0, 110.0, 104.0, 108.0, 104.0, 110.0, 24.0, 38.0, 106.0, 106.0, 106.0, 114.0]
SkinThickness             :: Outliers ::   [45.0, 47.0, 11.0, 47.0, 11.0, 10.0, 60.0, 13.0, 13.0, 54.0, 51.0, 56.0, 14.0, 13.0, 50.0, 44.0, 12.0, 46.0, 44.0, 13.0, 44.0, 54.0, 14.0, 7.0, 50.0, 52.0, 10.0, 44.0, 43.0, 45.0, 14.0, 10.0, 11.0, 12.0, 43.0, 13.0, 12.0, 48.0, 43.0, 43.0, 8.0, 13.0, 14.0, 12.0, 49.0, 46.0, 46.0, 11.0, 8.0, 12.0, 63.0, 12.0, 45.0, 13.0, 48.0, 13.0, 10.0, 45.0, 7.0, 52.0, 49.0, 43.0, 14.0, 47.0, 99.0, 46.0, 11.0, 50.0, 45.0, 14.0, 13.0, 13.0, 47.0, 12.0, 48.0, 43.0, 46.0, 46.0, 45.0, 10.0, 46.0, 49.0, 11.0, 13.0, 46.0, 44.0, 48.0]
Insulin                   :: Outliers ::   [543.0, 8

In [9]:
def handle_outliers(df, feature, outliers):
    mean_ = df[feature].mean()
    for idx, value in enumerate(df[feature]):
        if value in outliers_:
            df.loc[idx, feature] = mean_
    return 1

In [10]:
for feature in diabetic_df.columns:
    outliers_ = IQR_outliers(diabetic_df, feature)
    print(f"{feature:<25} :: Outliers ::   {outliers_}")
    handle_outliers(diabetic_df, feature, outliers_)

Pregnancies               :: Outliers ::   [13.0, 13.0, 13.0, 15.0, 17.0, 12.0, 12.0, 13.0, 14.0, 13.0, 12.0, 13.0, 12.0, 12.0, 12.0, 14.0, 12.0, 13.0, 12.0, 13.0, 13.0, 13.0, 12.0]
Glucose                   :: Outliers ::   []
BloodPressure             :: Outliers ::   [40.0, 30.0, 110.0, 108.0, 122.0, 30.0, 110.0, 104.0, 108.0, 104.0, 110.0, 24.0, 38.0, 106.0, 106.0, 106.0, 114.0]
SkinThickness             :: Outliers ::   [45.0, 47.0, 11.0, 47.0, 11.0, 10.0, 60.0, 13.0, 13.0, 54.0, 51.0, 56.0, 14.0, 13.0, 50.0, 44.0, 12.0, 46.0, 44.0, 13.0, 44.0, 54.0, 14.0, 7.0, 50.0, 52.0, 10.0, 44.0, 43.0, 45.0, 14.0, 10.0, 11.0, 12.0, 43.0, 13.0, 12.0, 48.0, 43.0, 43.0, 8.0, 13.0, 14.0, 12.0, 49.0, 46.0, 46.0, 11.0, 8.0, 12.0, 63.0, 12.0, 45.0, 13.0, 48.0, 13.0, 10.0, 45.0, 7.0, 52.0, 49.0, 43.0, 14.0, 47.0, 99.0, 46.0, 11.0, 50.0, 45.0, 14.0, 13.0, 13.0, 47.0, 12.0, 48.0, 43.0, 46.0, 46.0, 45.0, 10.0, 46.0, 49.0, 11.0, 13.0, 46.0, 44.0, 48.0]
Insulin                   :: Outliers ::   [543.0, 8

  df.loc[idx, feature] = mean_


In [11]:
for feature in diabetic_df.columns:
    outliers_ = IQR_outliers(diabetic_df, feature)
    print(f"{feature:<25} :: Outliers ::   {outliers_}")

Pregnancies               :: Outliers ::   []
Glucose                   :: Outliers ::   []
BloodPressure             :: Outliers ::   []
SkinThickness             :: Outliers ::   [35.0, 35.0, 19.0, 38.0, 41.0, 35.0, 15.0, 19.0, 36.0, 37.0, 42.0, 18.0, 39.0, 15.0, 21.0, 42.0, 39.0, 41.0, 20.0, 35.0, 20.0, 20.0, 22.0, 19.0, 15.0, 40.0, 18.0, 18.0, 18.0, 15.0, 39.0, 42.0, 36.0, 37.0, 20.0, 37.0, 35.0, 17.0, 42.0, 15.0, 21.0, 19.0, 41.0, 38.0, 40.0, 18.0, 42.0, 42.0, 18.0, 20.0, 36.0, 41.0, 39.0, 35.0, 20.0, 41.0, 16.0, 20.0, 16.0, 22.0, 40.0, 41.0, 22.0, 15.0, 38.0, 39.0, 37.0, 21.0, 21.0, 22.0, 35.0, 15.0, 19.0, 35.0, 39.0, 22.0, 16.0, 15.0, 15.0, 18.0, 42.0, 37.0, 15.0, 39.0, 17.0, 37.0, 38.0, 41.0, 37.0, 19.0, 37.0, 17.0, 22.0, 39.0, 20.0, 21.0, 36.0, 19.0, 16.0, 18.0, 21.0, 35.0, 36.0, 19.0, 19.0, 40.0, 40.0, 36.0, 37.0, 17.0, 16.0, 22.0, 40.0, 15.0, 37.0, 39.0, 22.0, 18.0, 36.0, 40.0, 41.0, 39.0, 22.0, 35.0, 21.0, 41.0, 18.0, 22.0, 39.0, 16.0, 15.0, 37.0, 18.0, 22.0, 40.0, 36.0, 41