<b>Dependencies:</b> <br>
    import pandas as pd <br>
    import statistics



In [1]:
# NULL/ZERO VALUES DETECTION

import pandas as pd


diabetes = pd.read_csv('../datasets/diabetes.csv')
features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
            'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']


# Null values detection
print("Number of Null values")
print(diabetes.isnull().sum(), "\n\n")


# Outliers detection: values equal to 0 detection
print("Number of values equal to 0")
zeros_number = {}
for feature in features:
    zeros_number[feature] = (diabetes[diabetes[feature] == 0]).shape[0]
print(pd.Series(data=zeros_number))



Number of Null values
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64 


Number of values equal to 0
Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64


In [2]:
# STATISTICAL ANALYSIS

import pandas as pd
import statistics


def column_stats(dataset, feature, consider_zeros=False):
    """ Provides stats of a given dataset column such as the maximum value, minimum
        value, average value and standard deviation value

        :param dataset: Dataframe with the dataset
        :param feature: str with the feature being analyzed
        :param consider_zeros: bool that indicates if zeros must be considered or not
                               for the stats
    """

    if consider_zeros:
        data =(dataset[feature].values.copy())
    else:
        data =(dataset[dataset[feature] != 0])[feature].values.copy()

    avrg = sum(data)/len(data)
    std_deviation = statistics.stdev(data)
    print("Max value in {}: {}".format(feature, data.max()))
    print("Min value in {}: {}".format(feature, data.min()))
    print("Average value in {}: {}".format(feature, avrg))
    print("Standard deviation value in {}: {}".format(feature, std_deviation), "\n")


def max_min_stats(dataset, feature, consider_zeros=False):
    """ Provides the first 10 maximum and minimum values of a given dataset column

        :param dataset: Dataframe with the dataset
        :param feature: str with the feature being analyzed
        :param consider_zeros: bool that indicates if zeros must be considered or not
                               for the stats
    """

    if consider_zeros:
        data = dataset.copy()
    else:
        data = dataset[dataset[feature] != 0].copy()

    data = data.sort_values(feature)
    print("Max values in {}: {}".format(feature, list(reversed(data[feature].values[-10:]))))
    print("Min values in {}: {}".format(feature, (data[feature].values[:10])), "\n")


diabetes = pd.read_csv('../datasets/diabetes.csv')
features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
            'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']


# Max, min, average and standard deviation values.
column_stats(dataset=diabetes, feature='Pregnancies', consider_zeros=True)
for feature in features[1:]:
    column_stats(dataset=diabetes, feature=feature)
print("\n\n")


# Top 10 max and min values
max_min_stats(dataset=diabetes, feature='Pregnancies', consider_zeros=True)
for feature in features[1:]:
    max_min_stats(dataset=diabetes, feature=feature)



Max value in Pregnancies: 17
Min value in Pregnancies: 0
Average value in Pregnancies: 3.8450520833333335
Standard deviation value in Pregnancies: 3.3166247903554 

Max value in Glucose: 199
Min value in Glucose: 44
Average value in Glucose: 121.6867627785059
Standard deviation value in Glucose: 30.528675044947494 

Max value in BloodPressure: 122
Min value in BloodPressure: 24
Average value in BloodPressure: 72.40518417462484
Standard deviation value in BloodPressure: 12.36931687685298 

Max value in SkinThickness: 99
Min value in SkinThickness: 7
Average value in SkinThickness: 29.153419593345657
Standard deviation value in SkinThickness: 10.44030650891055 

Max value in Insulin: 846
Min value in Insulin: 14
Average value in Insulin: 155.5482233502538
Standard deviation value in Insulin: 118.77289253024026 

Max value in BMI: 67.1
Min value in BMI: 18.2
Average value in BMI: 32.45746367239099
Standard deviation value in BMI: 6.924988332105903 

Max value in DiabetesPedigreeFunction: 

In [3]:
# PROCESSING INVALID VALUES

%matplotlib inline
import pandas as pd


diabetes = pd.read_csv('../datasets/diabetes.csv')
diabetes_cleaned = diabetes.copy()

rows_number = diabetes_cleaned.shape[0]
# Rows with blood glucose equal to 0 removed
diabetes_cleaned = diabetes_cleaned[diabetes_cleaned.Glucose != 0]
print(rows_number - diabetes_cleaned.shape[0], "rows with 0 Glucose removed")

rows_number = diabetes_cleaned.shape[0]
# Rows with blood pressure equal to 0 removed
diabetes_cleaned = diabetes_cleaned[diabetes_cleaned.BloodPressure != 0]
print(rows_number - diabetes_cleaned.shape[0], "rows with 0 Blood pressure removed")

rows_number = diabetes_cleaned.shape[0]
# Rows with BMI equal to 0 removed
diabetes_cleaned = diabetes_cleaned[diabetes_cleaned.BMI != 0]
print(rows_number - diabetes_cleaned.shape[0], "rows with 0 BMI removed", "\n\n")


diabetes_cleaned.to_csv('../datasets/diabetes_cleaned.csv')

print("Original dataset dimensions: {}".format(diabetes.shape))
print(diabetes.groupby('Outcome').size(), "\n\n")

print("Cleaned dataset dimensions: {}".format(diabetes_cleaned.shape))
print(diabetes_cleaned.groupby('Outcome').size())



5 rows with 0 Glucose removed
35 rows with 0 Blood pressure removed
4 rows with 0 BMI removed 


Original dataset dimensions: (768, 9)
Outcome
0    500
1    268
dtype: int64 


Cleaned dataset dimensions: (724, 9)
Outcome
0    475
1    249
dtype: int64
