<b>Dependencies:</b> <br>
    %matplotlib inline <br>
    import pandas as pd <br>
    import random <br>

    from sklearn.utils import shuffle <br>
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.feature_selection import RFE


In [1]:
# FEATURE SELECTION GRADIENT BOOSTING

import pandas as pd


# Models
from sklearn.ensemble import GradientBoostingClassifier
# Feature selection
from sklearn.feature_selection import RFE


diabetes_cleaned = pd.read_csv('../datasets/diabetes_cleaned.csv')
features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
            'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
X = diabetes_cleaned[features]
y = diabetes_cleaned.Outcome


model = GradientBoostingClassifier()
gb_rfe = RFE(estimator=model, step=1)
gb_rfe.fit(X, y)

features.reverse()
selected_features = []
for _, feature_selected in enumerate(gb_rfe.support_):
    if feature_selected:
        selected_features.append(features.pop())
    else:
        # Discarded
        features.pop()

print(selected_features)



['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']


In [5]:
# FEATURE SELECTION RANDOM FOREST

import pandas as pd

# Models
from sklearn.ensemble import RandomForestClassifier
# Feature selection
from sklearn.feature_selection import RFE


diabetes_cleaned = pd.read_csv('../datasets/diabetes_cleaned.csv')
features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
            'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
X = diabetes_cleaned[features]
y = diabetes_cleaned.Outcome


model = RandomForestClassifier()
gb_rfe = RFE(estimator=model, step=1)
gb_rfe.fit(X, y)

features.reverse()
selected_features = []
for _, feature_selected in enumerate(gb_rfe.support_):
    if feature_selected:
        selected_features.append(features.pop())
    else:
        # Discarded
        features.pop()

print(selected_features)



['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']


In [3]:
# DATA AUGMENTATION 75-25

import pandas as pd
import random
from sklearn.utils import shuffle


diabetes_cleaned = pd.read_csv('../datasets/diabetes_cleaned.csv')
train_data = diabetes_cleaned.loc[:543] #75% of samples
test_data = diabetes_cleaned.loc[543:]  #25% of samples
train_data_augmented = train_data.copy()

selected_features_int = ['Glucose', 'Age', 'BloodPressure', 'Insulin', 'Pregnancies', 'SkinThickness']

# One decimal value
selected_features_float_one = ['BMI']

# Three decimal value
selected_features_float_three = ['DiabetesPedigreeFunction']

times_repeated = 100
for _ in range(times_repeated):
    # random_variation takes values [0.01, 0.10[
    random_variation = 0.01 + random.random()/10
    df = train_data.copy()

    # Glucose and Age
    for feature in selected_features_int:
        for pos, value in enumerate(df[feature]):
            # df[feature][pos] = value  is translated to  df.__getitem__(feature).__setitem__(pos, value)
            # It is hard to predict if __getitem__ will return a view or a copy: the new value may not be changed!
            # To avoid unexpected results and SettingWithCopyWarning the following approach will be used instead
            df.at[pos, feature] = int((value + random.uniform(-value, + value) * random_variation))
    # BMI
    for feature in selected_features_float_one:
        for pos, value in enumerate(df[feature]):
            df.at[pos, feature] = round(value + random.uniform(-value, + value) * random_variation, 1)
    # DiabetesPedigreeFunction
    for feature in selected_features_float_three:
        for pos, value in enumerate(df[feature]):
            df.at[pos, feature] = round(value + random.uniform(-value, + value) * random_variation, 3)
    train_data_augmented = train_data_augmented.append(df, sort=False, ignore_index=True)


# Avoiding index column being saved
train_data_augmented = train_data_augmented.drop(columns=['Unnamed: 0'])
# Shuffling the rows
train_data_augmented = shuffle(train_data_augmented)
train_data_augmented.reset_index(inplace=True, drop=True)

train_data.to_csv('../datasets/diabetes_train_data_75pc.csv')
test_data.to_csv('../datasets/diabetes_test_data_25pc.csv')
train_data_augmented.to_csv('../datasets/diabetes_train_data_75pc_100times_10.csv')

print("Dataset dimensions after DA: {}".format(len(train_data_augmented)))



Dataset dimensions after DA: 54944


In [2]:
# DATA AUGMENTATION 80-20

import pandas as pd
import random
from sklearn.utils import shuffle


diabetes_cleaned = pd.read_csv('../datasets/diabetes_cleaned.csv')
train_data = diabetes_cleaned.loc[:579] #80% of samples
test_data = diabetes_cleaned.loc[579:]  #20% of samples
train_data_augmented = train_data.copy()

selected_features_int = ['Glucose', 'Age', 'BloodPressure', 'Insulin', 'Pregnancies', 'SkinThickness']

# One decimal value
selected_features_float_one = ['BMI']

# Three decimal value
selected_features_float_three = ['DiabetesPedigreeFunction']

times_repeated = 100
for _ in range(times_repeated):
    # random_variation takes values [0.01, 0.10[
    random_variation = 0.01 + random.random()/10
    df = train_data.copy()

    # Glucose and Age
    for feature in selected_features_int:
        for pos, value in enumerate(df[feature]):
            # df[feature][pos] = value  is translated to  df.__getitem__(feature).__setitem__(pos, value)
            # It is hard to predict if __getitem__ will return a view or a copy: the new value may not be changed!
            # To avoid unexpected results and SettingWithCopyWarning the following approach will be used instead
            df.at[pos, feature] = int((value + random.uniform(-value, + value) * random_variation))
    # BMI
    for feature in selected_features_float_one:
        for pos, value in enumerate(df[feature]):
            df.at[pos, feature] = round(value + random.uniform(-value, + value) * random_variation, 1)
    # DiabetesPedigreeFunction
    for feature in selected_features_float_three:
        for pos, value in enumerate(df[feature]):
            df.at[pos, feature] = round(value + random.uniform(-value, + value) * random_variation, 3)
    train_data_augmented = train_data_augmented.append(df, sort=False, ignore_index=True)


# Avoiding index column being saved
train_data_augmented = train_data_augmented.drop(columns=['Unnamed: 0'])
# Shuffling the rows
train_data_augmented = shuffle(train_data_augmented)
train_data_augmented.reset_index(inplace=True, drop=True)

train_data.to_csv('../datasets/diabetes_train_data_80pc.csv')
test_data.to_csv('../datasets/diabetes_test_data_20pc.csv')
train_data_augmented.to_csv('../datasets/diabetes_train_data_80pc_100times_10.csv')

print("Dataset dimensions after DA: {}".format(len(train_data_augmented)))



Dataset dimensions after DA: 58580
