In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("../Data/train.csv")

In [3]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,79.0,0,0,Yes,Self-employed,Rural,112.64,28.5,formerly smoked,0
1,Female,62.0,0,0,Yes,Private,Urban,88.32,36.3,Unknown,0
2,Female,21.0,0,0,No,Private,Rural,59.52,33.7,never smoked,0
3,Male,31.0,0,0,Yes,Govt_job,Urban,65.7,30.4,formerly smoked,0
4,Female,31.0,0,0,No,Private,Rural,59.63,19.9,never smoked,0


In [4]:
# TODO : To create a helper function for label encoding
def apply_label_encoding(dfrm):
    '''This function takes dataframe as input,
    Checks if it is already encoded, if not encodes the feature
    and returns a dataframe with encoded values'''
    binary_columns = [feat for feat in dfrm.columns if dfrm[feat].nunique() == 2 and str(dfrm[feat].dtype) != "int64"]
    le = LabelEncoder()
    for i in binary_columns:
        dfrm[i] = le.fit_transform(dfrm[i])
    return dfrm

In [5]:
# TODO : To create a helper function for OneHotEncoding
def apply_OneHotEncoding(dfrm):
    '''This function select the non binary features and applies
    OneHotEncoding to the features. It drops the firest values of 
    encoded features to avoid dummy variable trap.'''
    # select the non binary columns
    one_hot_cols = [feat for feat in dfrm.columns if dfrm[feat].nunique() > 2 and str(dfrm[feat].dtype) == 'object']
    # create an istance of one hot encoding, drop first value to avoid dummy variable trap, and set "saparse" value to False
    one = OneHotEncoder(drop="first", dtype=int, sparse=False)
    # slice the dataframe and chose the features to apply one hot encoding
    to_enc = dfrm[one_hot_cols]
    # fit and transform the features
    one.fit(to_enc)
    encoded = one.transform(to_enc)
    # get the name of the column
    cols = [name[3:] for name in list(one.get_feature_names())]
    # add the feature name and it's subsequent values to the dataframe
    dfrm[cols] = encoded
    # drop the original columns and return the new dataframe
    dfrm.drop(one_hot_cols, axis=1, inplace=True)
    return dfrm

In [6]:
# TODO : To Create a helper function for scaling 
def apply_scaling(train, test):
    '''This function select the numerical columns and applies scaling.
    NOTE : It takes two dataframes, applies fit method on train data,
    and apply transform to both train and test data'''
    # select the numerical columns
    scale_cols = [feat for feat in train.columns if str(train[feat].dtype) == "float64"]
    # create an instance of standard scaler
    scaler = StandardScaler()
    # slice the dataframe to apply feature scaling
    train_scale = train[scale_cols]
    test_scale = test[scale_cols]
    # fit the scaler to subset
    scaler.fit(train_scale)
    train_scaled = scaler.transform(train_scale)
    # apply tranformation to test data
    test_scaled = scaler.transform(test_scale)
    # save the scaled values and return the dataframe
    train[scale_cols] = train_scaled
    test[scale_cols] = test_scaled
    return train, test

In [7]:
# TODO : Split the dataset into training and testing
from sklearn.model_selection import train_test_split
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, shuffle=True)

In [8]:
# TODO : Apply scaling and encoding to features
X_train, X_test = apply_scaling(X_train, X_test)
X_train = apply_label_encoding(X_train)
X_test = apply_label_encoding(X_test)
X_train = apply_OneHotEncoding(X_train)
X_test = apply_OneHotEncoding(X_test)

In [148]:
# TODO : Handle imbalance in data using oversampling technique
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
oversample = ADASYN(sampling_strategy = 0.6, random_state=42)
undersample = RandomUnderSampler(sampling_strategy = 0.8, random_state=42)
steps = [('o', oversample), ('u', undersample)]
pipe = Pipeline(steps=steps)

In [152]:
from collections import Counter

print(f"Before oversampling: {Counter(y_train)}")
X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
print(f"After oversampling: {Counter(y_train_res)}")
X_train_res, y_train_res = undersample.fit_resample(X_train_res, y_train_res)
print(f"After undersampling: {Counter(y_train_res)}")

Before oversampling: Counter({0: 2639, 1: 113})
After oversampling: Counter({0: 2639, 1: 1575})
After undersampling: Counter({0: 1968, 1: 1575})


In [155]:
# TODO : Save the scaled and transformed data as ".csv" file in "Data" directory
X_train.to_csv("../Data/X_train.csv")
X_train_res.to_csv("../Data/X_train_res.csv")
X_test.to_csv("../Data/X_test.csv")
y_train.to_csv("../Data/y_train.csv")
y_train_res.to_csv("../Data/y_train_res.csv")
y_test.to_csv("../Data/y_test.csv")

In [None]:
# END