## Python ML Pipeline

In [None]:
# Load the `autotime` IPython extension so that the run time of every cell
# executed after this one is reported below the cell.
%load_ext autotime

In [None]:
# Display the current working directory; the relative input/output paths
# configured in this notebook are resolved against it.
%pwd

In [None]:
# Path of the input dataset (user input required).
# Uncomment the next line to be prompted for the path instead of hard-coding it.
# input_path = input("Enter the path of the input dataset - ")

# NOTE: the backslash separators make this relative path Windows-specific.
input_path = r"..\new_data\sample_10k.csv"

In [None]:
# Name of the target (dependent variable) column in the input data.
target = "dep_var"

In [None]:
# Path of the output folder (user input required).
# output_path = input("Enter the path of the output folder - ")
import os  # imported here because this cell runs before the main import cell

output_path = r"..\notebooks\outputs7"

# Sub-folders the later cells write to. `dtype_path`, `eda_path` and `fs_path`
# were used further down without being defined anywhere, so a fresh kernel
# stopped with a NameError at the first export.
# NOTE(review): the sub-folder names are placeholders - adjust them if the
# original layout used different names.
dtype_path = os.path.join(output_path, "dtypes")
eda_path = os.path.join(output_path, "eda")
fs_path = os.path.join(output_path, "feature_selection")

# Create the folders up front so that every export below finds its target.
for _out_dir in (output_path, dtype_path, eda_path, fs_path):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# loading the required libraries
import numpy as np
import pandas as pdra
from pandas import Series
import pandas.core.algorithms as algos
import scipy.stats.stats as stats
import os, sys, re, glob, gc, time, klib, traceback, string
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from collections import Counter
from mlxtend.evaluate import bias_variance_decomp

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PowerTransformer, MinMaxScaler, StandardScaler, KBinsDiscretizer
from sklearn.feature_selection import SelectKBest, f_classif, RFE, VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, matthews_corrcoef, log_loss
from boruta import BorutaPy
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import xgboost
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.base import BaseEstimator, TransformerMixin

import random
random.seed(42)

# set up to ignore warnings
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# helper that displays a dataframe without truncating its rows or columns
def display_all(df):
    """Display `df` with the row and column display limits raised to 10000 each."""
    display_limits = ("display.max_rows", 10000, "display.max_columns", 10000)
    with pd.option_context(*display_limits):
        display(df)

# make sure a folder exists, creating it (and any missing parent folders) if needed
def path_exists(path):
    """Create the directory `path` unless it already exists.

    Parameters
    ----------
    path : str or os.PathLike
        Folder to create; missing parent folders are created as well.

    Returns
    -------
    None

    Raises
    ------
    FileExistsError
        If `path` exists but is not a directory.
    """
    # exist_ok=True replaces the former exists()-then-makedirs() sequence, which
    # could fail if the folder appeared between the check and the creation.
    os.makedirs(path, exist_ok=True)

In [None]:
# add_datepart expands datetime columns of df into several date-part feature columns. The changes are applied in place.
def add_datepart(df, fldnames, drop=True, time=False, errors="raise"):
    """Replace datetime columns of `df` by columns holding their date parts.

    For every field, one column per date attribute is added to `df` (in place).
    The new column names are the field name, with a trailing 'date'/'Date'
    removed, followed by the attribute name (e.g. 'sale_date' -> 'sale_Year').

    Parameters
    ----------
    df : pd.DataFrame
        Frame that is modified in place.
    fldnames : str or list of str
        Name(s) of the column(s) to expand.
    drop : bool, default True
        Drop the original column after expanding it.
    time : bool, default False
        Also add 'Hour', 'Minute' and 'Second' columns.
    errors : str, default "raise"
        Forwarded to `pd.to_datetime` for columns that are not datetimes yet.

    Returns
    -------
    None
    """
    if isinstance(fldnames, str):
        fldnames = [fldnames]
    for fldname in fldnames:
        fld = df[fldname]
        fld_dtype = fld.dtype
        # tz-aware columns use a pandas extension dtype; map it to the NumPy
        # datetime type so that the np.issubdtype check below recognises it
        if isinstance(fld_dtype, pd.DatetimeTZDtype):
            fld_dtype = np.datetime64

        if not np.issubdtype(fld_dtype, np.datetime64):
            # `infer_datetime_format` is no longer passed: it is deprecated and
            # newer pandas versions infer the format on their own
            df[fldname] = fld = pd.to_datetime(fld, errors=errors)
        targ_pre = re.sub('[Dd]ate$', '', fldname)
        attr = ['Year', 'Quarter', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
                'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
        if time:
            attr = attr + ['Hour', 'Minute', 'Second']
        for n in attr:
            if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
                # `Series.dt.week` was removed from pandas; isocalendar() is the
                # replacement. Cast to the dtype of the other date parts so the
                # column does not end up with a nullable UInt32 dtype.
                df[targ_pre + n] = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
            else:
                df[targ_pre + n] = getattr(fld.dt, n.lower())
        if drop:
            df.drop(fldname, axis=1, inplace=True)

In [None]:
# Load the input dataset into df. The two columns listed in `parse_dates` are
# parsed as datetimes while reading (the names are specific to this dataset).
df = pd.read_csv(input_path, low_memory=False, parse_dates=["PURCHASE_DATE", "DISPOSED_DATE"])
print(df.shape)
display_all(df.head())

In [None]:
# Give the target column the standard name 'perf_flag' used by the rest of the notebook.
df = df.rename(columns={target: 'perf_flag'})

In [None]:
# Treat cells that are empty or contain only whitespace as missing values (NaN).
blank_cell_pattern = r'^\s*$'
df = df.replace(blank_cell_pattern, np.nan, regex=True)
print(df.shape)

In [None]:
# Normalise the column names: trim surrounding spaces, lower-case them and
# replace the spaces between words with underscores.
df.columns = [name.strip().lower().replace(' ', '_') for name in df.columns]
df.columns

### Data exploration

In [None]:
# Check the response rate of the target variable.
# The series is passed as `x=`: newer seaborn versions treat the first
# positional argument as `data`, which does not produce the per-class counts.
ax = sns.countplot(x=df['perf_flag'], label="Count")

# value_counts() orders by frequency, so unpacking it directly labelled the
# larger class "Non-Responders" whichever class that was. Sorting by class label
# keeps the assignment stable.
# NOTE(review): assumes a binary target whose non-responder label sorts first (e.g. 0/1).
class_share = df['perf_flag'].value_counts(normalize=True).sort_index() * 100
NR, R = class_share
print("Non-Responders: {0:.2f}%".format(NR))
print("Responders: {0:.2f}%".format(R))

In [None]:
# Save every column's dtype to a CSV file so that the user can review it and
# spot columns that were read with the wrong datatype.
# NOTE(review): `dtype_path` is not assigned above this cell in the visible part of
# the notebook; it has to be defined (and the folder created) before this cell runs.
df.dtypes.to_csv(f'{dtype_path}\\Column_Datatypes_Original.csv')

display_all(df.head())

In [None]:
# Change the dtype of selected columns from numeric to string (user input required).

new_col_dtype = {'svc_vin_loyality_indx': 'str', 'sls_vin_loyalty_indx': 'str', 'svc_hhh_loyality_indx': 'str', 'family_composition': 'str', 'advg_home_owner': 'str', 'liquid_resources': 'str', 'target_net_worth_3_cd': 'str'}

# astype('str') turns NaN into the literal text 'nan'. The missing-value report
# would then show 0% missing for these columns, they would never be dropped by
# the >80% rule and the categorical imputer would not see them as missing.
# Remember where the values were missing and restore NaN after the cast.
cast_cols = list(new_col_dtype)
missing_before_cast = df[cast_cols].isna()
df = df.astype(new_col_dtype)
df[cast_cols] = df[cast_cols].mask(missing_before_cast)

In [None]:
# Summary of the dataset as printed by DataFrame.info().
df.info()

In [None]:
# Separate the target series (y) from the feature matrix (X).
y = df.loc[:, 'perf_flag'].copy()  # target
X = df.drop(columns=['perf_flag'])  # features

In [None]:
# Show the dtype of every feature column (without truncating the list).
display_all(X.dtypes)

In [None]:
# Percentage of missing values per feature; only features with at least one
# missing value are displayed, highest percentage first.
n_rows_total = len(X)
miss_perc_by_col = X.isnull().sum().div(n_rows_total).mul(100)
has_missing = miss_perc_by_col > 0
display_all(miss_perc_by_col.loc[has_missing].sort_values(ascending=False))

In [None]:
# Drop the features in which more than 80% of the values are missing.
miss_perc_df = miss_perc_by_col.reset_index()
miss_perc_df.columns = ['col_nm', 'miss_perc']

above_limit = miss_perc_df['miss_perc'] > 80
rm_miss_lst = miss_perc_df.loc[above_limit, 'col_nm'].tolist()
print('list of features with missing value >80% :', rm_miss_lst)

X = X.drop(columns=rm_miss_lst)
print(X.shape)

In [None]:
# Collect the names of the feature columns whose dtype is datetime64[ns].
dt_time_cols = [col for col, col_dtype in X.dtypes.items() if col_dtype == 'datetime64[ns]']

print("Date time variables:")
display_all(dt_time_cols)

In [None]:
# Replace every datetime64 column by several columns holding its date parts.
# add_datepart modifies X in place, so re-running this cell after the date columns
# have been dropped will fail unless X is rebuilt first.
add_datepart(X, dt_time_cols)
print(X.shape)
display_all(X.head())

In [None]:
# Collect the names of the numeric features, i.e. every column that is neither
# object, datetime64[ns] nor category.
non_numeric_dtypes = ('object', 'datetime64[ns]', 'category')
numeric_cols = [
    col for col, col_dtype in X.dtypes.items()
    if all(col_dtype != excluded for excluded in non_numeric_dtypes)
]

print("Number of numeric varibles:", len(numeric_cols))

In [None]:
# Summary of the feature matrix as printed by DataFrame.info().
X.info()

#### Descriptive Statistics

In [None]:
# Descriptive statistics of the features, including several upper-tail percentiles.
report_percentiles = [0.25, .5, .75, .9, .95, .99, .995]
desc_stats = X.describe(percentiles=report_percentiles)
display_all(desc_stats)

In [None]:
# Generate the EDA summary report (data deep dive) and write it as an HTML file.
# NOTE(review): the pandas_profiling package has been renamed to ydata-profiling -
# confirm which of the two the target environment provides.
from pandas_profiling import ProfileReport

# NOTE(review): `eda_path` is not assigned above this cell in the visible part of the notebook.
profile = ProfileReport(X, title='Pandas Profiling Report', html={'style':{'full_width':True}}, minimal=True)
profile.to_file(f"{eda_path}\\01_EDA_Summary_Report.html")

#### Visualization

In [None]:
# Numeric data distribution and KDE (kernel density estimation) by target variable.
# One PDF page is written per numeric column: a histogram with KDE on top and
# the KDE split by the target below.
numeric_plot_dtypes = ['float64', 'float32', 'int64', 'int32', 'int16', 'int8']

# The context manager closes the PDF file even if plotting one column fails.
with PdfPages(f'{eda_path}\\03_Numeric_Data_Distribution.pdf') as pp:
    sns.set_style("darkgrid")
    for i in X.select_dtypes(include=numeric_plot_dtypes).columns:
        tmp = X[i]
        # plt.subplots() creates the figure itself. The additional plt.figure()
        # call that used to precede it left one never-closed figure per column.
        fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)

        sns.histplot(data=tmp, x=tmp, bins=20, kde=True, ax=ax1).set_title('Histogram and Kernel density estimation')
        sns.kdeplot(data=tmp, x=tmp, hue=y, ax=ax2)

        fig.tight_layout()
        pp.savefig(fig)
        # close exactly the figure that was saved to keep memory usage flat
        plt.close(fig)

In [None]:
# function to visualize categorical data distributions

# Imports
import matplotlib.ticker as ticker
from matplotlib.colors import LinearSegmentedColormap, to_rgb
from typing import Any, Dict, Optional, Tuple, Union

from klib.utils import (
    _corr_selector,
    _missing_vals,
    _validate_input_bool,
    _validate_input_int,
    _validate_input_range,
    _validate_input_smaller,
    _validate_input_sum_larger,
)

# Functions

# Categorical Plot
def custom_cat_plot(
    data: pd.DataFrame,
    figsize: Tuple = (18, 18),
    top: int = 3,
    bottom: int = 3,
    bar_color_top: str = "#5ab4ac",
    bar_color_bottom: str = "#d8b365",
):
    """Plot the most and least frequent values of every categorical column.

    Each non-numeric column of ``data`` gets one bar chart (most frequent values
    in ``bar_color_top``, least frequent values in ``bar_color_bottom``) and a
    text panel with the number of unique values and the share of rows that the
    plotted values cover.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame \
        is provided, the index/column information is used to label the plots
    figsize : Tuple, optional
        Use to control the figure size, by default (18, 18)
    top : int, optional
        Show the "top" most frequent values in a column, by default 3
    bottom : int, optional
        Show the "bottom" least frequent values in a column, by default 3
    bar_color_top : str, optional
        Use to control the color of the bars indicating the most common values, by \
        default "#5ab4ac"
    bar_color_bottom : str, optional
        Use to control the color of the bars indicating the least common values, by \
        default "#d8b365"

    Returns
    -------
    GridSpec or None
        gs: grid specification whose ``figure`` holds the plots; None when
        ``data`` contains no categorical column
    """

    # Validate Inputs
    _validate_input_int(top, "top")
    _validate_input_int(bottom, "bottom")
    _validate_input_range(top, "top", 0, data.shape[1])
    _validate_input_range(bottom, "bottom", 0, data.shape[1])
    _validate_input_sum_larger(1, "top and bottom", top, bottom)

    data = pd.DataFrame(data).copy()
    cols = data.select_dtypes(exclude=["number"]).columns.tolist()
    data = data[cols]

    if len(cols) == 0:
        print("No columns with categorical data were detected.")
        return None

    # Work on plain object columns so that value_counts() only reports values
    # that actually occur (categoricals also list unobserved categories).
    for col in data.columns:
        if data[col].dtype.name in ("category", "string"):
            data[col] = data[col].astype("object")

    n_rows = data.shape[0]
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(nrows=6, ncols=len(cols), wspace=0.21)

    for count, col in enumerate(cols):
        n_unique = data[col].nunique(dropna=True)
        value_counts = data[col].value_counts()
        lim_top, lim_bot = top, bottom

        if n_unique < top + bottom:
            # Too few distinct values to fill both groups: split them so that no
            # value is drawn (and counted) in both the top and the bottom group.
            lim_top = min(top, n_unique // 2)
            lim_bot = min(bottom, n_unique - lim_top)

        # Explicit start positions: a slice like [-0:] would return every value
        # instead of none.
        value_counts_top = value_counts.iloc[:lim_top]
        value_counts_bot = value_counts.iloc[len(value_counts) - lim_bot:]

        # Tick labels are truncated to 20 characters; str() keeps this working
        # for non-string values stored in object columns.
        labels_top = [str(elem)[:20] for elem in value_counts_top.index]
        labels_bot = [str(elem)[:20] for elem in value_counts_bot.index]
        sum_top = int(value_counts_top.sum())
        sum_bot = int(value_counts_bot.sum())
        share_top = sum_top / n_rows * 100 if n_rows else 0.0
        share_bot = sum_bot / n_rows * 100 if n_rows else 0.0

        # Barcharts
        ax_top = fig.add_subplot(gs[:1, count : count + 1])
        ax_top.set_title(col)
        ax_top.bar(
            labels_top, value_counts_top.to_numpy(), color=bar_color_top, width=0.85
        )
        ax_top.bar(
            labels_bot, value_counts_bot.to_numpy(), color=bar_color_bottom, width=0.85
        )
        ax_top.set(frame_on=False)
        ax_top.tick_params(axis="x", labelrotation=90)

        # Summary stats
        ax_bottom = fig.add_subplot(gs[1:2, count : count + 1])
        ax_bottom.get_yaxis().set_visible(False)
        ax_bottom.get_xaxis().set_visible(False)
        ax_bottom.set(frame_on=False)
        ax_bottom.text(
            0,
            0,
            f"Unique values: {n_unique}\n\n"
            f"Top {lim_top} vals: {sum_top} ({share_top:.1f}%)\n"
            f"Bot {lim_bot} vals: {sum_bot} ({share_bot:.1f}%)",
            transform=ax_bottom.transAxes,
            color="#111111",
            fontsize=11,
        )

    # same vertical spacing for all panels; set once instead of once per column
    fig.subplots_adjust(hspace=0.075)

    gs.figure.suptitle(
        "Categorical data plot", x=0.5, y=0.91, fontsize=18, color="#111111"
    )

    return gs

In [None]:
# Categorical data distribution at overall level (top/bottom 5 values per column).
plot = custom_cat_plot(X.select_dtypes(include=['category', 'object']), (36, 20), top=5, bottom=5)

# custom_cat_plot returns None when there is no categorical column to plot
if plot is not None:
    plot.figure.savefig(f"{eda_path}\\04_Categorical_Data_Distribution_01.pdf", bbox_inches='tight', dpi=100)
    plt.close(plot.figure)

# distribution by target variable
def plot_fig(plot, pdf):
    """Append `plot` as one page to the open PDF `pdf` and close its figure.

    Parameters
    ----------
    plot : object
        Either an object with a `draw()` method whose result holds the figure,
        or an object that exposes the figure as `.figure` / `.fig` (such as the
        grid returned by `sns.catplot`).
    pdf : PdfPages
        Open multi-page PDF that the page is written to.

    Returns
    -------
    None
    """
    # Only fall back to `plot` itself when it has no draw() method. The former
    # bare `except:` also hid real drawing errors and keyboard interrupts.
    draw = getattr(plot, "draw", None)
    rendered = draw() if callable(draw) else plot

    figure = getattr(rendered, "figure", None)
    if figure is None:
        figure = getattr(rendered, "fig", rendered)

    # `height`/`width` are not savefig options (set the size on the figure
    # instead) and newer matplotlib versions reject unknown keyword arguments.
    pdf.savefig(figure, dpi=500, bbox_inches='tight', pad_inches=0.5)
    plt.close(figure)

# One count plot per categorical column, split into one panel per perf_flag value.
# The context manager closes the PDF file even if plotting one column fails.
with PdfPages(f'{eda_path}\\04_Categorical_Data_Distribution_02.pdf') as pp:
    sns.set_style("darkgrid")
    for i in X.select_dtypes(include=['category', 'object']).columns:
        g = sns.catplot(x=i, col="perf_flag", data=df, kind="count")
        if i.endswith('indx'):
            # the index columns have many levels: rotate the tick labels for readability
            for ax in g.axes.ravel():
                ax.tick_params(axis='x', labelrotation=70)
        plot_fig(g, pp)

In [None]:
# Generate the correlation matrix (absolute values, rounded to 2 decimals).
# Only numeric/boolean columns are used: X still contains categorical columns
# at this point and DataFrame.corr() raises on those in recent pandas versions.
corr = np.round(X.select_dtypes(include=['number', 'bool']).corr().abs(), 2)
# colour gradient for the Excel export further down (corr is a Styler from here on)
corr = corr.style.background_gradient(cmap='Oranges')

In [None]:
# Correlation matrix of the numeric columns computed with klib, using
# split='high' and threshold=0.7 (absolute correlations above 0.7).

corr_mat = klib.corr_mat(X.select_dtypes(include = ['float64', 'float32', 'int64', 'int32', 'int16', 'int8']), split='high', threshold=0.7)

In [None]:
# Store the EDA outputs in one Excel workbook, one sheet per result.
with pd.ExcelWriter(f'{eda_path}\\02_Data_Exploration.xlsx') as writer:
    eda_sheets = [
        (X.dtypes.to_frame(), 'InputData_Column_Datatypes'),
        (miss_perc_df, 'Missing_Percentage'),
        (desc_stats, 'Numeric_Descriptive_Stats'),
        (corr, 'Correlation_Matrix'),
        (corr_mat, 'Highly_Correlated_Features'),
    ]
    for sheet_content, sheet_name in eda_sheets:
        sheet_content.to_excel(writer, sheet_name=sheet_name)

In [None]:
# Split the data into training (80%) and validation (20%) sets.
# The fixed random_state makes the split identical on every run. The indexes of
# all four parts are reset so that they line up positionally later on.
X_train, X_valid, y_train, y_valid = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.2, random_state=0)
)

print('X_train:', X_train.shape, ' X_valid:', X_valid.shape, ' y_train:', y_train.shape, ' y_valid:', y_valid.shape)

### Missing value treatment 

In [None]:
# Imputing missing values - Numeric columns

non_categorical_train = X_train.select_dtypes(exclude=['category', 'object'])

# With the default settings SimpleImputer drops columns that have no observed
# value in the training data. That would misalign the column names restored
# below, so such columns are removed explicitly from both datasets.
all_missing_cols = [col for col in non_categorical_train.columns if non_categorical_train[col].isnull().all()]

# Explicit copies, so that adding the indicator columns neither touches
# X_train / X_valid nor writes to a possibly shared selection
X_train_num = non_categorical_train.drop(columns=all_missing_cols).copy()
X_valid_num = X_valid.select_dtypes(exclude=['category', 'object']).drop(columns=all_missing_cols).copy()

# Names of the columns with missing values (decided on the training data only)
cols_with_missing = [col for col in X_train_num.columns if X_train_num[col].isnull().any()]

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_num[col + '_was_missing'] = X_train_num[col].isnull()
    X_valid_num[col + '_was_missing'] = X_valid_num[col].isnull()

# Mean Imputation (other available strategies - median, most frequent, constant);
# the means are learned on the training data and re-used for the validation data
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_X_train_num = pd.DataFrame(imp_mean.fit_transform(X_train_num), columns=X_train_num.columns)
imputed_X_valid_num = pd.DataFrame(imp_mean.transform(X_valid_num), columns=X_valid_num.columns)

In [None]:
# Imputing missing values - Categorical columns

# Categorical part of the training and validation features
categorical_dtypes = ['category', 'object']
X_train_cat = X_train.select_dtypes(include=categorical_dtypes)
X_valid_cat = X_valid.select_dtypes(include=categorical_dtypes)

# Constant imputation with the label "NA" (other available strategy - most frequent).
# The imputer returns plain arrays, so the column names are re-attached directly.
imp_cat = SimpleImputer(strategy='constant', fill_value="NA")
imputed_X_train_cat = pd.DataFrame(imp_cat.fit_transform(X_train_cat), columns=X_train_cat.columns)
imputed_X_valid_cat = pd.DataFrame(imp_cat.transform(X_valid_cat), columns=X_valid_cat.columns)

In [None]:
# Concatenate the imputed numeric and categorical dataframes column-wise.
# Both parts are built in the cells above from the same split with a fresh
# default index, so the rows line up.
imputed_X_train = pd.concat([imputed_X_train_num, imputed_X_train_cat], axis=1)
imputed_X_valid = pd.concat([imputed_X_valid_num, imputed_X_valid_cat], axis=1)

print('imputed_X_train:', imputed_X_train.shape, ' imputed_X_valid:', imputed_X_valid.shape)
print('Sample rows from imputed_X_train:')
display_all(imputed_X_train.head())

### One-hot encoding for categorical features

In [None]:
# Names of the categorical (object / category) columns that will be one-hot encoded
object_cols = [
    col for col, col_dtype in imputed_X_train.dtypes.items()
    if col_dtype == 'object' or col_dtype == 'category'
]

print("Number of Categorical variables: ", len(object_cols))
print(object_cols)

In [None]:
# Apply one-hot encoder to each column with categorical data.
# The dense-output switch was renamed (`sparse` -> `sparse_output`) and the old
# name was removed in newer scikit-learn releases. Using the default sparse
# output and densifying it works with every version.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
oh_train_values = OH_encoder.fit_transform(imputed_X_train[object_cols]).toarray()
oh_valid_values = OH_encoder.transform(imputed_X_valid[object_cols]).toarray()

# `get_feature_names` was replaced by `get_feature_names_out`; use whichever exists
if hasattr(OH_encoder, 'get_feature_names_out'):
    oh_col_names = OH_encoder.get_feature_names_out(object_cols)
else:
    oh_col_names = OH_encoder.get_feature_names(object_cols)

# One-hot encoding removed the column names and the index; put them back
OH_cols_train = pd.DataFrame(oh_train_values, columns=oh_col_names, index=imputed_X_train.index)
OH_cols_valid = pd.DataFrame(oh_valid_values, columns=oh_col_names, index=imputed_X_valid.index)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = imputed_X_train.drop(object_cols, axis=1)
num_X_valid = imputed_X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
imputed_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
imputed_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [None]:
# Print the shapes and a few sample rows after one-hot encoding.
print('imputed_X_train:', imputed_X_train.shape, ' imputed_X_valid:', imputed_X_valid.shape)
print('Sample rows from imputed_X_train:')
display_all(imputed_X_train.head())

In [None]:
# Store all features as float32 to reduce memory usage.
imputed_X_train = imputed_X_train.astype(np.float32)
imputed_X_valid = imputed_X_valid.astype(np.float32)

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() printed the label after the report, followed by a stray "None".
print("imputed_X_train")
imputed_X_train.info()
print("-"*60)
print("imputed_X_valid")
imputed_X_valid.info()

### Feature selection (Prior to Variable transformation)

In [None]:
# Feature selection with Boruta (before variable transformation), using a
# shallow random forest as the underlying estimator.
clf = RandomForestClassifier(n_estimators=50, n_jobs=-1, max_depth=5, random_state=0)

# NOTE(review): BorutaPy is given n_estimators=100 while clf is built with 50 -
# presumably the BorutaPy value takes precedence; confirm against the boruta docs.
boruta_feature_selector = BorutaPy(clf, n_estimators=100, random_state=42, verbose=2, max_iter=50, perc=75)
boruta_feature_selector.fit(imputed_X_train.values, y_train.values)

In [None]:
# Create the list of features selected by Boruta.
features = imputed_X_train.columns

# Boolean-mask indexing replaces the np.where + np.nditer loop, which raises a
# ValueError when Boruta confirms no feature at all (empty index array).
final_features = list(features[boruta_feature_selector.support_])
print('number of shortlisted features: ', len(final_features))

# Naming the column at construction time also works for an empty selection
pd.DataFrame({'column_names': final_features}).to_csv(f"{fs_path}\\02_FeaturesList_Before_VarTransform_Boruta.csv", index=False)

In [None]:
# Keep only the features selected by Boruta. The selector returns plain arrays,
# so the column names are re-attached from `final_features`.
# Note: this overwrites imputed_X_train / imputed_X_valid, so the cell cannot be
# re-run without re-running the preceding cells first.
imputed_X_train = pd.DataFrame(boruta_feature_selector.transform(imputed_X_train.values), columns = final_features)
imputed_X_valid = pd.DataFrame(boruta_feature_selector.transform(imputed_X_valid.values), columns = final_features)

print('imputed_X_train:', imputed_X_train.shape, ' imputed_X_valid:', imputed_X_valid.shape)
print('Sample rows from imputed_X_train:')
display_all(imputed_X_train.head())

### Applying Variable transformations on Input features

In [None]:
# Selected features that belong to the original numeric columns
col_names = [f for f in imputed_X_train.columns if f in numeric_cols]


def _offset_for_minimum(min_value):
    """Return the offset that lifts a column with minimum `min_value` to a minimum of at least 1."""
    if min_value > 1 or min_value == 1:
        return 0
    if min_value == 0:
        return 1
    if min_value > 0 and min_value < 1:
        return 1 - min_value
    return abs(min_value) + 1


# Offset per column, computed from the training minimum of that column
offset_df = pd.DataFrame(imputed_X_train[col_names].min(), columns=['min_value'])
offset_df['offset_value'] = offset_df['min_value'].apply(_offset_for_minimum)

offset_df = offset_df['offset_value']
print(offset_df.shape)
display(offset_df.head())

In [None]:
# Shift every numeric column by its offset (matched by column name)
scaled_X_train = imputed_X_train[col_names] + offset_df
scaled_X_valid = imputed_X_valid[col_names] + offset_df

# Validation values can still fall below 1 because the offsets come from the
# training minima; raise those values to 1
scaled_X_valid = scaled_X_valid.clip(lower=1)

print('scaled_X_train:', scaled_X_train.shape, ' scaled_X_valid:', scaled_X_valid.shape)
print('Sample rows from scaled_X_train:')
display_all(scaled_X_train.head())

In [None]:
# Variable transformations
# Every numeric column is transformed in seven ways; each result is stored in a
# column named after the original column plus a suffix naming the transformation.

# transformer objects
log_transform = FunctionTransformer(np.log, validate=True)
sqrt_transform = FunctionTransformer(np.sqrt, validate=True)
reciprocal_transform = FunctionTransformer(np.reciprocal, validate=True)
exp_transform = FunctionTransformer(lambda x: x**(2), validate=True)  # squares the values
yeo_johnson_transform = PowerTransformer(method='yeo-johnson', standardize=False)
norm_transform = MinMaxScaler()
std_transform = StandardScaler()


def _suffixed_frame(values, suffix):
    """Wrap transformed `values` in a dataframe whose columns are `col_names` + `suffix`."""
    return pd.DataFrame(values, columns=col_names).add_suffix(suffix)


# training data: the stateful transformers are fitted here
train_numeric = scaled_X_train[col_names]
X_train_log = _suffixed_frame(log_transform.transform(train_numeric), '_log')
X_train_sqrt = _suffixed_frame(sqrt_transform.transform(train_numeric), '_sqrt')
X_train_reciprocal = _suffixed_frame(reciprocal_transform.transform(train_numeric), '_reciprocal')
X_train_exp = _suffixed_frame(exp_transform.transform(train_numeric), '_exp')
X_train_yeo_johnson = _suffixed_frame(yeo_johnson_transform.fit_transform(train_numeric), '_yeo_johnson')
X_train_norm = _suffixed_frame(norm_transform.fit_transform(train_numeric), '_norm')
X_train_std = _suffixed_frame(std_transform.fit_transform(train_numeric), '_std')

# original features followed by all transformed versions
X_train_imp_transform = pd.concat(
    [imputed_X_train, X_train_log, X_train_sqrt, X_train_reciprocal, X_train_exp,
     X_train_yeo_johnson, X_train_norm, X_train_std],
    axis=1,
)

# validation data: re-use the transformers fitted on the training data
valid_numeric = scaled_X_valid[col_names]
X_valid_log = _suffixed_frame(log_transform.transform(valid_numeric), '_log')
X_valid_sqrt = _suffixed_frame(sqrt_transform.transform(valid_numeric), '_sqrt')
X_valid_reciprocal = _suffixed_frame(reciprocal_transform.transform(valid_numeric), '_reciprocal')
X_valid_exp = _suffixed_frame(exp_transform.transform(valid_numeric), '_exp')
X_valid_yeo_johnson = _suffixed_frame(yeo_johnson_transform.transform(valid_numeric), '_yeo_johnson')
X_valid_norm = _suffixed_frame(norm_transform.transform(valid_numeric), '_norm')
X_valid_std = _suffixed_frame(std_transform.transform(valid_numeric), '_std')

X_valid_imp_transform = pd.concat(
    [imputed_X_valid, X_valid_log, X_valid_sqrt, X_valid_reciprocal, X_valid_exp,
     X_valid_yeo_johnson, X_valid_norm, X_valid_std],
    axis=1,
)

print('X_train_imp_transform:', X_train_imp_transform.shape, ' X_valid_imp_transform:', X_valid_imp_transform.shape)
print('Sample rows from X_train_imp_transform:')
display_all(X_train_imp_transform.head())

In [None]:
# Binning numeric variables using KBinsDiscretizer
col_names = [f for f in X_train_imp_transform.columns if f in numeric_cols]

print("number of numeric cols in input data: ", len(col_names))

# 5 k-means based bins per column, returned as one-hot columns (sparse matrix)
kmean = KBinsDiscretizer(n_bins=5, encode='onehot', strategy='kmeans')

# training data: learn the bin edges and transform
X_train_kmean = pd.DataFrame.sparse.from_spmatrix(kmean.fit_transform(X_train_imp_transform[col_names]))

# validation data: re-use the bin edges learned on the training data
X_valid_kmean = pd.DataFrame.sparse.from_spmatrix(kmean.transform(X_valid_imp_transform[col_names]))

print('X_train_kmean:', X_train_kmean.shape, ' X_valid_kmean:', X_valid_kmean.shape)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print()
print("Info of X_train_kmean: ")
X_train_kmean.info()
print('Sample rows from X_train_kmean:')
display_all(X_train_kmean.head())

In [None]:
# Build the names of the one-hot columns created by KBinsDiscretizer:
# "<column>_<lower edge>_to_<upper edge>", with the edges rounded to 4 decimals
kmean_cols = []
for source_col, edges in zip(col_names, kmean.bin_edges_):
    for lower_edge, upper_edge in zip(edges[:-1], edges[1:]):
        kmean_cols.append(
            str(source_col) + "_" + str(round(lower_edge, 4)) + "_to_" + str(round(upper_edge, 4))
        )

In [None]:
# Rename the k-means bin columns using the names built in the previous cell
# (the same list is assigned to both the training and the validation frame).
X_train_kmean.columns = X_valid_kmean.columns = kmean_cols
print('number of one-hot columns created:', len(kmean_cols))

# Concatenate the discretized numeric columns with the previously transformed dataset.
# All four indexes are reset first so that the column-wise concat aligns rows by position.
X_train_imp_transform.reset_index(drop=True, inplace=True)
X_valid_imp_transform.reset_index(drop=True, inplace=True)
X_train_kmean.reset_index(drop=True, inplace=True)
X_valid_kmean.reset_index(drop=True, inplace=True)

# Note: this overwrites X_*_imp_transform, so re-running the cell appends the bin columns again.
X_train_imp_transform = pd.concat([X_train_imp_transform, X_train_kmean], axis=1)
X_valid_imp_transform = pd.concat([X_valid_imp_transform, X_valid_kmean], axis=1)

print('X_train_imp_transform:', X_train_imp_transform.shape, ' X_valid_imp_transform:', X_valid_imp_transform.shape)
print('sample rows from X_train_imp_transform:')
display_all(X_train_imp_transform.head())

In [None]:
# Convert all features to float32 to store them efficiently
X_train_imp_transform = X_train_imp_transform.astype(np.float32)
X_valid_imp_transform = X_valid_imp_transform.astype(np.float32)

# Replace NaNs and infs with zeroes (if present)
X_train = pd.DataFrame(np.nan_to_num(X_train_imp_transform, posinf=0, neginf=0), columns=X_train_imp_transform.columns)
X_valid = pd.DataFrame(np.nan_to_num(X_valid_imp_transform, posinf=0, neginf=0), columns=X_valid_imp_transform.columns)

# Print info about the datasets.
# - DataFrame.info() prints itself and returns None, so it is not wrapped in print().
# - 'Presence of Infs' must use any(): all() only reports True if *every* value is infinite.
# - The checks run on the underlying arrays so that each one yields a single boolean.
print('Info of X_train: ')
X_train.info()
print('Presence of NaNs: ', np.isnan(X_train.to_numpy()).any())
print('Presence of Infs: ', np.isinf(X_train.to_numpy()).any())
print('Only Finite values: ', np.isfinite(X_train.to_numpy()).all())
print('-'*60)
print('Info of X_valid: ')
X_valid.info()
print('Presence of NaNs: ', np.isnan(X_valid.to_numpy()).any())
print('Presence of Infs: ', np.isinf(X_valid.to_numpy()).any())
print('Only Finite values: ', np.isfinite(X_valid.to_numpy()).all())

display_all(X_train.head())

### Removing features with low variance

In [None]:
# Remove features with low variance.
# The cut-off p * (1 - p) with p = 0.95 is the variance of a 0/1 feature that has
# the same value in 95% of the rows; features with a variance at or below it are removed.
threshold_n=0.95
sel = VarianceThreshold(threshold=(threshold_n* (1 - threshold_n) ))
# Only the fitted support mask is needed, so fit() replaces fit_transform(),
# whose transformed array was computed and thrown away.
sel.fit(X_train)
sel_var_features = X_train.columns[sel.get_support(indices=True)]
print('Total number of features after removing low variance features:', len(sel_var_features))

# retaining only selected columns
X_train = X_train[sel_var_features]
X_valid = X_valid[sel_var_features]

fs_lowvar = sel_var_features

pd.DataFrame(sel_var_features).to_csv(f"{fs_path}\\03_FeaturesList_after_removing_Low_Variance.csv", index=False, header=['column_names'])

### Selecting features based on Correlation

In [None]:
# Generate the absolute correlation matrix of the remaining training features.
# Note: this re-uses the name `corr_mat`, replacing the klib result computed in the EDA section.
corr_mat = X_train.corr().abs()
corr_mat.head()

In [None]:
# Select the upper triangle of the correlation matrix (without the diagonal), so
# that every pair of features is looked at only once.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
upper_triangle_mask = np.triu(np.ones(corr_mat.shape, dtype=bool), k=1)
upper = corr_mat.where(upper_triangle_mask)

# Features that correlate above 0.8 with at least one feature listed before them
to_drop = [column for column in upper.columns if (upper[column] > 0.8).any()]

# Drop features (reassignment instead of an in-place drop, because X_train and
# X_valid are column selections of other frames at this point)
X_train = X_train.drop(columns=to_drop)
X_valid = X_valid.drop(columns=to_drop)

print("total number of features after removing highly correlated features: ", len(X_train.columns))

In [None]:
# Absolute correlation matrix of the features that are left
new_corr = X_train.corr().abs()

# Heatmap of the correlations, with the colour scale capped at 0.8
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 12))
sns.heatmap(new_corr, vmax=.8, ax=heatmap_ax)

### Feature selection after applying Variable transformations

In [None]:
# Feature selection with Boruta (after variable transformations), using a
# shallow random forest as the underlying estimator.
clf = RandomForestClassifier(n_estimators=50, n_jobs=-1, max_depth=5, random_state=0)

# Compared with the first Boruta run above, this one uses perc=50 instead of perc=75.
boruta_feature_selector = BorutaPy(clf, n_estimators=100, random_state=42, verbose=2, max_iter=50, perc=50)
boruta_feature_selector.fit(X_train.values, y_train.values)

In [None]:
# Create the list of features selected by Boruta.
features = X_train.columns

# Boolean-mask indexing replaces the np.where + np.nditer loop, which raises a
# ValueError when Boruta confirms no feature at all (empty index array).
boruta_features = list(features[boruta_feature_selector.support_])
print('Number of features selected by Boruta: ', len(boruta_features))

In [None]:
# Feature Selection with Univariate Statistical Tests - ANOVA F value
# Up to 100 features are selected. The number is capped at the number of
# available features, because SelectKBest rejects a k larger than that.
n_anova_features = min(100, X_train.shape[1])

m1_selector = SelectKBest(score_func=f_classif, k=n_anova_features)
m1_selector.fit(X_train, y_train)

# Print numpy arrays (e.g. the scores) with 3 decimals
np.set_printoptions(precision=3)

# create list with selected features
m1_cols = m1_selector.get_support(indices=True)
m1_features = list(X_train.columns[m1_cols])
print('Number of features selected by ANOVA:', len(m1_features))

In [None]:
# Table of the selected features with their ANOVA score, best score first
pd.options.display.float_format = '{:.2f}'.format
anova_df = (
    pd.DataFrame(m1_selector.scores_, columns=["ANOVA"], index=X_train.columns)
    .reset_index()
    .sort_values('ANOVA', ascending=False)
    .reset_index(drop=True)
)
anova_df = anova_df[anova_df['index'].isin(m1_features)]

display_all(anova_df.head())

In [None]:
# Feature selection with RFE (recursive feature elimination) around a logistic regression
# please check n_features_to_select: it should be less than the total number of features in the train data

model = LogisticRegression()
rfe = RFE(model, n_features_to_select=100)
fit = rfe.fit(X_train, y_train)
print("Number of features selected by RFE: %s" % (fit.n_features_))

# names of the features RFE kept
feat_names = X_train.columns
m2_selector = fit.get_support()
m2_features = feat_names[m2_selector].tolist()

In [None]:
# Table of the features kept by RFE (rows where the support flag is True)
rfe_support = pd.DataFrame(fit.support_, columns=["RFE"], index=X_train.columns).reset_index()
rfe_df = rfe_support.loc[rfe_support['RFE'] == True].reset_index(drop=True)

display_all(rfe_df.head())

In [None]:
# Feature selection with Extra Trees Classifier.
# A fixed random_state (as for the other estimators in this notebook) makes the
# fitted forest - and therefore the selected features - reproducible.
model = ExtraTreesClassifier(n_estimators=50, random_state=0)
model.fit(X_train, y_train)

In [None]:
# Spread of each feature's importance across the individual trees.
# NOTE(review): despite its name, `feature_imp_normalized` holds the standard
# deviation of the per-tree importances, not a normalised importance - confirm
# that this is the intended selection criterion.
per_tree_importances = [tree.feature_importances_ for tree in model.estimators_]
feature_imp_normalized = np.std(per_tree_importances, axis=0)

# Keep the features whose value lies above the mean over all features
idx = np.arange(X_train.shape[1])  # positional index of every feature
above_mean = feature_imp_normalized > feature_imp_normalized.mean()
m3_selector = idx[above_mean]
print("Number of features selected by Extratrees Classifier: ", len(m3_selector))
feature_names = X_train.columns
m3_features = feature_names[m3_selector].tolist()

In [None]:
# Table of the features selected via the Extra Trees classifier with their
# importance, most important first
vi_df = (
    pd.DataFrame(model.feature_importances_, columns=["Extratrees"], index=X_train.columns)
    .reset_index()
    .sort_values(['Extratrees'], ascending=False)
    .reset_index(drop=True)
)
vi_df = vi_df[vi_df['index'].isin(m3_features)].reset_index(drop=True)

display_all(vi_df.head())

In [None]:
# Final list of features: those chosen by at least three of the four feature
# selection methods (Boruta, ANOVA, RFE, Extra Trees)
min_threshold = 3
combined_features = boruta_features + m1_features + m2_features + m3_features

# number of methods that selected each feature, restricted to the features with enough votes
votes_per_feature = Counter(combined_features)
feat_cnt = Counter({
    feature: votes for feature, votes in votes_per_feature.items() if votes >= min_threshold
})

final_feature_list = list(feat_cnt)
print("Number of features in final list:", len(final_feature_list))

In [None]:
# Save the feature selection results to one Excel workbook, one sheet per method
with pd.ExcelWriter(f'{fs_path}\\04_Feature_Selection_Results.xlsx') as writer:
    feature_selection_sheets = [
        (pd.DataFrame(boruta_features, columns=['Selected Features']), 'Boruta'),
        (anova_df, 'Univariate_Anova'),
        (rfe_df[rfe_df['RFE'] == True], 'RFE'),
        (vi_df, 'ExtratreesClassifier Importance'),
        (pd.DataFrame(final_feature_list, columns=['Selected Features']), 'Final Features List'),
    ]
    for sheet_content, sheet_name in feature_selection_sheets:
        sheet_content.to_excel(writer, sheet_name=sheet_name)

In [None]:
# Restrict the training and validation datasets to the final feature list.
X_train = X_train[final_feature_list]
X_valid = X_valid[final_feature_list]

print('X_train:', X_train.shape, ' X_valid:', X_valid.shape, ' y_train:', y_train.shape, ' y_valid:', y_valid.shape)

In [None]:
# Multicollinearity filter: repeatedly drop the column with the largest variance
# inflation factor (VIF) until no column exceeds the threshold. Threshold=2.5

class ReduceVIF(BaseEstimator, TransformerMixin):
    """Drop columns of a DataFrame whose variance inflation factor is too high.

    Parameters
    ----------
    thresh : float, default 2.5
        A column is dropped while the largest VIF in the frame is greater than
        this value.
    impute : bool, default True
        When true, a ``SimpleImputer`` is created and used to fill NaN values
        before the VIF is computed.
    impute_strategy : str, default 'median'
        Strategy handed to ``SimpleImputer``.

    Notes
    -----
    ``transform`` recomputes the VIF on whatever frame it receives; the set of
    columns kept during ``fit`` is not remembered. Select the surviving columns
    on other datasets explicitly (e.g. ``X_valid[X_train.columns]``).
    """

    def __init__(self, thresh=2.5, impute=True, impute_strategy='median'):
        # Every constructor argument is kept as a same-named attribute so that
        # BaseEstimator.get_params() (and therefore clone() / repr()) works.
        self.thresh = thresh
        self.impute = impute
        self.impute_strategy = impute_strategy

        # The statsmodel function will fail with NaN values, as such we have to impute them.
        # By default we impute using the median value.
        # This imputation could be taken out and added as part of an sklearn Pipeline.
        if impute:
            self.imputer = SimpleImputer(missing_values=np.nan, strategy=impute_strategy)

    def fit(self, X, y=None):
        """Fit the imputer (if one was created) on X and return self."""
        print('ReduceVIF fit')
        if hasattr(self, 'imputer'):
            self.imputer.fit(X)
        return self

    def transform(self, X, y=None):
        """Impute X (if an imputer exists) and return it without high-VIF columns.

        X must be a DataFrame. The returned frame keeps X's row index so that it
        stays aligned with any target series that shares that index.
        """
        print('ReduceVIF transform')
        columns = X.columns.tolist()
        if hasattr(self, 'imputer'):
            # index=X.index: without it the imputed frame would get a fresh
            # RangeIndex and lose its row labels.
            X = pd.DataFrame(self.imputer.transform(X), columns=columns, index=X.index)
        return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=2.5):
        """Return X after iteratively dropping the column with the largest VIF.

        One column is removed per pass, and the VIFs are recomputed afterwards.
        The loop ends when the largest VIF is not greater than ``thresh`` or when
        fewer than two columns remain (nothing is left to be collinear with).
        """
        while X.shape[1] > 1:
            # Materialise the array once per pass instead of once per column.
            values = X.values
            vif = [variance_inflation_factor(values, position) for position in range(values.shape[1])]

            max_vif = max(vif)
            if not max_vif > thresh:
                break

            maxloc = vif.index(max_vif)
            print(f'Dropping {X.columns[maxloc]} with vif={np.round(max_vif,2)}')
            X = X.drop([X.columns.tolist()[maxloc]], axis=1)
        return X

In [None]:
# Remove columns having a high VIF, i.e. columns with high multicollinearity
vif = ReduceVIF()
X_train = vif.fit_transform(X_train)

# keep the validation set on exactly the columns that remain in the training set
X_valid = X_valid[X_train.columns.tolist()]

print('X_train:', X_train.shape, ' X_valid:', X_valid.shape, ' y_train:', y_train.shape, ' y_valid:', y_valid.shape)

In [None]:
# computing VIF for the final list of features
def calculate_vif(X):
    """Return a frame with one row per column of X: its name ("Features") and its VIF ("VIF")."""
    design = X.values
    scores = [variance_inflation_factor(design, position) for position in range(X.shape[1])]
    return pd.DataFrame({"Features": X.columns, "VIF": scores})

vif = calculate_vif(X_train)

In [None]:
# print the final list of features with their VIF & feature importances
# NOTE(review): the classifier is created without a random_state, so the
# importances can differ from run to run.
model = ExtraTreesClassifier(n_estimators=50)
model.fit(X_train, y_train)

# one row per training column with its importance, highest importance first
vi_df = (
    pd.DataFrame(model.feature_importances_, columns=["Importances"], index=X_train.columns)
    .reset_index()
    .sort_values(["Importances"], ascending=False)
)
vi_df.columns = ['Features', 'Importances']

# Merge onto the Features/VIF columns only: re-running this cell would otherwise
# merge onto a frame that already has 'Importances' and produce
# 'Importances_x' / 'Importances_y' columns.
vif = vif[['Features', 'VIF']].merge(vi_df, on='Features')
vif.to_csv(f"{fs_path}\\05_Feature_Importances_with_VIF.csv", index=False)
display_all(vif)