[Reference](https://towardsdatascience.com/simple-yet-effective-data-preprocessing-toolbox-416f551a12c)

# Binning

In [1]:
def fixed_width_cut(df,feature,labels=['Low','Medium','High']):
  """Bin ``df[feature]`` into ``len(labels)`` equal-width intervals.

  Returns a ``(binned, edges)`` tuple: ``binned`` is the ``pd.cut`` result
  carrying one entry of ``labels`` per interval, and ``edges`` is the list
  of bin edges, each formatted as a string with two decimal places.
  """
  bin_count = len(labels)
  binned, raw_edges = pd.cut(
    df[feature], bin_count, labels=labels, retbins=True
  )
  # The edges are handed back as display strings, not as numbers.
  edge_texts = ['%.2f' % edge for edge in raw_edges]
  return binned, edge_texts

def quartile_cut(df,feature,labels=['Low','Medium','High']):
  """Bin ``df[feature]`` into ``len(labels)`` quantile-based intervals.

  The number of quantiles follows ``len(labels)``, so the default three
  labels produce terciles rather than quartiles.

  Returns a ``(binned, edges)`` tuple: ``binned`` is the ``pd.qcut`` result
  carrying one entry of ``labels`` per interval, and ``edges`` is the list
  of bin edges, each formatted as a string with two decimal places.
  """
  quantile_count = len(labels)
  binned, raw_edges = pd.qcut(
    df[feature], q=quantile_count, labels=labels, retbins=True
  )
  # The edges are handed back as display strings, not as numbers.
  edge_texts = ['%.2f' % edge for edge in raw_edges]
  return binned, edge_texts

# Impute Null Values

In [2]:
def impute_plot(df,features, strategy='median'):
  """Fill missing values in ``df[features]`` using a ``SimpleImputer``.

  ``strategy`` is forwarded unchanged to ``SimpleImputer`` (``'median'`` by
  default). Returns whatever ``fit_transform`` yields for the selected
  columns. Despite the function name, nothing is plotted here.
  """
  imputer = SimpleImputer(strategy=strategy)
  selected = df[features]
  return imputer.fit_transform(selected)

# Remove Outliers

In [3]:
import scipy.stats as stats

def remove_outlier_using_z_score (df, column, threshold=3):
  """Return the rows of ``df`` whose ``column`` value is not an outlier.

  A row is kept when the absolute z-score of its ``column`` value is
  strictly below ``threshold``.

  Args:
    df: DataFrame to filter.
    column: Name of the numeric column the z-scores are computed on.
    threshold: Absolute z-score cutoff; defaults to 3.

  Returns:
    The filtered DataFrame. Rows whose ``column`` value is missing are
    dropped as well, because their z-score is NaN and never passes the
    comparison.
  """
  # nan_policy='omit' computes mean/std over the non-missing values only.
  # With the default policy a single NaN turns every z-score into NaN and
  # the function would return an empty frame.
  z_scores = stats.zscore(df[column], nan_policy='omit')
  keep_mask = np.abs(z_scores) < threshold
  return df[keep_mask]

def compare_plot(df_list,x,y,subtitle,figsize=(25,10)):
  """Draw one seaborn box plot per DataFrame, stacked in a single figure.

  Args:
    df_list: Sequence of DataFrames; each gets its own subplot row.
    x: Passed to ``sns.boxplot`` as ``x``.
    y: Passed to ``sns.boxplot`` as ``y``.
    subtitle: Text used as the figure-level title.
    figsize: Figure size forwarded to ``plt.subplots``.
  """
  # squeeze=False always returns a 2-D array of Axes. With the default, a
  # single-row figure returns a bare Axes object and indexing it fails, so
  # a one-element df_list used to raise.
  fig, axes = plt.subplots(nrows=len(df_list), figsize=figsize, squeeze=False)
  fig.suptitle(subtitle, fontsize=16)
  for frame, ax in zip(df_list, axes[:, 0]):
    sns.boxplot(x=x, y=y, data=frame, ax=ax)

# Normalization (Feature Scaling)

In [4]:
import sklearn.preprocessing as preproc

def scale_feature(df,features,strategy='minmax'):
  """Scale ``df[features]`` with the chosen strategy and return the result.

  Args:
    df: Source DataFrame.
    features: Column selection passed to ``df[...]``.
    strategy: One of
      ``'minmax'``   -- ``preproc.minmax_scale``;
      ``'standard'`` -- ``preproc.StandardScaler().fit_transform``;
      ``'l2'``       -- ``preproc.normalize(..., axis=0)``, i.e. the norm
                        is taken per feature column, not per row.

  Returns:
    The scaled values as returned by the scikit-learn call.

  Raises:
    ValueError: If ``strategy`` is not one of the supported names.
  """
  selected = df[features] if strategy in ('minmax', 'standard', 'l2') else None
  if strategy == 'minmax':
    return preproc.minmax_scale(selected)
  if strategy == 'standard':
    return preproc.StandardScaler().fit_transform(selected)
  if strategy == 'l2':
    return preproc.normalize(selected, axis=0)
  # Previously an unknown strategy fell through to `return scale` and
  # surfaced as an UnboundLocalError; fail with an explicit message instead.
  raise ValueError(
    "Unknown scaling strategy %r; expected 'minmax', 'standard' or 'l2'"
    % (strategy,)
  )

# Log Transform

In [5]:
def log_transform_feature(df,feature):
  """Apply ``log10(x + 1)`` to ``df[feature]`` once and then a second time.

  Returns a ``(once, twice)`` tuple where ``once = log10(df[feature] + 1)``
  and ``twice = log10(once + 1)``.
  """
  def shifted_log10(values):
    # The +1 shift maps an input of 0 to an output of 0.
    return np.log10(values + 1)

  once = shifted_log10(df[feature])
  twice = shifted_log10(once)
  return once, twice

# One Hot Encoding

In [6]:
from sklearn.preprocessing import OneHotEncoder

def create_onehot_encoder_df (df,feature):
  """One-hot encode ``df[feature]`` and append the indicator columns.

  Rows where ``feature`` is null are dropped before encoding.

  Args:
    df: Source DataFrame.
    feature: Name of the single column to encode.

  Returns:
    The non-null rows of ``df`` merged on index with one indicator column
    per category. Column names come from the encoder's feature-name method
    (``x0_<category>`` when the encoder is fitted on a plain array).
  """
  encoder = OneHotEncoder(handle_unknown='ignore')
  feature_df = df[~df[feature].isnull()]
  # Fit on a plain array rather than a DataFrame so the generated names keep
  # the positional "x0_" prefix that get_feature_names() used to produce.
  values = feature_df[[feature]].to_numpy()
  encoded = encoder.fit_transform(values).toarray()

  # get_feature_names() was removed in scikit-learn 1.2; only fall back to
  # it on releases that predate get_feature_names_out().
  if hasattr(encoder, 'get_feature_names_out'):
    feature_lists = encoder.get_feature_names_out()
  else:
    feature_lists = encoder.get_feature_names()

  # Reuse feature_df's index. A fresh 0..n-1 index would not line up with
  # the original labels once null rows are dropped (or when df has a
  # non-default index), so the index merge would mispair or lose rows.
  feature_encode = pd.DataFrame(
    encoded, columns=feature_lists, index=feature_df.index
  )
  return pd.merge(feature_df, feature_encode, left_index=True, right_index=True)