In [15]:
import time
import pandas as pd
import big_o
import missingno as mso
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from category_encoders import BinaryEncoder, HelmertEncoder, WOEEncoder, LeaveOneOutEncoder, JamesSteinEncoder, MEstimateEncoder
from sklearn.feature_extraction import FeatureHasher

In [16]:
# Load the training data.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run on another machine until the path is adjusted.
df = pd.read_csv('E:\\product\\Test_Data\\supervised Data\\train.csv')
# Wall-clock timestamp taken right after loading (not read again in the visible cells).
start_time_ = time.time()
# identify columns which have missing values
def columns_with_missing(data):
    '''
    function that searches for the columns with missing data, computes the
    percentage of missing values in each of them and buckets the column
    names by that percentage
    args:
        data -> our unprocessed dataframe
    returns:
        a tuple of three lists of column names, each ordered from the highest
        to the lowest percentage of missing values:
            remaining_co_l         -> more than 10% and less than 50% missing
            col_greater_than_50per -> 50% or more missing
            col_less_than_10per    -> more than 0% and at most 10% missing
        columns without any missing value appear in none of the lists
    '''
    null_mask = data.isnull()
    # percentage of missing values in each column; multiply before dividing so
    # that round percentages (e.g. 2 of 20 rows -> 10.0) are exact and land on
    # the intended side of the thresholds below
    percentage_miss = ((null_mask.sum() * 100) / null_mask.count()).sort_values(ascending=False)
    # conditions for filtering
    has_missing = percentage_miss > 0
    at_most_10 = percentage_miss <= 10
    at_least_50 = percentage_miss >= 50
    # list of columns with 50% or more missing values
    # (>= so that a column with exactly 50% missing is not left out of every bucket)
    col_greater_than_50per = percentage_miss[at_least_50].index.tolist()
    # list of columns with at most 10% missing values
    col_less_than_10per = percentage_miss[has_missing & at_most_10].index.tolist()
    # remaining columns: more than 10% but less than 50% missing
    remaining_co_l = percentage_miss[has_missing & ~at_most_10 & ~at_least_50].index.tolist()
    return remaining_co_l, col_greater_than_50per, col_less_than_10per

# perform operations on column
def operations(col_greater50, col, col_less10, data):
    '''
    Deletion techniques for missing data, applied to ``data`` in place.

    Only non-object (i.e. non-text) columns are considered: object columns
    listed in ``col_greater50`` are kept and object columns listed in
    ``col_less10`` do not cause rows to be removed.
    args:
        col_greater50 -> columns with 50% or more missing values; the
                         non-object ones are dropped from ``data``
        col           -> unused, kept so that existing callers keep working
        col_less10    -> columns with few missing values; rows that have a
                         missing value in any non-object one are dropped
        data          -> dataframe to clean, modified in place
    returns:
        None
    '''
    # `np.object` was removed in NumPy 1.24; the builtin `object` is the same dtype
    to_drop = [name for name in col_greater50
               if name in data.columns and data[name].dtypes != object]
    # columns with too many missing values are dropped
    data.drop(to_drop, axis=1, inplace=True)
    # delete the rows with missing data in the sparsely incomplete columns
    row_filter = [name for name in col_less10
                  if name in data.columns and data[name].dtypes != object]
    if row_filter:
        data.dropna(how='any', subset=row_filter, inplace=True)

# run both the function 
# bucket the columns of df by their share of missing values, then apply the
# deletion step, which modifies df in place
colums, colums50, columns10 = columns_with_missing(df)
operations(col_greater50=colums50, col=colums, col_less10=columns10, data=df)
# median imputation for LotFrontage (the value printed is the median after filling)
df['LotFrontage'] = df['LotFrontage'].fillna(value=df['LotFrontage'].median())
print(df['LotFrontage'].median())
# mode imputation for FireplaceQu (the value printed is the mode after filling)
df['FireplaceQu'] = df['FireplaceQu'].fillna(value=df['FireplaceQu'].mode()[0])
print(df['FireplaceQu'].mode()[0])

70.0
Gd


# CATEGORICAL VARIABLE ENCODING TECHNIQUES
   ## 1.OneHotEncoding

In [3]:
# using sklearn OneHotEncoder
def sk_onehotencoder(data):
    '''
    One-hot encode the categorical (object dtype) columns of ``data`` with
    sklearn's OneHotEncoder. The 'Id' and 'SalePrice' columns are never encoded.
    args:
        data -> dataframe holding the categorical columns; it is not modified
    returns:
        the encoder output for the categorical columns
    '''
    # Identify the categorical features
    # (`np.object` was removed in NumPy 1.24; the builtin `object` is the same dtype)
    features = [x for x in data.columns if x not in ['Id', 'SalePrice'] and data[x].dtypes == object]

    # fill the NaN values with NONE, column by column, on a copy of the
    # categorical columns so the caller's dataframe is left untouched
    categorical = data[features].fillna('NONE').astype(str)

    # one hot encoder, fitted on and applied to the categorical columns
    ohe = OneHotEncoder()
    train_data = ohe.fit_transform(categorical)
    return train_data
# one-hot encoded categorical columns of df, as returned by sk_onehotencoder
# (a sparse matrix rather than a dataframe)
_df = sk_onehotencoder(data=df)

# using get_dummies function
def pd_getdummies(data):
    '''
    One-hot encode ``data`` through pandas.
    args:
        data -> dataframe to encode
    returns:
        the dataframe produced by pd.get_dummies with its default settings
    '''
    return pd.get_dummies(data)
# one-hot encoded version of df built with pd.get_dummies (a dataframe)
df_GD = pd_getdummies(data=df)

## 2.Dummy Variable encoding

In [4]:
# also using get_dummies but slightest change
def dummyVarencode(data):
    '''
    Dummy-variable encode ``data``: same as one-hot encoding through
    pd.get_dummies, except that the first level of every encoded column
    is dropped (drop_first=True).
    args:
        data -> dataframe to encode
    returns:
        the encoded dataframe
    '''
    encoded = pd.get_dummies(data, drop_first=True)
    return encoded
# dummy-variable encoded version of df (get_dummies with drop_first=True)
df_DVE = dummyVarencode(data=df)

## 3.Binary encoding

In [5]:
# binary encoding is applied on the basis of the number of classes in each column
def binary_encoder(data):
    '''
    Binary-encode the categorical (object dtype) columns of ``data``.
    args:
        data -> dataframe holding the categorical columns
    returns:
        the BinaryEncoder output for the categorical columns only; the other
        columns of ``data`` are not part of the result
    '''
    # extract all features which are object
    # (`np.object` was removed in NumPy 1.24; the builtin `object` is the same dtype)
    categorical_features = [col for col in data.columns if data[col].dtypes == object]
    # using binary encoder
    BE = BinaryEncoder(cols=categorical_features)
    # encode the dataframe that was passed in, not the notebook-level `df`
    _data = BE.fit_transform(data[categorical_features])
    return _data
# binary-encoded categorical columns of df
df_BE = binary_encoder(data=df)
# non-categorical part of df: a new dataframe without the object dtype columns
# (`np.object` was removed in NumPy 1.24; the builtin `object` is the same dtype)
df_BE_2 = df.drop(columns=[col for col in df.columns if df[col].dtypes == object])
# final dataframe: the non-categorical columns followed by the binary-encoded ones
df_BE_Final = pd.concat([df_BE_2, df_BE], axis=1)

  elif pd.api.types.is_categorical(cols):


## 4.Frequency encoding

In [6]:
def frequency_encoder(data):
    '''
    Frequency-encode the categorical (object dtype) columns of ``data``:
    every category is replaced by the share of rows that carry it.
    The number of encoded columns is printed.
    args:
        data -> dataframe to encode; it is not modified
    returns:
        a copy of ``data`` with the categorical columns replaced by frequencies
    '''
    # create a copy of data
    _data = data.copy()
    # find the categorical features
    # (`np.object` was removed in NumPy 1.24; the builtin `object` is the same dtype)
    features = [col for col in data.columns if data[col].dtypes == object]
    print(len(features))
    total_rows = len(_data)
    for feat in features:
        # share of all rows (missing ones included in the total) per category
        freq = _data.groupby(feat).size() / total_rows
        # map these values onto the original column
        _data[feat] = _data[feat].map(freq)
    # return the data
    return _data
# frequency-encoded copy of df (the call also prints the number of encoded columns)
df_FE = frequency_encoder(data=df)

43


## 5.Helmert Encoder

In [7]:
# categorical (object dtype) columns of df, passed to the encoders defined below
# (`np.object` was removed in NumPy 1.24; the builtin `object` is the same dtype)
features = [col for col in df.columns if df[col].dtypes == object]
def helmert_encoder(data,features):
    '''
    Helmert-encode the given columns of ``data``.
    args:
        data     -> dataframe to encode; it is not modified
        features -> names of the categorical columns to encode
    returns:
        a new dataframe: the columns of ``data`` other than ``features``,
        followed by the columns produced by the encoder
    '''
    _data = data.copy()
    # create an instance of helmert encoder
    encoder = HelmertEncoder(cols=features, drop_invariant=True)
    # encode the dataframe that was passed in, not the notebook-level `df`
    dfh = encoder.fit_transform(_data[features])
    # swap the raw categorical columns for their encoded counterparts
    _data = pd.concat([_data.drop(columns=features), dfh], axis=1)
    return _data
# Helmert-encoded version of df for the categorical `features`
df_HE = helmert_encoder(data=df, features=features)

  elif pd.api.types.is_categorical(cols):


## 6. Target Encoding

In [8]:
def target_encoder(data, features, target):
    '''
    Target (mean) encoding: every category of every column in ``features``
    is replaced by the mean of ``target`` over the rows of that category.
    args:
        data     -> dataframe to encode; it is not modified
        features -> names of the columns to encode
        target   -> name of the target column
    returns:
        an encoded copy of ``data``
    '''
    encoded = data.copy()
    for feature in features:
        # per-category mean of the target, looked up for every row
        category_means = encoded.groupby(feature)[target].mean()
        encoded[feature] = encoded[feature].map(category_means)
    return encoded
# target-encoded copy of df: each category replaced by its mean 'SalePrice'
df_TE = target_encoder(data=df, features=features, target = 'SalePrice')

# target encoding using smoothing
def smoothing(data, features, target, weight=100):
    '''
    Target encoding with smoothing: every category is replaced by a blend of
    its own target mean and the global target mean,
        (count * category_mean + weight * global_mean) / (count + weight)
    so that rare categories are pulled towards the global mean.
    args:
        data     -> dataframe to encode; it is not modified
        features -> names of the columns to encode
        target   -> name of the target column
        weight   -> strength of the pull towards the global mean
                    (0 gives the plain category mean); defaults to 100
    returns:
        an encoded copy of ``data``
    raises:
        ValueError if ``weight`` is negative
    '''
    if weight < 0:
        raise ValueError('weight must be non-negative')
    # build copy of data
    _data = data.copy()
    # compute the global mean
    mean = _data[target].mean()
    for col in features:
        # compute the number of values and mean of each group
        agg = _data.groupby(col)[target].agg(['count', 'mean'])
        counts = agg['count']
        means = agg['mean']
        # compute the 'smoothed' mean
        smooth = (counts * means + weight * mean)/(counts + weight)
        # replace each value by its smoothed mean
        _data[col] = _data[col].map(smooth)
    return _data
# smoothed target encoding of the categorical `features` of df against 'SalePrice'
df_SE = smoothing(data=df, features=features, target='SalePrice')

## 7.Weight of evidence encoding

In [14]:
'''
ONLY WORKS ON CLASSIFICATION PROBLEMS, MAINLY WHEN THE TARGET HAS BINARY CLASSES
'''
def WOF_encoder(data, features, target):
    '''
    Weight-of-evidence encode the given columns of ``data`` against ``target``.
    args:
        data     -> dataframe to encode; it is not modified
        features -> names of the categorical columns to encode
        target   -> name of the target column
    returns:
        an encoded copy of ``data``
    '''
    # build copy of data
    _data = data.copy()
    # create an instance of WOEEncoder
    WOE = WOEEncoder(cols=features, regularization=0.5, random_state=100)
    _data[features] = WOE.fit_transform(_data[features], _data[target])
    # hand the encoded copy back, like the other encoders in this notebook do
    return _data


## 8.Leave One Out encoding

In [11]:
def Leave_one_Out_Encoder(data, features, target, sigma=0.2, random_state=None):
    '''
    Leave-one-out encode the given columns of ``data`` against ``target``.
    args:
        data         -> dataframe to encode; it is not modified
        features     -> names of the categorical columns to encode
        target       -> name of the target column
        sigma        -> noise level handed to LeaveOneOutEncoder; defaults to 0.2
        random_state -> seed handed to LeaveOneOutEncoder; pass an int to make
                        the noisy encoding reproducible, None leaves it unseeded
    returns:
        an encoded copy of ``data``
    '''
    # build copy of data
    _data = data.copy()
    # create an instance of leave one out encoder
    LOOE = LeaveOneOutEncoder(cols=features, sigma=sigma, random_state=random_state)
    _data[features] = LOOE.fit_transform(_data[features], _data[target])
    return _data
# leave-one-out encoding of the categorical `features` of df against 'SalePrice'
df_LOOE = Leave_one_Out_Encoder(data=df, features=features, target='SalePrice')

  elif pd.api.types.is_categorical(cols):


## 9.James-Stein encoding

In [17]:
def james_stein_encoder(data, features, target):
    '''
    James-Stein encode the given columns of ``data`` against ``target``.
    args:
        data     -> dataframe to encode; it is not modified
        features -> names of the categorical columns to encode
        target   -> name of the target column
    returns:
        an encoded copy of ``data``; feature columns that the encoder removed
        (drop_invariant=True) are removed from the copy as well
    '''
    # build copy of data
    _data = data.copy()
    # create an instance of james-stein encoder
    js_encode = JamesSteinEncoder(cols=features, drop_invariant=True, sigma= 0.2, random_state=10)
    encoded = js_encode.fit_transform(_data[features], _data[target])
    # drop_invariant=True can leave the encoder output with fewer columns than
    # `features`; assigning it to `_data[features]` would then fail with a
    # length mismatch, so write back only the columns that survived
    kept = [col for col in features if col in encoded.columns]
    removed = [col for col in features if col not in encoded.columns]
    if removed:
        _data = _data.drop(columns=removed)
    if kept:
        _data[kept] = encoded[kept]
    return _data
# James-Stein encoding of the categorical `features` of df against 'SalePrice'
df_JSE = james_stein_encoder(data=df, features=features, target='SalePrice')

  elif pd.api.types.is_categorical(cols):


## 10.M Estimator Encoding

In [19]:
def m_estimator_encoder(data, features, target):
    '''
    M-estimate encode the given columns of ``data`` against ``target``.
    args:
        data     -> dataframe to encode; it is not modified
        features -> names of the categorical columns to encode
        target   -> name of the target column
    returns:
        an encoded copy of ``data``; feature columns that the encoder removed
        (drop_invariant=True) are removed from the copy as well
    '''
    # build copy of data
    _data = data.copy()
    # create an instance of M estimator encoder
    mee = MEstimateEncoder(cols=features, m=2, drop_invariant=True)
    encoded = mee.fit_transform(_data[features], _data[target])
    # drop_invariant=True can leave the encoder output with fewer columns than
    # `features`; assigning it to `_data[features]` would then fail with a
    # length mismatch, so write back only the columns that survived
    kept = [col for col in features if col in encoded.columns]
    removed = [col for col in features if col not in encoded.columns]
    if removed:
        _data = _data.drop(columns=removed)
    if kept:
        _data[kept] = encoded[kept]
    return _data
# M-estimate encoding of the categorical `features` of df against 'SalePrice'
df_MEE = m_estimator_encoder(data=df, features=features, target='SalePrice')

  elif pd.api.types.is_categorical(cols):


More encoding techniques will be added soon.