In [47]:
import statistics
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy.stats as stats
import seaborn as sns
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import NearMiss
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler,OneHotEncoder,OrdinalEncoder,LabelEncoder
warnings.filterwarnings('ignore')
%matplotlib inline

# TRAIN, VALIDATION, TEST

In [None]:
# Shuffle once with a fixed seed so the 60/20/20 split is reproducible on re-run,
# then cut the shuffled frame by position.
# NOTE(review): `df` is not defined in any cell above; it must be loaded before this cell runs.
shuffled = df.sample(frac=1, random_state=0)
train_end, valid_end = int(0.6*len(shuffled)), int(0.8*len(shuffled))
Train,Valid,Test = shuffled.iloc[:train_end], shuffled.iloc[train_end:valid_end], shuffled.iloc[valid_end:]

# DATA MINING

In [3]:
def Data_Mining(df):
    """Split the columns of ``df`` into feature groups by dtype and cardinality.

    Returns
    -------
    tuple of four lists of column names, in column order:
        numerical   - columns whose dtype is not object ('O')
        discrete    - numerical columns with fewer than 25 distinct values
        continuous  - numerical columns that are not discrete
        categorical - columns whose dtype is object ('O')
    """
    numerical, categorical = [], []
    for column in df.columns:
        if df[column].dtype == 'O':
            categorical.append(column)
        else:
            numerical.append(column)

    discrete = [column for column in numerical if df[column].nunique() < 25]
    continuous = [column for column in numerical if column not in discrete]
    return numerical,discrete,continuous,categorical

In [4]:
def Feature_NAN(df):
    """Return, for each feature group of ``Data_Mining(df)``, the columns that contain nulls.

    Returns
    -------
    tuple of four lists: (na_numerical, na_discrete, na_continuous, na_categorical),
    each a subset of the corresponding ``Data_Mining`` group, in the same order.
    """
    def _with_missing(columns):
        # Keep only the columns holding at least one null value.
        return [column for column in columns if df[column].isnull().any()]

    numerical,discrete,continuous,categorical = Data_Mining(df)
    na_numerical = _with_missing(numerical)
    na_discrete = _with_missing(discrete)
    na_continuous = _with_missing(continuous)
    na_categorical = _with_missing(categorical)
    return na_numerical,na_discrete,na_continuous,na_categorical

In [5]:
# numerical,discrete,continuous,categorical = Data_Mining(X)
# na_numerical,na_discrete,na_continuous,na_categorical = Feature_NAN(X)

# 1. MISSING VALUE
1.1. NUMERICAL
+ DISCRETE BEST WAY IS MODE
+ CONTINUOUS BEST WAY IS MEDIAN

1.1.1. MEAN, MODE, MEDIAN IMPUTATION

In [6]:
def Replace_measure_numerical(df,features,impute):
    """Fill nulls in ``features`` with a per-column summary statistic.

    Parameters
    ----------
    df : DataFrame; it is copied, never modified.
    features : iterable of column names to impute.
    impute : 'median' or 'mean'; any other value selects the mode.

    Returns
    -------
    DataFrame copy with the nulls of each listed column filled.
    Columns without any observed value are left untouched.
    """
    data = df.copy()
    for feature in features:
        # Compute the statistic on observed values only: statistics.mode would
        # otherwise count NaN entries and could hand NaN back as the "mode".
        observed = data[feature].dropna()
        if observed.empty:
            continue  # nothing to derive a fill value from
        if impute =='median':
            value = observed.median()
        elif impute =='mean':
            value = observed.mean()
        else:
            value = statistics.mode(observed)
        data[feature] = data[feature].fillna(value)
    return data

1.1.2. RANDOM SAMPLE IMPUTATION

In [7]:
def Replace_randomsample_numerical(df,features):
    """Fill nulls in ``features`` with values drawn at random from the observed ones.

    Parameters
    ----------
    df : DataFrame; it is copied, never modified.
    features : iterable of column names to impute.

    Returns
    -------
    DataFrame copy in which each null is replaced by a sampled observed value
    of the same column (fixed seed ``random_state=0``).
    """
    data = df.copy()
    for feature in features:
        missing = data[feature].isnull()
        n_missing = int(missing.sum())
        observed = data[feature].dropna()
        if n_missing == 0 or observed.empty:
            continue  # nothing to fill, or nothing to sample from
        # Sampling without replacement fails when more values are missing than
        # observed; only fall back to replacement in that case so the usual
        # draw stays unchanged.
        random_sample = observed.sample(n_missing,replace=n_missing > len(observed),random_state=0)
        random_sample.index = data.index[missing]
        data.loc[missing,feature] = random_sample
    return data

1.1.3. END OF DISTRIBUTION IMPUTATION

In [8]:
# for feature in na_continuous:
#     plt.figure(figsize=(10,2))
#     sns.distplot(df_hp[feature])

In [9]:
def Replace_end_numerical(df,features,location):
    """Fill nulls with a value at the tail of each column's distribution.

    The fill value is ``mean - 3*std`` when ``location == 'left'`` and
    ``mean + 3*std`` for any other ``location``. Works on a copy of ``df``.
    """
    data = df.copy()
    for feature in features:
        column = data[feature]
        center = column.mean()
        spread = column.std()
        extreme = center-3*spread if location == 'left' else center+3*spread
        data[feature] = column.fillna(extreme)
    return data

1.1.4. ARBITRARY IMPUTATION (0,-1,99,-100)

In [10]:
def Replace_arbitray_numerical(df,features,value):
    """Return a copy of ``df`` with nulls in ``features`` replaced by ``value``."""
    data = df.copy()
    filled = data[features].fillna(value)
    data[features] = filled
    return data

1.1.5. BEFORE (pad/ffill) OR AFTER(bfill/backfill)

In [11]:
def Replace_befor_after_numerical(df,features,method,limit=2):
    """Fill nulls in ``features`` by propagating neighbouring values.

    Parameters
    ----------
    df : DataFrame; it is copied, never modified.
    features : column name or list of column names to fill.
    method : 'pad' / 'ffill' to carry the previous value forward,
        'bfill' / 'backfill' to carry the next value backward.
    limit : maximum number of consecutive nulls to fill (default 2).

    Raises
    ------
    ValueError if ``method`` is not one of the four names above.
    """
    data = df.copy()
    # Call ffill/bfill directly: the ``method=`` keyword of fillna is deprecated
    # in recent pandas releases.
    if method in ('pad','ffill'):
        data[features] = data[features].ffill(limit=limit)
    elif method in ('bfill','backfill'):
        data[features] = data[features].bfill(limit=limit)
    else:
        raise ValueError("method must be one of 'pad', 'ffill', 'bfill', 'backfill'; got %r" % (method,))
    return data

1.1.6. INTERPOLATE

In [12]:
def Replace_interpolate_numerical(df,features,method,order):
    """Return a copy of ``df`` with nulls in ``features`` filled by interpolation.

    ``method`` and ``order`` are forwarded unchanged to pandas ``interpolate``.
    """
    data = df.copy()
    interpolated = data[features].interpolate(method=method,order=order)
    data[features] = interpolated
    return data

In [13]:
# df_feature_engineer = Replace_measure_numerical(df,na_numerical,'median')
# df_feature_engineer = Replace_randomsample_numerical(df,na_numerical)
# df_feature_engineer = Replace_end_numerical(df,na_continuous,'right')
# df_feature_engineer = Replace_arbitray_numerical(df,na_numerical,-99)
# df_feature_engineer = Replace_befor_after_numerical(df,na_numerical,'bfill')
# df_feature_engineer = Replace_interpolate_numerical(df,na_numerical,'polynomial',5)

1.2. CATEGORICAL (less nan values: Mode)

1.2.1. MISSING

In [14]:
def Replace_missing_cat(df,features,value):
    """Return a copy of ``df`` with nulls in the categorical ``features`` set to ``value``."""
    data = df.copy()
    filled = data[features].fillna(value)
    data[features] = filled
    return data

1.2.2. FREQUENT CATEGORICAL (LESS NAN VALUES)

In [15]:
def Replace_frequent_cat(df,features):
    """Fill nulls in ``features`` with each column's most frequent value.

    Parameters
    ----------
    df : DataFrame; it is copied, never modified.
    features : iterable of column names to impute.

    Returns
    -------
    DataFrame copy. Columns with no observed value are left untouched.
    """
    data = df.copy()
    for feature in features:
        counts = data[feature].value_counts()
        if counts.empty:
            continue  # all values are null: there is no "most frequent" label
        frequent = counts.index[0]
        data[feature] = data[feature].fillna(frequent)
    return data

In [16]:
# df_feature_engineer = Replace_missing_cat(df,na_categorical,'Missing')
# df_feature_engineer = Replace_frequent_cat(df,na_categorical)

1.2.3. TRAIN/TEST 

1.2.4. UNSUPERVISED ML

In [18]:
def Replace_kmeans_cat(df,feature):
    """Replace ``feature`` with KMeans cluster ids, predicting ids for the null rows.

    A KMeans model with one cluster per distinct observed value of ``feature``
    is fitted on the rows where ``feature`` is present. Those rows receive the
    fitted cluster labels, and rows where ``feature`` is null receive the
    predicted cluster.

    NOTE(review): as in the original code, the observed categories themselves
    are overwritten by cluster ids - confirm this is the intended encoding.

    Parameters
    ----------
    df : DataFrame; it is copied, never modified. The predictor columns are
        handed to KMeans as-is (presumably numeric - verify against caller).
    feature : name of the column to impute.
    """
    data = df.copy()
    observed = data[feature].notnull()
    missing = ~observed
    # Use the same predictor columns for fitting and predicting: every column
    # other than ``feature`` that is completely observed. Selecting rows by
    # ``observed`` keeps the number of labels equal to the number of rows they
    # are assigned to.
    predictors = [column for column in data.columns
                  if column != feature and data[column].notnull().all()]
    train_cluster = data.loc[observed,predictors]
    kmeans = KMeans(n_clusters = data[feature].nunique()).fit(train_cluster)
    data.loc[observed,feature] = kmeans.labels_
    if missing.any():
        # predict() rejects an empty input, so only call it when needed.
        test_cluster = data.loc[missing,predictors]
        data.loc[missing,feature] = kmeans.predict(test_cluster)
    return data

In [19]:
# X_train = Replace_kmeans_cat(X_train,'KJ')

---

# 2. ENCODING
+ CHANGE CATEGORICAL VARIABLES LESS THAN 1% EQUAL 'RARE VAR'
+ FOUND WHICH FEATURE IS HIGH CARDINALITY

In [20]:
def Rare_Var(df,features):
    """Collapse infrequent labels of ``features`` into the single label 'Rare Var'.

    A label is kept only when it covers more than 1% of all rows of ``df``;
    every other entry (including nulls, which match no kept label) becomes
    'Rare Var'. Works on a copy of ``df``.
    """
    data = df.copy()
    n_rows = len(data)
    for feature in features:
        share = data[feature].value_counts()/n_rows
        frequent_labels = share[share>0.01].index
        keep = data[feature].isin(frequent_labels)
        data[feature] = np.where(keep,data[feature],'Rare Var')
    return data

In [21]:
def Cardinality(df,categorical):
    """Group the ``categorical`` columns by their number of distinct values.

    Returns
    -------
    (high_cardinality, normal_cardinality, low_cardinality):
        high   - more than 15 distinct values
        normal - more than 5 and at most 15 distinct values
        low    - at most 5 distinct values
    """
    high_cardinality, normal_cardinality, low_cardinality = [], [], []
    for feature in categorical:
        n_levels = df[feature].nunique()
        if n_levels > 15:
            high_cardinality.append(feature)
        elif n_levels > 5:
            normal_cardinality.append(feature)
        else:
            low_cardinality.append(feature)
    return high_cardinality,normal_cardinality,low_cardinality

In [22]:
# df_feature_engineer = Rare_Var(X,categorical)
# high_cardinality,normal_cardinality,low_cardinality = Cardinality(X,categorical)

2.1. Nominal

2.1.1. HOT ENCODING

In [23]:
def One_hot_encoding(df,features):
    """One-hot encode ``features``, dropping the first (sorted) category of each.

    Each listed column is replaced by float indicator columns named
    ``<feature>_<category>``, appended after the remaining columns.

    Parameters
    ----------
    df : DataFrame; it is copied, never modified.
    features : iterable of column names to encode.

    Returns
    -------
    DataFrame copy with the same index as ``df``.
    """
    data = df.copy()
    for feature in features:
        # get_dummies keeps the index of ``data``, so the indicator columns stay
        # aligned row by row even when the index is not 0..n-1 (e.g. after a
        # shuffle or a train/test split). It also builds the column names itself,
        # which works for non-string categories.
        dummies = pd.get_dummies(data[feature],prefix=feature,drop_first=True,dtype=float)
        data = pd.concat([data.drop(columns=feature),dummies],axis=1)
    return data

2.1.2. TOP 10

In [24]:
def TOP_encoding(df,features,top):
    """Add 0/1 indicator columns for the ``top`` most frequent labels of each feature.

    Each indicator is named ``<feature>_<label>``; the original feature column
    is removed afterwards. Works on a copy of ``df``.
    """
    data = df.copy()
    for feature in features:
        frequent_labels = list(data[feature].value_counts().head(top).index)
        for label in frequent_labels:
            is_label = data[feature]==label
            data[feature+'_'+str(label)] = np.where(is_label,1,0)
        data = data.drop(columns=feature)
    return data

2.1.3. COUNT/FREQUENT CATEGORICAL ENCODING

In [25]:
def Count_encoding(df,features):
    """Replace each feature by ``<feature>_count``: how often the row's label occurs.

    Works on a copy of ``df``; the original feature columns are removed.
    """
    data = df.copy()
    for feature in features:
        occurrences = data[feature].value_counts()
        data[feature+'_'+'count'] = data[feature].map(occurrences)
        data = data.drop(columns=feature)
    return data

2.1.4. MEAN ENCODING (TARGET: BINARY)

In [26]:
def Mean_encoding(df,features,target):
    """Replace each feature by ``<feature>_mean_target``: the mean of ``target`` per label.

    ``target`` is the name of a column of ``df``. Works on a copy of ``df``;
    the original feature columns are removed.
    """
    data = df.copy()
    for feature in features:
        target_mean = data.groupby([feature])[target].mean()
        data[feature+'_'+'mean_target'] = data[feature].map(target_mean)
        data = data.drop(columns=feature)
    return data

2.1.5. PROBABILITY RATIO ENCODING

In [27]:
def Probability_ratio_encoding(df,features,target):
    """Replace each feature by ``<feature>_prob_ratio`` = p / (1 - p).

    ``p`` is the mean of the ``target`` column within each label of the
    feature. A label whose mean target is exactly 1 divides by zero and
    therefore maps to ``inf``. Works on a copy of ``df``; the original feature
    columns are removed.
    """
    data = df.copy()
    for feature in features:
        prob = data.groupby([feature])[target].mean()
        odds = prob/(1-prob)
        data[feature+'_'+'prob_ratio'] = data[feature].map(odds)
        data = data.drop(columns=feature)
    return data

In [28]:
# df_feature_engineer = One_hot_encoding(X,low_cardinality)
# df_feature_engineer = TOP_encoding(X,high_cardinality,top=10)
# df_feature_engineer = Count_encoding(X,normal_cardinality)
# df_feature_engineer = Mean_encoding(X,categorical,y.name)
# df_feature_engineer = Probability_ratio_encoding(X,categorical,y.name)

2.2. ORDINAL

2.2.1. LABEL ENCODING
+ from sklearn.preprocessing import LabelEncoder: this transformer should be used to encode TARGET VALUES, i.e. y, and not the INPUT X.

In [29]:
def Label_encoding_target(target):
    """Encode the target labels as integers with sklearn's ``LabelEncoder``.

    Parameters
    ----------
    target : 1-D collection of labels (or a single-column frame).

    Returns
    -------
    The integer codes produced by ``LabelEncoder.fit_transform``.
    """
    label_encoder = LabelEncoder()
    # LabelEncoder expects a 1-D array; flatten instead of wrapping the target
    # in a 2-D DataFrame.
    target = label_encoder.fit_transform(np.ravel(target))
    return target

In [30]:
def Label_encoding_sklearn(X_train, X_test):
    """Ordinal-encode the features with an encoder fitted on the training set only.

    The encoder is fitted on ``X_train`` and then applied to both sets, so the
    test set is encoded with the categories learned from the training data.
    Requires ``OrdinalEncoder`` from ``sklearn.preprocessing`` to be imported in
    the notebook's import cell.

    Returns
    -------
    (X_train, X_test) as returned by the encoder's transform.
    """
    ordinal_encoder = OrdinalEncoder()
    X_train = ordinal_encoder.fit_transform(X_train)
    X_test = ordinal_encoder.transform(X_test)
    return X_train,X_test

In [31]:
def Label_encoding(df,features):
    """Replace each feature by ``<feature>_label``: the 1-based rank of its sorted label.

    Labels are sorted in ascending order and numbered from 1. Works on a copy
    of ``df``; the original feature columns are removed.
    """
    data = df.copy()
    for feature in features:
        sorted_labels = data[feature].value_counts().index.sort_values()
        codes = dict(zip(sorted_labels,range(1,len(sorted_labels)+1)))
        data[feature+'_'+'label'] = data[feature].map(codes)
        data = data.drop(columns=feature)
    return data

2.2.2. TARGET GUIDED ORDINAL ENCODING

In [32]:
def Target_guided_encoding(df,features,target):
    """Replace each feature by ``<feature>_target_ordinal``.

    Labels are ranked from 1 upward by ascending mean of the ``target`` column,
    so the label with the lowest mean target gets 1. Works on a copy of ``df``;
    the original feature columns are removed.
    """
    data = df.copy()
    for feature in features:
        mean_by_label = data.groupby([feature])[target].mean()
        ranked_labels = mean_by_label.sort_values().index
        ranks = dict(zip(ranked_labels,range(1,len(ranked_labels)+1)))
        data[feature+'_'+'target_ordinal'] = data[feature].map(ranks)
        data = data.drop(columns=feature)
    return data

2.2.3. COMBINE COLUMNS (relationship nonlinear between feature and feature)

2.2.4. BINS (AGE)

In [33]:
def Bins_label(df,features,edges=(10,20,30,40,50,60)):
    """Add a ``<feature>_bins`` column holding the bin number of each value.

    A value falls in bin ``i`` when it is below ``edges[i]`` and not below any
    earlier edge; values below no edge (including nulls) get ``len(edges)``.
    With the default edges this gives bins 0..5 for <10, <20, ... <60 and 6
    otherwise.

    Parameters
    ----------
    df : DataFrame; it is copied, never modified.
    features : iterable of numeric column names to bin.
    edges : ascending upper bounds of the bins (default: decades up to 60).

    Returns
    -------
    DataFrame copy with one extra ``<feature>_bins`` column per feature
    (``'Age_bins'`` for a feature named ``'Age'``).
    """
    data = df.copy()
    for feature in features:
        labels = data[feature]
        bins = np.select([labels<edge for edge in edges],list(range(len(edges))),len(edges))
        # Assign the plain array (positional) rather than a new DataFrame, which
        # would be re-aligned on the index and yield NaN for any non 0..n-1
        # index. Name the column after the feature so several features do not
        # overwrite each other.
        data[feature+'_'+'bins'] = bins
    return data

In [34]:
# y = Label_encoding_target(y)
# X_train,X_test = Label_encoding_sklearn(X_train, X_test)
# df_feature_engineer = Label_encoding(X,low_cardinality)
# df_feature_engineer = Target_guided_encoding(X,categorical,y.name)
# df_feature_engineer = Bins_label(X,categorical)

# 3. IMBALANCED
3.1. Under Sampling
+ Lấy số lượng thiểu số tổng thể chia tỷ lệ rồi làm tròn xuống thì bằng số lượng mẫu đa số (Tỷ lệ cân bằng phải cao hơn ban đầu)
+ Số lượng mẫu lớn sau under phải cao hơn n_neighbors (mặc định bằng 3: có thể thay đổi)
+ Các feature đều là biến số
3.2. Over Sampling
+ Không cần tất cả là number
+ Lấy số lượng đa số tổng thể nhân tỉ lệ rồi làm tròn xuống thì ra số lượng mẫu thiểu số (mẫu thiểu số lấy từ tổng thể thiểu số đã có)
+ Tỷ lệ cân bằng phải cao hơn ban đầu
3.3. SMOTETomek
+ Cần tất cả là number
+ Tổng thể đa số giảm, tổng thể thiểu số giảm và tăng thêm điểm hoàn toàn mới (nằm trong vùng tổng thể thiểu số)
+ Số lượng mẫu đa số nhân tỷ lệ rồi làm tròn ra số lượng mẫu thiểu số
+ Tỷ lệ % đa số cũ giảm xuống tỷ lệ % đa số mới bao nhiêu thì tỷ lệ % thiểu số cũ tăng lên tỷ lệ % thiểu số mới là bấy nhiêu sao cho số lượng đa số mới nhân tỷ lệ rồi làm tròn ra số lượng thiểu số mới.

3 biến phân loại đầu ra

In [35]:
# strategy = {0:5699, 1:5699, 2:5699}
# oversample = SMOTETomek(sampling_strategy=strategy)
# X, y = oversample.fit_resample(X_feature_engineer,y_feature_engineer)
# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)
# Train_scale,test_scale = Scale(StandardScaler(),X_train,X_test)

In [36]:
def Imbalanced(X,y,ratio,method):
    """Resample ``X, y`` to reduce class imbalance.

    Parameters
    ----------
    X, y : features and target passed to the sampler's ``fit_resample``.
    ratio : value forwarded as the sampler's ``sampling_strategy``.
    method : 'undersampling' uses ``NearMiss``, 'oversampling' uses
        ``RandomOverSampler``, any other value uses ``SMOTETomek``.

    Returns
    -------
    (X_balanced, y_balanced) as returned by ``fit_resample``.
    """
    # Pass the strategy by keyword: relying on the position of the first
    # constructor argument breaks on imbalanced-learn releases where the
    # sampler parameters are keyword-only.
    if method == 'undersampling':
        sampler = NearMiss(sampling_strategy=ratio)
    elif method == 'oversampling':
        sampler = RandomOverSampler(sampling_strategy=ratio)
    else:
        sampler = SMOTETomek(sampling_strategy=ratio)
    X_balanced,y_balanced = sampler.fit_resample(X,y)
    return X_balanced,y_balanced

In [37]:
# X_train,y_train = Imbalanced(X,y,0.95,'SMOTETomek')

# 4. Treating Outliers
+ Using scatter plots
+ Box plot
+ Using z-score: nếu dữ liệu có phân phối chuẩn
+ Using IQR (Interquartile Range): nếu không có phân phối chuẩn
# IQR & ZSCORE:
   + Trimming outliers
   + Capping outliers

4.1. IQR

In [38]:
def IQR_method(df,features,method):
    """Treat outliers of ``features`` using the 1.5*IQR whisker rule.

    Parameters
    ----------
    df : DataFrame; it is copied, never modified.
    features : iterable of numeric column names, processed in order.
    method : 'Capping' clips values to the whiskers; any other value drops the
        rows lying outside the whiskers. When rows are dropped, the whiskers of
        later features are computed on the rows that remain.

    Returns
    -------
    DataFrame copy with a fresh 0..n-1 index and the same columns as ``df``.
    """
    data = df.copy()
    for feature in features:
        points = data[feature]
        q1 = points.quantile(0.25)
        q3 = points.quantile(0.75)
        iqr = q3 - q1
        upper_whisker = q3 + (1.5*iqr)
        lower_whisker = q1 - (1.5*iqr)
        if method == 'Capping':
            data[feature] = np.where(points>upper_whisker, upper_whisker,np.where(points<lower_whisker, lower_whisker, points))
        else:
            outliers = points[(points > upper_whisker)|(points<lower_whisker)].index
            data = data.drop(index=outliers.unique())
    # drop=True, as in Zscore_method: otherwise the old index is inserted as an
    # extra 'index' column in the result.
    data = data.reset_index(drop=True)
    return data

4.2. Zscore

In [39]:
def Zscore_method(df,features,thresh,method):
    """Treat outliers of ``features`` lying beyond ``mean +/- thresh*std``.

    ``method == 'Capping'`` clips values to the two bounds; any other value
    drops the rows outside the bounds (bounds of later features are then
    computed on the remaining rows). Works on a copy of ``df`` and returns it
    with a fresh 0..n-1 index.
    """
    data = df.copy()
    for feature in features:
        points = data[feature]
        center = points.mean()
        spread = points.std()
        upper_bound = center + thresh*spread
        lower_bound = center - thresh*spread
        too_high = points>upper_bound
        too_low = points<lower_bound
        if method == 'Capping':
            capped_low = np.where(too_low,lower_bound, points)
            data[feature] = np.where(too_high,upper_bound,capped_low)
        else:
            outlier_labels = points.index[too_high|too_low].unique()
            data = data.drop(labels=outlier_labels)
    data = data.reset_index(drop=True)
    return data

In [40]:
# df_feature_engineer = IQR_method(X,discrete,'Triming')
# df_feature_engineer = Zscore_method(X,continuous,thresh = 3,'Triming')

# 5. SCALING

5.1. Check Skewness: Q-Q plot

In [41]:
def QQ_plot(df,features):
    """Draw one three-panel figure per feature to inspect its distribution.

    Panels, left to right: distribution plot (legend shows the skewness rounded
    to 2 decimals), normal probability (Q-Q) plot, and box plot.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    features : iterable of column names of ``df``.

    Returns nothing; the figures are created as a side effect.
    """
    for feature in features:
        fig,axs = plt.subplots(1,3,figsize=(20,6))
        # Global matplotlib/seaborn settings; they are re-applied on every
        # iteration and stay in effect after the function returns.
        plt.rcParams['font.family']='Arial'
        plt.rcParams['font.size']=13
        sns.set_style('dark')
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the installed version.
        sns.distplot(df[feature], label="skew: " + str(np.round(df[feature].skew(),2)),ax = axs[0])
        axs[0].legend()
        sns.boxplot(df[feature],ax=axs[2])
        axs[2].set_title('Box Plot')
        # Q-Q plot against the normal distribution, drawn on the middle axis.
        stats.probplot(df[feature],dist='norm',plot=axs[1])
        axs[0].set_title('Distribution')
        fig.tight_layout()

In [42]:
def Distribution(df):
    """Split the columns of ``df`` by absolute skewness (rounded to 2 decimals).

    Returns
    -------
    (normal_feature, skew_feature): columns with |skew| < 0.5 and columns with
    |skew| >= 0.5. A column whose skewness is NaN appears in neither list.
    """
    scored = [(feature, np.abs(np.round(df[feature].skew(),2))) for feature in df.columns]
    normal_feature = [feature for feature, skewness in scored if skewness < 0.5]
    skew_feature = [feature for feature, skewness in scored if skewness >= 0.5]
    return normal_feature,skew_feature

In [43]:
# QQ_plot(X,continuous)
# normal_feature, skew_feature = Distribution(X[numerical])

5.2. Not Gaussian distribution
+ If you want to check whether a feature is Gaussian (normally) distributed: Q-Q plot
+ If the feature contains 0, use np.log1p (it adds 1 to every point before taking the log)
+ BoxCox Transformation
+ The Box-Cox transformation is defined as: $T(Y) = \frac{Y^{\lambda} - 1}{\lambda}$ for $\lambda \neq 0$, and $T(Y) = \ln(Y)$ for $\lambda = 0$

+ where Y is the response variable and λ is the transformation parameter. λ varies from -5 to 5. In the transformation, all values of λ are considered and the optimal value for a given variable is selected.

In [44]:
def Gaussion_transformation(types,features,df):
    """Apply a variance-stabilising transformation to ``features``.

    Parameters
    ----------
    types : name of the transformation:
        'Logarithmic' - ``log1p`` when the column contains 0, ``log`` otherwise
        'Reciprocal'  - ``1 / x``, only when the column contains no 0
        'SquareRoot'  - ``sqrt(x)``, only when the column has no negative value
        'Exponential' - ``x ** 2``
        'BoxCox'      - ``scipy.stats.boxcox`` (the fitted lambda is discarded)
        Any other name, or a column failing the condition above, is left as is.
    features : iterable of column names to transform.
    df : DataFrame; it is copied, never modified.

    Returns
    -------
    DataFrame copy with the transformed columns.
    """
    data = df.copy()
    for feature in features:
        values = data[feature]
        has_zero = 0 in values.unique()
        if types == 'Logarithmic':
            data[feature] = np.log1p(values) if has_zero else np.log(values)
        elif types == 'Reciprocal':
            if not has_zero:
                data[feature] = np.divide(1,values)
        elif types == 'SquareRoot':
            # Reduce to a single boolean: comparing the array of unique values
            # with 0 yields an array, whose truth value is ambiguous and raises.
            if values.min() >= 0:
                data[feature] = np.sqrt(values)
        elif types == 'Exponential':
            data[feature] = np.power(values,2)
        elif types == 'BoxCox':
            transformed,_fitted_lambda = stats.boxcox(values)
            data[feature] = transformed
    return data

5.3. Scaling

In [45]:
def Scale(types,train,test):
    """Fit the scaler ``types`` on ``train`` and apply it to both sets.

    Parameters
    ----------
    types : scaler instance exposing ``fit_transform`` and ``transform``.
    train, test : DataFrames to scale.

    Returns
    -------
    (df_train, df_test): new DataFrames carrying the original column names and
    a default 0..n-1 index.
    """
    scaler = types
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)
    df_train = pd.DataFrame(train_scaled,columns=train.columns)
    df_test = pd.DataFrame(test_scaled,columns=test.columns)
    return df_train,df_test

In [46]:
# df_feature_engineer = Gaussion_transformation('Logarithmic',skew_feature,X)
# X_train_scale,X_test_scale = Scale(StandardScaler(),X,X_test)

In [None]:
# def scale_dataset(df,oversample=False):
#     x = df[df.cols[:-1]].values
#     y = df[df.cols[-1]].values
#     scaler = StandardScaler()
#     X = scaler.fit_transform(X)
#     if oversample:
#         ros = RandomOverSampler()
#         X,y = ros.fit_resample(X,y)
#     data np.hstack((X,np.reshape(y,(-1,1))))
#     return data,X,y
# train, X_train, y_train = scale_dataset(train,oversample=True)
# valid, X_valid, y_valid = scale_dataset(valied,oversample=True)
# test, X_test, y_test = scale_dataset(test,oversample=False)