In [None]:
import itertools
import warnings
from calendar import monthrange 

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')
pd.set_option('display.max.columns', 100)
pd.set_option('display.width', 1000) 
pd.set_option ('display.max_columns' , 1000)
pd.set_option("display.max_rows", 1000)

In [2]:
%config InlineBackend.figure_format = 'retina'

In [3]:
def make_log_data (x_data, elim_columns = [], sum_num_to_data = 0):
    """Return a copy of ``x_data`` with non-zero float64 values replaced by their log10.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Input frame; it is copied and never modified.
    elim_columns : list of str, optional
        Column names that must be left untouched.
    sum_num_to_data : number, optional
        Not used by the implementation; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Copy of ``x_data`` in which every ``float64`` column outside
        ``elim_columns`` has its non-zero entries passed through ``np.log10``.
        Exact zeros stay zero. Columns of any other dtype are returned as is.
    """
    data = x_data.copy()
    for col in data.columns:
        if col in elim_columns or data[col].dtype != 'float64':
            continue
        values = data[col].to_numpy(copy=True)
        # Rows are selected by position rather than by index label, so a frame
        # with repeated index labels never sends a zero into log10.
        nonzero = values != 0
        values[nonzero] = np.log10(values[nonzero])
        data[col] = values
    return data
        

In [4]:
def drow_data (x_data, y_data):
    """Embed ``x_data`` in two dimensions with t-SNE and draw it as a scatter plot.

    Parameters
    ----------
    x_data : array-like
        Feature matrix handed to ``TSNE.fit_transform``.
    y_data : array-like
        Values used to colour the points; the colour map is built with two
        levels, so a two-class target is expected.

    Returns
    -------
    None
        The figure is drawn on a new matplotlib figure of size 12x10.
    """
    embedding = TSNE(random_state=17).fit_transform(x_data)

    plt.figure(figsize=(12,10))
    # plt.get_cmap is used instead of plt.cm.get_cmap: the latter was removed
    # from matplotlib.cm in matplotlib 3.9, while pyplot.get_cmap is available
    # in both old and new releases.
    two_level_cmap = plt.get_cmap('nipy_spectral', 2)
    plt.scatter(embedding[:, 0], embedding[:, 1], c=y_data,
                edgecolor='none', alpha=0.7, s=40,
                cmap=two_level_cmap)
    plt.colorbar()
    plt.title('MNIST. t-SNE projection');

In [5]:
def mean_target_enc1(train_df, y_train, valid_df, skf):
    """Mean-target encode the object-dtype columns of a train and a validation frame.

    For the training frame the encoding is computed out-of-fold: each row
    receives the target mean of its category measured on the other folds.
    The validation frame is encoded with category means computed on the whole
    training frame. Categories without a known mean get the global target mean.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training features.
    y_train : array-like
        Target values, matched to the rows of ``train_df`` by position.
    valid_df : pandas.DataFrame
        Frame to encode with the full-train statistics; it is not modified.
    skf : object
        Splitter exposing ``split(X, y)`` that yields positional
        ``(train_idx, valid_idx)`` pairs.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``train_df`` with an added ``'y'`` column and one
        ``'<col>_mean_target'`` column per object column (the raw object
        columns are kept), and a copy of ``valid_df`` with the same
        ``'<col>_mean_target'`` columns added.
    """
    # The target is attached by position: building the Series on train_df's own
    # index avoids label misalignment when y_train is an array or carries a
    # different index than train_df.
    target = pd.Series(np.asarray(y_train), index=train_df.index, name='y')
    glob_mean = target.mean()
    train_df = pd.concat([train_df, target], axis=1)
    new_train_df = train_df.copy()
    # Work on a copy so the caller's validation frame is left untouched.
    valid_df = valid_df.copy()

    cat_features = train_df.columns[train_df.dtypes == 'object'].tolist()

    for col in cat_features:
        new_train_df[col + '_mean_target'] = np.nan

    for train_idx, valid_idx in skf.split(train_df, y_train):
        fold_train = train_df.iloc[train_idx, :]
        fold_valid = train_df.iloc[valid_idx, :]

        for col in cat_features:
            fold_means = fold_valid[col].map(fold_train.groupby(col)['y'].mean())
            col_pos = new_train_df.columns.get_loc(col + '_mean_target')
            # Written straight into the result frame by position instead of
            # assigning a column on a slice of train_df.
            new_train_df.iloc[valid_idx, col_pos] = fold_means.fillna(glob_mean).to_numpy()

    for col in cat_features:
        full_means = valid_df[col].map(train_df.groupby(col)['y'].mean())
        valid_df[col + '_mean_target'] = full_means.fillna(glob_mean)

    return new_train_df, valid_df

In [6]:
def mean_target_enc_train(train_df, y_train, skf):
    """Replace the object-dtype columns of ``train_df`` by out-of-fold target means.

    Each row receives, for every object column, the target mean of its
    category measured on the folds that do not contain the row. Categories
    without a known mean get the global target mean.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training features; it is not modified.
    y_train : array-like
        Target values, matched to the rows of ``train_df`` by position.
    skf : object
        Splitter exposing ``split(X, y)`` that yields positional
        ``(train_idx, valid_idx)`` pairs.

    Returns
    -------
    pandas.DataFrame
        The non-object columns of ``train_df`` followed by one
        ``'<col>_mean_target'`` column per object column. The raw object
        columns and the temporary ``'y'`` column are dropped.
    """
    # The target is attached by position: building the Series on train_df's own
    # index avoids label misalignment when y_train is an array or carries a
    # different index than train_df.
    target = pd.Series(np.asarray(y_train), index=train_df.index, name='y')
    glob_mean = target.mean()
    train_df = pd.concat([train_df, target], axis=1)
    new_train_df = train_df.copy()

    cat_features = train_df.columns[train_df.dtypes == 'object'].tolist()

    for col in cat_features:
        new_train_df[col + '_mean_target'] = np.nan

    for train_idx, valid_idx in skf.split(train_df, y_train):
        fold_train = train_df.iloc[train_idx, :]
        fold_valid = train_df.iloc[valid_idx, :]

        for col in cat_features:
            fold_means = fold_valid[col].map(fold_train.groupby(col)['y'].mean())
            col_pos = new_train_df.columns.get_loc(col + '_mean_target')
            # Written straight into the result frame by position instead of
            # assigning a column on a slice of train_df.
            new_train_df.iloc[valid_idx, col_pos] = fold_means.fillna(glob_mean).to_numpy()

    return new_train_df.drop(columns=cat_features + ["y"])

In [7]:
def determine_bad_features (Model, x_data, y_data, n_splits_on_CV):
    """Measure how the cross-validated ROC AUC changes when each feature is dropped.

    Parameters
    ----------
    Model : estimator
        Estimator passed to ``cross_val_score``.
    x_data : pandas.DataFrame
        Feature frame; one column is removed at a time.
    y_data : array-like
        Target passed to ``cross_val_score``.
    n_splits_on_CV : int
        Number of folds of the ``StratifiedKFold`` splitter.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``'Feature'``, ``'CV_Diff'``
        (mean score without the feature minus mean score with all features),
        ``'CV_Diff %'``, ``'Diff Std of CV'`` and ``'Diff Std of CV %'``.
        Rows are sorted by ``'CV_Diff'`` ascending and indexed 1..n, so the
        last row is the feature whose removal raises the score the most.
    """
    # No random_state: shuffling is off, so a seed has no effect, and current
    # scikit-learn raises ValueError when random_state is set with shuffle=False.
    cv = StratifiedKFold(n_splits=n_splits_on_CV)
    features = x_data.columns

    baseline = cross_val_score(Model, x_data, y_data,
                               scoring='roc_auc', n_jobs=-1, cv=cv)
    baseline_mean = baseline.mean()
    baseline_std = baseline.std()

    differences = []
    percent_dif = []
    std = []
    percent_std = []
    for feature in features:
        reduced = cross_val_score(Model, x_data.drop(columns=[feature]), y_data,
                                  scoring='roc_auc', n_jobs=-1, cv=cv)
        reduced_mean = reduced.mean()
        reduced_std = reduced.std()
        differences.append(reduced_mean - baseline_mean)
        percent_dif.append((reduced_mean / baseline_mean - 1) * 100)
        std.append(reduced_std - baseline_std)
        percent_std.append((reduced_std / baseline_std - 1) * 100)

    df = pd.DataFrame({'Feature': pd.Series(features), 'CV_Diff': pd.Series(differences),
                       'CV_Diff %': pd.Series(percent_dif),
                       'Diff Std of CV': pd.Series(std), 'Diff Std of CV %': pd.Series(percent_std)})
    sorted_df = df.sort_values(by='CV_Diff', ascending=True)
    return sorted_df.set_index(np.arange(1, len(features) + 1))

In [8]:
def sort_fetures (Model , x_data, y_data, n_splits_on_CV):
    """Greedily drop features whose removal improves the cross-validated score.

    On every round ``determine_bad_features`` ranks the remaining features;
    the feature whose removal raises the mean score the most is dropped as
    long as that gain is strictly positive.

    Parameters
    ----------
    Model : estimator
        Estimator forwarded to ``determine_bad_features``.
    x_data : pandas.DataFrame
        Feature frame; the caller's object is not modified.
    y_data : array-like
        Target forwarded to ``determine_bad_features``.
    n_splits_on_CV : int
        Number of cross-validation folds.

    Returns
    -------
    (pandas.Index, tuple)
        The retained column labels, and the pair
        ``('Excepted_features', [dropped feature names])`` with the dropped
        names listed in their original column order.
    """
    features = x_data.columns
    for _ in range(len(features)):
        # With a single column left there is nothing that can be removed.
        if x_data.shape[1] <= 1:
            break
        impact_of_all_features = determine_bad_features(Model, x_data, y_data, n_splits_on_CV)
        # The ranking is sorted ascending by 'CV_Diff' and labelled 1..n, so
        # the strongest removal candidate is the last row; it is read by
        # position to avoid the off-by-one of a label lookup at n - 1.
        candidate = impact_of_all_features.iloc[-1]
        if candidate['CV_Diff'] > 0:
            x_data = x_data.drop(columns=[candidate['Feature']])
        else:
            break
    Excepted_features = [name for name in features if name not in x_data.columns]
    return x_data.columns , ('Excepted_features', Excepted_features)

In [9]:
def fill_nan(table):
    """Fill missing values of int64/float64 columns with the column mean.

    The frame is modified in place and the same object is returned; columns
    of any other dtype are left as they are.
    """
    for name in table.columns:
        kind = table[name].dtype
        if not (kind == 'int64' or kind == 'float64'):
            continue
        column_mean = table[name].mean()
        table[name] = table[name].fillna(column_mean)
    return table

In [10]:
def onehotencoding_data (data, encoding_more = True, limit =  4):
    """Encode object-dtype columns as integer codes or one-hot indicators.

    * Object columns with exactly two distinct values are mapped to 0/1; the
      order follows ``value_counts()``, so the most frequent value becomes 0.
    * When ``encoding_more`` is true, object columns with more than two and at
      most ``limit`` distinct values are expanded with ``pd.get_dummies``.
    * Every other column is passed through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, the caller's object is not modified.
    encoding_more : bool, optional
        Whether to one-hot encode the low-cardinality object columns.
    limit : int, optional
        Largest number of distinct values that is still one-hot encoded.

    Returns
    -------
    pandas.DataFrame
        With ``encoding_more`` true: the passed-through and binary-coded
        columns in their original order, followed by the dummy columns.
        Otherwise: the copy of ``data`` with only the binary columns recoded.
    """
    # Work on a copy so re-running the cell on the same frame stays idempotent.
    data = data.copy()
    cat_features = []
    float_int_features = []
    for col in data.columns:
        is_object = data[col].dtype == 'object'
        distinct = data[col].nunique() if is_object else None
        if is_object and distinct == 2:
            codes = {label: code for code, label in enumerate(data[col].value_counts().index)}
            data[col] = data[col].map(codes)
            float_int_features.append(col)
        elif encoding_more and is_object and 2 < distinct <= limit:
            cat_features.append(col)
        else:
            float_int_features.append(col)
    if not encoding_more:
        return data
    one_hot_data = pd.get_dummies(data[cat_features])
    return pd.concat([data[float_int_features], one_hot_data], axis = 1)

In [11]:
def comply_good_df (Model, x_data , y_data, cv, show_elim_features = False):
    """Forward feature selection driven by cross-validated ROC AUC.

    Starting from an empty frame, each round tries every remaining feature,
    and the one giving the highest mean score is added if it strictly beats
    the best score reached so far. Selection stops at the first round that
    brings no improvement.

    Parameters
    ----------
    Model : estimator
        Estimator passed to ``cross_val_score``.
    x_data : pandas.DataFrame
        Candidate features.
    y_data : array-like
        Target passed to ``cross_val_score``.
    cv : int
        Number of folds of the ``StratifiedKFold`` splitter.
    show_elim_features : bool, optional
        When true, the names of the features that were not selected are
        returned as well.

    Returns
    -------
    pandas.DataFrame or (pandas.DataFrame, list)
        The selected columns in the order they were added, optionally
        followed by the unselected feature names in original column order.
    """
    # No random_state: shuffling is off, so a seed has no effect, and current
    # scikit-learn raises ValueError when random_state is set with shuffle=False.
    Inner_skf = StratifiedKFold(n_splits=cv)
    features = x_data.columns.tolist()
    complied_df = pd.DataFrame()
    best_gen_score = 0
    for _ in range(x_data.shape[1]):
        # Each round has to beat the best score so far; tracking the winner
        # per round keeps a stale feature name from an earlier round from
        # ever being reused.
        round_score, round_feature = best_gen_score, None
        for feature in features:
            temp_df = complied_df.copy()
            temp_df[feature] = x_data[feature]
            feature_scores = cross_val_score(Model, temp_df, y_data, cv = Inner_skf,
                                             scoring= 'roc_auc', n_jobs= -1)
            mean_score_with_feature = feature_scores.mean()
            if mean_score_with_feature > round_score:
                round_score = mean_score_with_feature
                round_feature = feature
        if round_feature is None:
            break
        complied_df[round_feature] = x_data[round_feature]
        features.remove(round_feature)
        best_gen_score = round_score
    if show_elim_features:
        elim_features = [name for name in x_data.columns if name not in complied_df.columns]
        return complied_df, elim_features
    return complied_df

In [12]:
def box_plot_all_num_features_by_class (x_data, y_data, figsize = (20, 30)):
    """Draw one box plot per feature, grouped by the target, on a three-column grid.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Features to plot; one subplot per column.
    y_data : pandas.Series or pandas.DataFrame
        Target concatenated to ``x_data``; the grouping column is looked up
        under the name ``'Target'``, so ``y_data`` has to provide a column
        with that name.
    figsize : tuple, optional
        Figure size forwarded to ``plt.subplots``.

    Returns
    -------
    None
    """
    inner_df = pd.concat([x_data, y_data], axis = 1)
    num_features = x_data.shape[1]
    ncols = 3
    # Ceiling division: just enough rows, without a spare empty row when the
    # number of features is a multiple of ncols.
    nrows = max(1, -(-num_features // ncols))
    # squeeze=False keeps `axes` two-dimensional even for a single row, so
    # frames with fewer than three features can be plotted as well.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, squeeze=False)
    flat_axes = axes.ravel()
    for ax, feat in zip(flat_axes, x_data.columns):
        sns.boxplot(x = 'Target', y = feat, data = inner_df, ax = ax)
        ax.set_xlabel('')
        ax.set_ylabel(feat)
    # Hide the cells of the grid that received no feature.
    for ax in flat_axes[num_features:]:
        ax.set_visible(False)
    fig.tight_layout();

In [13]:
def tree_features (x_data, y_data, deep = 3,add_features = False, use_each_column = False, deep_each_col = 1):
    """Build binary ``'<column> <= <threshold>'`` features from decision-tree splits.

    A ``DecisionTreeClassifier`` is fitted either on all columns at once or,
    with ``use_each_column``, on every column separately; each split of the
    fitted tree(s) becomes a 0/1 column telling whether the value is at or
    below the split threshold.

    Note: a later cell of this notebook defines ``tree_features`` again, so on
    a top-to-bottom run that later definition is the one in effect.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Source features; the caller's frame is not modified.
    y_data : array-like
        Target used to fit the tree(s).
    deep : int, optional
        ``max_depth`` of the tree fitted on all columns.
    add_features : bool, optional
        When true the new columns are appended to a copy of ``x_data``;
        otherwise only the new columns are returned.
    use_each_column : bool, optional
        Fit one tree per column instead of a single tree on all columns.
    deep_each_col : int, optional
        ``max_depth`` of the per-column trees.

    Returns
    -------
    pandas.DataFrame
    """
    # Copy first so add_features=True never writes into the caller's frame.
    x_data = x_data.copy()
    List_features = x_data.columns.tolist()
    Tree_features = pd.DataFrame()

    # Collect (column, threshold) pairs of the internal nodes. Leaves are
    # recognised through tree_.feature == -2, so a genuine split threshold
    # equal to -2.0 is not mistaken for a leaf.
    splits = []
    if use_each_column:
        Tree = DecisionTreeClassifier(max_depth= deep_each_col, random_state= 17)
        for feature in List_features:
            Tree.fit(np.array(x_data[feature]).reshape(-1, 1), y_data)
            node_features = Tree.tree_.feature.tolist()
            node_thresholds = Tree.tree_.threshold.tolist()
            for Thresh, node_feature in zip(node_thresholds, node_features):
                if node_feature != -2:
                    splits.append((feature, Thresh))
    else:
        Tree = DecisionTreeClassifier(max_depth= deep, random_state= 17)
        Tree.fit(x_data, y_data)
        node_features = Tree.tree_.feature.tolist()
        node_thresholds = Tree.tree_.threshold.tolist()
        for Thresh, node_feature in zip(node_thresholds, node_features):
            if node_feature != -2:
                splits.append((List_features[node_feature], Thresh))

    destination = x_data if add_features else Tree_features
    for Col, Thresh in splits:
        destination[Col + ' ' + '<=' + ' ' + str(Thresh)] = (x_data[Col] <= Thresh).astype(int)
    return destination

In [14]:
def add_model_output (Model, x_data, y_data, num_cross_val = 5, add_feature = True):
    """Compute out-of-fold positive-class probabilities of ``Model``.

    The rows are split with a shuffled ``StratifiedKFold`` (seed 17); for each
    fold the model is fitted on the remaining rows and
    ``predict_proba(...)[:, 1]`` is stored for the held-out rows.

    Parameters
    ----------
    Model : estimator
        Estimator exposing ``fit`` and ``predict_proba``. It is refitted on
        every fold, so it is left fitted on the last training split.
    x_data : pandas.DataFrame
        Features; the caller's frame is not modified.
    y_data : array-like
        Target, matched to the rows of ``x_data`` by position.
    num_cross_val : int, optional
        Number of folds.
    add_feature : bool, optional
        When true, return a copy of ``x_data`` with the predictions appended
        in a column named after the model class (``str(Model)`` up to the
        first ``'('``); otherwise return the predictions alone.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The Series is indexed like ``x_data``.
    """
    x_data = x_data.copy()
    Inner_SKF = StratifiedKFold(n_splits=  num_cross_val, shuffle= True, random_state= 17)

    def rows_of_target(positions):
        # The splitter yields positions, not labels, so the target is always
        # indexed positionally regardless of the index it carries.
        if hasattr(y_data, 'iloc'):
            return y_data.iloc[positions]
        return np.asarray(y_data)[positions]

    out_of_fold = np.zeros(x_data.shape[0])
    for train_df_id , apply_df_id in Inner_SKF.split(x_data, y_data):
        Model.fit(x_data.iloc[train_df_id, :], rows_of_target(train_df_id))
        out_of_fold[apply_df_id] = Model.predict_proba(x_data.iloc[apply_df_id, :])[:, 1]

    # Built on x_data's own index so that attaching it to x_data (here or in
    # the caller) lines up row by row even with a non-default index.
    Model_outputs = pd.Series(out_of_fold, index=x_data.index)
    if add_feature:
        x_data[str(Model).split('(')[0]] = Model_outputs
        return x_data
    return Model_outputs


In [15]:
def optimize_df_model_feature (Model, x_data, y_data, num_cross_val = 5, replace_output = True):
    """Repeatedly feed the model's out-of-fold output back in as a feature.

    Each round adds the output of ``add_model_output`` to the frame and
    re-scores the model with cross-validated ROC AUC. With ``replace_output``
    the column named after the model class is overwritten every round;
    otherwise a new column with the round number appended is added. The loop
    runs while the mean score does not drop below the previous round's score.

    Returns the frame as it was right before the last column was written.
    """
    x_data = x_data.copy()
    inner_skf = StratifiedKFold(n_splits= num_cross_val, shuffle= True, random_state= 17)
    current_score = 0
    last_score = 0
    round_no = 0
    while current_score >= last_score:
        oof_prediction = add_model_output(Model,
                                          x_data,
                                          y_data,
                                          num_cross_val = num_cross_val,
                                          add_feature = False)
        # Snapshot taken before this round's column is written.
        previous_df = x_data.copy()
        column_name = str(Model).split('(')[0]
        if not replace_output:
            column_name = column_name + str(round_no)
            round_no += 1
        x_data[column_name] = oof_prediction
        fold_scores = cross_val_score(Model, x_data, y_data,
                                      cv = inner_skf, scoring = 'roc_auc')
        last_score = current_score
        current_score = fold_scores.mean()
    return previous_df

In [16]:
def tree_features (x_data, y_data, deep = 3,add_features = False, use_each_column = False, deep_each_col = 1,
                   each_col_features = ('Avg ° C', 'Millimetres', 'SUN Total Hours', 'SUN Clear Days', 'Humidity State')):
    """Build binary ``'<column> <= <threshold>'`` features from regression-tree splits.

    A ``DecisionTreeRegressor`` is fitted either on all columns at once or,
    with ``use_each_column``, separately on each column listed in
    ``each_col_features``; each split of the fitted tree(s) becomes a 0/1
    column telling whether the value is at or below the split threshold.

    Note: an earlier cell of this notebook defines a function with the same
    name; on a top-to-bottom run this definition replaces it.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Source features; the caller's frame is not modified.
    y_data : array-like
        Target used to fit the tree(s).
    deep : int, optional
        ``max_depth`` of the tree fitted on all columns.
    add_features : bool, optional
        When true the new columns are appended to a copy of ``x_data``;
        otherwise only the new columns are returned.
    use_each_column : bool, optional
        Fit one tree per column instead of a single tree on all columns.
    deep_each_col : int, optional
        ``max_depth`` of the per-column trees.
    each_col_features : iterable of str or None, optional
        Columns used when ``use_each_column`` is true. The default is the
        column list this function previously had hard-coded; pass ``None`` to
        use every column of ``x_data``.

    Returns
    -------
    pandas.DataFrame
    """
    x_data = x_data.copy()
    List_features = x_data.columns.tolist()
    Tree_features = pd.DataFrame()

    # Collect (column, threshold) pairs of the internal nodes. Leaves are
    # recognised through tree_.feature == -2, so a genuine split threshold
    # equal to -2.0 is not mistaken for a leaf.
    splits = []
    if use_each_column:
        Tree = DecisionTreeRegressor(max_depth= deep_each_col, random_state= 17)
        single_columns = List_features if each_col_features is None else list(each_col_features)
        for feature in single_columns:
            Tree.fit(np.array(x_data[feature]).reshape(-1, 1), y_data)
            node_features = Tree.tree_.feature.tolist()
            node_thresholds = Tree.tree_.threshold.tolist()
            for Thresh, node_feature in zip(node_thresholds, node_features):
                if node_feature != -2:
                    splits.append((feature, Thresh))
    else:
        Tree = DecisionTreeRegressor(max_depth= deep, random_state= 17)
        Tree.fit(x_data, y_data)
        node_features = Tree.tree_.feature.tolist()
        node_thresholds = Tree.tree_.threshold.tolist()
        for Thresh, node_feature in zip(node_thresholds, node_features):
            if node_feature != -2:
                splits.append((List_features[node_feature], Thresh))

    destination = x_data if add_features else Tree_features
    for Col, Thresh in splits:
        destination[Col + ' ' + '<=' + ' ' + str(Thresh)] = (x_data[Col] <= Thresh).astype(int)
    return destination

In [17]:
def tree_features_on_cv (Model, x_data, y_data, scoring = 'roc_auc', num_cv = 5, 
                         shuffle = True, deep_tree = 1, use_each_column = False, deep_each_col = 1):
    """Cross-validate ``Model`` on data enriched with tree-split features.

    For every fold the tree features are derived from the training part only
    (via ``tree_features``), the model is fitted on the enriched training
    part, and the held-out part is transformed with
    ``apply_tree_features_to_test`` before being scored.

    Parameters
    ----------
    Model : estimator
        Estimator exposing ``fit`` and ``predict`` (``'r2'``) or
        ``predict_proba`` (``'roc_auc'``).
    x_data : pandas.DataFrame
        Features; the caller's frame is not modified.
    y_data : array-like
        Target, matched to the rows of ``x_data`` by position.
    scoring : {'roc_auc', 'r2'}, optional
        Metric computed on each held-out fold.
    num_cv : int, optional
        Number of ``StratifiedKFold`` folds.
    shuffle : bool, optional
        Whether the splitter shuffles (seed 17 when it does).
    deep_tree, use_each_column, deep_each_col
        Forwarded to ``tree_features`` as ``deep``, ``use_each_column`` and
        ``deep_each_col``.

    Returns
    -------
    numpy.ndarray
        One score per fold.

    Raises
    ------
    ValueError
        If ``scoring`` is not one of the supported metrics.
    """
    if scoring not in ('r2', 'roc_auc'):
        raise ValueError("scoring must be 'roc_auc' or 'r2', got {!r}".format(scoring))
    x_data = x_data.copy()
    # random_state may only be given together with shuffle=True; current
    # scikit-learn raises ValueError otherwise.
    Inner_SKF = StratifiedKFold(n_splits=num_cv, shuffle=shuffle,
                                random_state=17 if shuffle else None)

    def rows_of_target(positions):
        # The splitter yields positions, so the target is indexed positionally.
        if hasattr(y_data, 'iloc'):
            return y_data.iloc[positions]
        return np.asarray(y_data)[positions]

    scores = []
    for train_df_id, apply_df_id in Inner_SKF.split(x_data, y_data):
        train_df_x = x_data.iloc[train_df_id, :]
        train_df_y = rows_of_target(train_df_id)
        apply_df_x = x_data.iloc[apply_df_id, :]
        apply_df_y = rows_of_target(apply_df_id)
        # Names of the generated columns; computed before the enriched frame
        # so the result does not depend on whether tree_features copies its input.
        Features = (tree_features(train_df_x,
                                  train_df_y,
                                  use_each_column= use_each_column,
                                  deep = deep_tree,
                                  deep_each_col= deep_each_col,
                                  add_features = False)).columns
        train_df_x_tree = tree_features(train_df_x,
                                        train_df_y,
                                        use_each_column= use_each_column,
                                        deep_each_col= deep_each_col,
                                        deep= deep_tree,
                                        add_features=True)
        Model.fit(train_df_x_tree, train_df_y)
        # NOTE(review): apply_tree_features_to_test is not defined in the cells
        # shown here; it has to be defined elsewhere before this function runs.
        apply_df_x_treee = apply_tree_features_to_test(Features, apply_df_x)
        # Both metrics take (y_true, y_pred/y_score) in that order.
        if scoring == 'r2':
            Y_pred = Model.predict(apply_df_x_treee)
            scores.append(r2_score(apply_df_y, Y_pred))
        else:
            # roc_auc_score needs the positive-class score, i.e. column 1 of
            # the probability matrix, not the whole matrix.
            Y_pred = Model.predict_proba(apply_df_x_treee)[:, 1]
            scores.append(roc_auc_score(apply_df_y, Y_pred))
    return np.array(scores)

In [18]:
def model_out_put_on_cv (Model, x_data, y_data, num_cv = 5, num_cv_model = 5,  shuffle = True, scoring = 'roc_auc'):
    """Cross-validate ``Model`` when its own output is added as a feature.

    For every outer fold:

    1. the training part is extended with the out-of-fold model output from
       ``add_model_output`` and scored with an inner ``cross_val_score``;
    2. the model is fitted on the plain training part and its prediction for
       the test part is stored as the same extra feature;
    3. the model is refitted on the extended training part and scored on the
       extended test part.

    Parameters
    ----------
    Model : estimator
        Estimator exposing ``fit`` and ``predict_proba`` (and ``predict`` for
        ``'r2'``).
    x_data : pandas.DataFrame
        Features; the caller's frame is not modified.
    y_data : array-like
        Target, matched to the rows of ``x_data`` by position.
    num_cv : int, optional
        Number of outer (and inner scoring) ``StratifiedKFold`` folds.
    num_cv_model : int, optional
        Number of folds used by ``add_model_output``.
    shuffle : bool, optional
        Whether the splitter shuffles (seed 17 when it does).
    scoring : {'roc_auc', 'r2'}, optional
        Metric for both the inner and the outer evaluation.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Outer-fold scores and the mean inner cross-validation score per fold.

    Raises
    ------
    ValueError
        If ``scoring`` is not one of the supported metrics.
    """
    if scoring not in ('r2', 'roc_auc'):
        raise ValueError("scoring must be 'roc_auc' or 'r2', got {!r}".format(scoring))
    x_data = x_data.copy()
    # random_state may only be given together with shuffle=True; current
    # scikit-learn raises ValueError otherwise.
    Inner_SKF = StratifiedKFold(n_splits=num_cv, shuffle=shuffle,
                                random_state=17 if shuffle else None)
    # add_model_output names its column after the model class; the test part
    # must carry the feature under the same name.
    output_column = str(Model).split('(')[0]
    target_values = np.asarray(y_data)
    target_name = getattr(y_data, 'name', None)
    scores = []
    mean_inner_scores = []
    for train_id, test_id in Inner_SKF.split(x_data, y_data):
        # Fresh 0..n-1 indexes on every part, with the target taken by position.
        x_train = x_data.iloc[train_id, :].reset_index(drop=True)
        y_train = pd.Series(target_values[train_id], name=target_name)
        x_test = x_data.iloc[test_id, :].reset_index(drop=True)
        y_test = pd.Series(target_values[test_id], name=target_name)

        # NOTE(review): add_model_output fills the feature from predict_proba
        # even when scoring == 'r2' - confirm this is intended for regressors.
        x_train_model_out_put = add_model_output(Model, x_train,  y_train,
                                                 num_cross_val= num_cv_model, add_feature= True)
        Inner_scores = cross_val_score(Model,
                                       x_train_model_out_put,
                                       y_train,
                                       n_jobs= -1,
                                       cv= Inner_SKF,
                                       scoring=scoring)
        mean_inner_scores.append(Inner_scores.mean())

        Model.fit(x_train ,y_train)
        if scoring == 'roc_auc':
            # Positive-class probability only; the full matrix has two columns
            # and cannot be stored in a single feature column.
            x_test[output_column] = Model.predict_proba(x_test)[:, 1]
        else:
            x_test[output_column] = Model.predict(x_test)

        Model.fit(x_train_model_out_put, y_train)
        # Same columns, same order as the frame the model was fitted on.
        x_test = x_test[x_train_model_out_put.columns]
        # Both metrics take (y_true, y_pred/y_score) in that order.
        if scoring == 'roc_auc':
            Predicted_val = Model.predict_proba(x_test)[:, 1]
            scores.append(roc_auc_score(y_test, Predicted_val))
        else:
            Predicted_val = Model.predict(x_test)
            scores.append(r2_score(y_test, Predicted_val))
    return np.array(scores) , np.array(mean_inner_scores)

In [19]:
def power_fetures (Model, x_data, y_data, thresh, power_ , num_cv = 5, add_features = False, scoring = 'roc_auc'):
    """Greedily add integer powers of numeric columns that improve the CV score.

    For every exponent from 2 to ``power_`` the powered versions of the
    eligible columns are tried one at a time; the best one is kept whenever it
    strictly improves the best mean cross-validation score so far.

    Parameters
    ----------
    Model : estimator
        Estimator passed to ``cross_val_score``.
    x_data : pandas.DataFrame
        Features; the caller's frame is not modified.
    y_data : array-like
        Target passed to ``cross_val_score``.
    thresh : int
        An ``int64`` column is eligible only if it has more than ``thresh``
        distinct values; ``float64`` columns are always eligible.
    power_ : int
        Largest exponent to try.
    num_cv : int, optional
        Number of ``StratifiedKFold`` folds.
    add_features : bool, optional
        When true return the original columns plus the accepted ones;
        otherwise return only the accepted ones.
    scoring : str, optional
        Metric used for the baseline and for every candidate, so the two are
        comparable.

    Returns
    -------
    pandas.DataFrame
        Accepted columns are named ``'<column> in <power> power'`` and appear
        in the order they were accepted.
    """
    # No random_state: shuffling is off, so a seed has no effect, and current
    # scikit-learn raises ValueError when random_state is set with shuffle=False.
    Inner_SKF = StratifiedKFold(n_splits=num_cv)
    x_data = x_data.copy()
    init_x_data = x_data.copy()

    def mean_cv_score(frame):
        return cross_val_score(Model, frame, y_data, n_jobs= -1,
                               cv = Inner_SKF, scoring = scoring).mean()

    base_score = mean_cv_score(init_x_data)

    int_float_col = []
    for col in x_data.columns:
        kind = x_data[col].dtype
        unique_values = len(set(x_data[col]))
        # NOTE(review): the grouping below mirrors how Python parsed the
        # original unparenthesised test ('and' binds tighter than 'or'), so
        # thresh only restricts int64 columns - confirm that is intended.
        if kind == 'float64' or (kind == 'int64' and unique_values > thresh):
            int_float_col.append(col)

    for power in np.arange(2, power_ + 1):
        x_data_pow = init_x_data[int_float_col] ** power
        x_data_pow.columns = [name + ' ' + 'in' + ' ' + str(power) + ' ' + 'power'
                              for name in x_data_pow.columns]
        power_features = x_data_pow.columns.tolist()
        for _ in range(len(int_float_col)):
            # A candidate has to beat the best score reached so far.
            max_score, best_feature = base_score, None
            for col in power_features:
                Inner_x_data = x_data.copy()
                Inner_x_data[col] = x_data_pow[col]
                score = mean_cv_score(Inner_x_data)
                if score > max_score:
                    max_score = score
                    best_feature = col
            if best_feature is None:
                break
            base_score = max_score
            power_features.remove(best_feature)
            x_data[best_feature] = x_data_pow[best_feature]

    if add_features:
        return x_data
    # List (not set) difference keeps the column order deterministic.
    accepted = [name for name in x_data.columns if name not in init_x_data.columns]
    return x_data.loc[:, accepted]

In [20]:
def mult_fetures (Model, x_data, y_data, thresh, num_cv = 5, max_features = False, mult_comb = 5, add_features = False,
                  scoring = 'roc_auc'):
    """Greedily add products of numeric columns that improve the CV score.

    For every combination size from 2 to ``mult_comb`` the row-wise products
    of all column combinations are tried one at a time; the best one is kept
    whenever it strictly improves the best mean cross-validation score so far.

    Parameters
    ----------
    Model : estimator
        Estimator passed to ``cross_val_score``.
    x_data : pandas.DataFrame
        Features; the caller's frame is not modified.
    y_data : array-like
        Target passed to ``cross_val_score``.
    thresh : int
        An ``int64`` column is eligible only if it has more than ``thresh``
        distinct values; ``float64`` columns are always eligible.
    num_cv : int, optional
        Number of ``StratifiedKFold`` folds.
    max_features : bool, optional
        When true, ``mult_comb`` is replaced by the number of eligible columns.
    mult_comb : int, optional
        Largest number of columns multiplied together.
    add_features : bool, optional
        When true return the original columns plus the accepted ones;
        otherwise return only the accepted ones.
    scoring : str, optional
        Metric used for the baseline and for every candidate, so the two are
        comparable.

    Returns
    -------
    pandas.DataFrame
        Accepted columns are named ``'PRODUCT OF (<col>, <col>, ...)'`` and
        appear in the order they were accepted.
    """
    # No random_state: shuffling is off, so a seed has no effect, and current
    # scikit-learn raises ValueError when random_state is set with shuffle=False.
    Inner_SKF = StratifiedKFold(n_splits=num_cv)
    x_data = x_data.copy()
    init_x_data = x_data.copy()

    def mean_cv_score(frame):
        return cross_val_score(Model, frame, y_data, n_jobs= -1,
                               cv = Inner_SKF, scoring = scoring).mean()

    base_score = mean_cv_score(init_x_data)

    int_float_col = []
    for col in x_data.columns:
        kind = x_data[col].dtype
        unique_values = len(set(x_data[col]))
        # NOTE(review): the grouping below mirrors how Python parsed the
        # original unparenthesised test ('and' binds tighter than 'or'), so
        # thresh only restricts int64 columns - confirm that is intended.
        if kind == 'float64' or (kind == 'int64' and unique_values > thresh):
            int_float_col.append(col)

    if max_features:
        mult_comb = len(int_float_col)

    for num_comb in np.arange(2, mult_comb + 1):
        products = {}
        for comb in itertools.combinations(int_float_col, num_comb):
            members = init_x_data.loc[:, list(comb)]
            # Row-wise product on the frame's own index; skipna=False lets a
            # missing factor make the product missing, and the float cast
            # keeps the result dtype independent of the factor dtypes.
            products['PRODUCT OF' + ' (' + ', '.join(members.columns) + ')'] = (
                members.prod(axis=1, skipna=False).astype(float))
        x_data_comb = pd.DataFrame(products, index=init_x_data.index)
        comb_features = x_data_comb.columns.tolist()
        for _ in range(len(int_float_col)):
            # A candidate has to beat the best score reached so far.
            max_score, best_feature = base_score, None
            for col in comb_features:
                Inner_x_data = x_data.copy()
                Inner_x_data[col] = x_data_comb[col]
                score = mean_cv_score(Inner_x_data)
                if score > max_score:
                    max_score = score
                    best_feature = col
            if best_feature is None:
                break
            base_score = max_score
            comb_features.remove(best_feature)
            x_data[best_feature] = x_data_comb[best_feature]

    if add_features:
        return x_data
    # List (not set) difference keeps the column order deterministic.
    accepted = [name for name in x_data.columns if name not in init_x_data.columns]
    return x_data.loc[:, accepted]

In [21]:
def sort_features_by_Lasso (x_data, y_data):
    """Keep the columns of ``x_data`` that a default ``Lasso`` fit gives a non-zero weight.

    A ``Lasso`` with default settings is fitted on ``(x_data, y_data)`` and
    the columns whose coefficient differs from zero are returned, in their
    original order.
    """
    model = Lasso()
    model.fit(x_data, y_data)
    kept_positions = [position
                      for position, weight in enumerate(model.coef_)
                      if weight != 0]
    return x_data.iloc[:, kept_positions]
    