## Import ##

In [612]:
import numpy as np
import pandas as pd
import os
import ray
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import sys
import warnings
warnings.filterwarnings('ignore')

<div class="alert alert-block alert-success">
<b>Comment:</b> This is the first version of a random feature generator, implemented as plain functions.

After some bug fixes it will be refactored and encapsulated in a class.
</div>

## Test data - Titanic from Kaggle ##

https://www.kaggle.com/competitions/titanic/

<div class="alert alert-block alert-warning">
<b>Note:</b> We will use only numeric columns for testing.

TODO: support categorical features.
</div>



In [880]:
# Read the Titanic training data; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='Titanic/train.csv')

In [881]:
# The functions below expect the label column to be called 'y'.
df = df.rename(columns={'Survived': 'y'})

In [882]:
# Keep only the numeric columns plus the label.
df = df.loc[:, ['Age', 'Fare', 'PassengerId', 'Pclass', 'y']]

In [883]:
# Drop rows with missing values instead of imputing them.
# Rebinding (rather than inplace=True) avoids mutating a frame that was
# produced by a column selection, which is the SettingWithCopy pattern.
df = df.dropna()

In [884]:
# Display the cleaned frame (pandas truncates the rendering of long frames).
df

Unnamed: 0,Age,Fare,PassengerId,Pclass,y
0,22.0,7.2500,1,3,0
1,38.0,71.2833,2,1,1
2,26.0,7.9250,3,3,1
3,35.0,53.1000,4,1,1
4,35.0,8.0500,5,3,0
...,...,...,...,...,...
885,39.0,29.1250,886,3,0
886,27.0,13.0000,887,2,0
887,19.0,30.0000,888,1,1
889,26.0,30.0000,890,1,1


Let's define a helper for setting the random seed.

In [577]:
def set_seed(seed: int = 1):
    """Seed NumPy's global random generator.

    Params:
    seed: int = 1 - value passed to np.random.seed.
    """
    np.random.seed(seed)

## Functions ##

In [867]:
#@ray.remote
def fit(df: pd.DataFrame, algorithm, metric, metric_highter_better: bool = True, 
        use_subsample: int = 2, feature_combiner: bool = False, EPOCHS: int = 1, 
        iterations: int = 10, replace_cols: bool = True) -> tuple:
    
    """ 
    Run feature_creator over random groups of columns and keep the best epoch.
    
    In every epoch the feature columns are split at random into disjoint groups of
    (at most) use_subsample columns. Each group is passed to feature_creator and the
    returned frames are concatenated column-wise into the epoch result.
    
    Params:
    df: pd.DataFrame - feature columns plus a label column named 'y' (df is not modified), 
    algorithm - callable returning an object with .fit(X, y) and .predict(X) methods, 
    metric - callable invoked as metric(y_true, y_pred) and returning a number, 
    metric_highter_better: bool = True - True when larger metric values are better 
    (accuracy), False when smaller values are better (mse), 
    use_subsample: int = 2 - number of columns in one group; the last group of an epoch 
    is smaller when the column count is not a multiple of use_subsample, 
    feature_combiner: bool = False - currently unused (TO DO), 
    EPOCHS: int = 1 - number of epochs, 
    iterations: int = 10 - number of transformation attempts per group, 
    replace_cols: bool = True - overwrite the transformed column (True) or add the 
    transformed values as a new column (False).
    
    Return:
    
    tuple:
        best_res_df - features of the best epoch (the untouched features if no epoch 
        was accepted, e.g. EPOCHS = 0), 
        all_best_score - score of the best epoch (-inf / +inf if no epoch was accepted), 
        all_best_tranformes - transformations reported for the best epoch, 
        scores - dict of scores returned by the last feature_creator call.
    
    Raises:
    ValueError - if use_subsample < 1 or df has no feature columns.
    
    NOTE: an epoch is scored by the best_score of the group processed last in that 
    epoch, and all_best_tranformes holds the transformations of that group only.
    """
    
    if use_subsample < 1:
        raise ValueError('use_subsample must be >= 1, got {}'.format(use_subsample))
    
    y = df['y']
    # drop() returns a new frame, so the caller's df stays untouched
    features = df.drop('y', axis = 1)
    n_features = features.shape[1]
    if n_features == 0:
        raise ValueError('df must contain at least one feature column besides \'y\'')
    
    # infinite sentinels: the first comparable epoch score always wins,
    # whatever the value range of the metric is
    all_best_score = -np.inf if metric_highter_better else np.inf
    
    # fallbacks, returned when no epoch is accepted
    best_res_df = features.copy()
    all_best_tranformes = []
    scores = dict()
        
    for epoch in range(0, EPOCHS):
        epoch_parts = []
        set_cols = set(range(0, n_features))

        while set_cols:
            # never ask for more columns than are left: choice(replace=False) would raise
            group_size = min(use_subsample, len(set_cols))
            num_cols = np.random.choice(a = list(set_cols), replace = False, size = group_size)

            current_subsample, best_score, best_tranformes, scores = feature_creator(
                subsample = features[features.columns[num_cols]].copy(), 
                y_col = y, 
                algorithm_object = algorithm, 
                metric = metric, 
                metric_highter_better = metric_highter_better, 
                iterations = iterations, 
                replace_cols = replace_cols)

            epoch_parts.append(current_subsample)
            set_cols.difference_update(int(col) for col in num_cols)
        
        # one concat per epoch instead of growing a frame inside the loop
        res_df = pd.concat(epoch_parts, axis = 1)
        
        if metric_highter_better:
            improved = best_score > all_best_score
        else:
            improved = best_score < all_best_score
        
        if improved:
            all_best_score = best_score
            best_res_df = res_df
            all_best_tranformes = best_tranformes
        
    return best_res_df, all_best_score, all_best_tranformes, scores

In [878]:
def feature_creator(subsample: pd.DataFrame, y_col: np.array, 
                    algorithm_object: object, metric: object, metric_highter_better: bool = True, 
                    iterations: int = 10, seed: int = 1, replace_cols: bool = True) -> tuple:
    
    """
    Try random column transformations and keep those that improve the metric.
    
    On every iteration a random transformation is applied to a random column of the 
    subsample, a fresh model is trained on the first 67% of the rows and scored on the 
    remaining 33% (no shuffling). The transformation is kept when the score beats the 
    best score so far, otherwise it is undone.
    
    Params:
    subsample: pd.DataFrame - columns to work on (a copy is used, the argument is not modified), 
    y_col: np.array or pd.Series - label column, 
    algorithm_object - callable returning an object with .fit(X, y) and .predict(X) methods, 
    metric - callable invoked as metric(y_true, y_pred) and returning a number, 
    metric_highter_better: bool = True - True when larger metric values are better 
    (accuracy), False when smaller values are better (mse), 
    iterations: int = 10 - number of transformation attempts, 
    seed: int = 1 - forwarded to train_test_split as random_state (the split itself 
    is done with shuffle=False), 
    replace_cols: bool = True - overwrite the chosen column (True) or store the 
    transformed values in a new column named '<column>_<iteration>' (False).
    
    Return:
    
    tuple:
        subsample - frame with the accepted transformations applied, 
        best_score - best score reached (-inf / +inf when nothing was accepted), 
        best_tranformes - list of (transform name, source column, score) for accepted steps, 
        scores - dict transform name -> score of the latest attempt with that transform.
    
    """
    
    # some data transformers (applied element-wise via Series.apply)
    def deg2(x):
        return x**2
    def deg3(x):
        return x**3
    def ln(x):
        if (x == 0):
            x = x + 1e-5
        return np.log(x)
    def log2(x):
        if (x == 0):
            x = x + 1e-5
        return np.log2(x)
    def log10(x):
        if (x == 0):
            x = x + 1e-5
        return np.log10(x)
    def filtered_exp(x):
        # exp(x) with overflow replaced by a fixed cap; the overflow warning is
        # silenced locally because it is handled right here
        with np.errstate(over = 'ignore'):
            value = np.exp(x)
        if np.isinf(value):
            # sys.float_info.max doesn't work and we don't need this
            return 10000000
        return value
    def add_dg2(x):
        return x + x**2
    def add_dg3(x):
        return x + x**3
    def add_dg2_dg3(x):
        return x + x**2 + x**3
    
    type_of_transform = {'exp': filtered_exp, 
                         'ln': ln, 'log2': log2, 'log10': log10, 
                         'deg2':deg2, 'deg3':deg3, 'add_dg2': add_dg2,
                         'add_dg3': add_dg3,'add_dg2_dg3':add_dg2_dg3,
                         'sin': np.sin, 'cos': np.cos, }
    
    # infinite sentinels work for any metric range (negative scores, huge errors)
    best_score = -np.inf if metric_highter_better else np.inf
    
    # private copy: the caller's frame must not change
    subsample = subsample.copy()
    scores = dict()
    best_tranformes = []
    
    for current_iter in range(0, iterations):
        choice_transform = str(np.random.choice(a = list(type_of_transform.keys())))
        choice_column = np.random.choice(a = list(range(0, subsample.shape[1])))
        source_name = subsample.columns[choice_column]
        transformed = subsample[source_name].apply(type_of_transform[choice_transform])
        
        if replace_cols:
            target_name = source_name
            # remember the current state of the column (including earlier accepted
            # transformations) so that a rejected step can be undone exactly
            previous_values = subsample[source_name].copy()
        else:
            target_name = '{}_{}'.format(source_name, current_iter)
            previous_values = None
        subsample[target_name] = transformed
        
        # log of a negative number gives NaN, large powers can give inf:
        # both are replaced by 0 before training
        subsample = subsample.replace([np.inf, -np.inf], np.nan)
        if subsample.isna().any().any():
            subsample = subsample.fillna(value = 0)
            
        X_train, X_test, y_train, y_test = train_test_split(subsample, 
                                                            y_col, 
                                                            test_size=0.33, 
                                                            shuffle=False, 
                                                            random_state=seed)
        alg_instance = algorithm_object()
        alg_instance.fit(X = X_train, y = y_train)
        
        y_pred = alg_instance.predict(X_test)
        score = metric(y_test, y_pred)
        scores[choice_transform] = score
        
        if metric_highter_better:
            improved = score > best_score
        else:
            improved = score < best_score
        
        if improved:
            # leave changes and continue
            best_score = score
            best_tranformes.append((choice_transform, source_name, best_score))
        elif replace_cols:
            # undo only this step
            subsample[source_name] = previous_values
        else:
            subsample = subsample.drop(target_name, axis = 1)
    
    return subsample, best_score, best_tranformes, scores

In [879]:
# Generate features with a short search (3 epochs, 3 attempts per column group),
# adding accepted transformations as new columns (replace_cols = False).
res_df, bs, bt, sc = fit(df = df, algorithm = LogisticRegression, metric = accuracy_score, metric_highter_better = True, 
        use_subsample = 2, replace_cols = False, iterations = 3, EPOCHS = 3)
#print(res_df.shape[1])
#print(bs)
# Train a fresh LogisticRegression on the generated features using a random
# 67/33 split (random_state=1).
X_train, X_test, y_train, y_test = train_test_split(res_df, df['y'], test_size=0.33, random_state=1)
algorithm = LogisticRegression()
algorithm.fit(X = X_train, y = y_train)
y_pred = algorithm.predict(X_test)
# NOTE: `score` is computed here but not displayed by this cell.
score = accuracy_score(y_test, y_pred)
# Cell output: best score, best transformations and the scores dict returned by fit.
bs, bt, sc

(0.7330508474576272,
 [('deg2', 'Pclass', 0.7330508474576272)],
 {'add_dg2': 0.635593220338983, 'sin': 0.690677966101695})

Compare the original algorithm on the original data with the same algorithm on the transformed data.

In [871]:
def compare(df, alg, metric, metric_highter_better = True, use_subsample = 2, 
            replace_cols = False, iterations = 3, EPOCHS = 15):
    """
    Compare the initial dataset with the transformed dataset produced by fit.
    
    The same algorithm is trained twice on a random 67/33 split (random_state=1):
    once on the original features of df and once on the features returned by fit.
    Both scores are printed and returned.
    
    Params:
    df - data frame with feature columns and a label column named 'y', 
    alg - callable returning an object with .fit(X, y) and .predict(X) methods, 
    metric - callable invoked as metric(y_true, y_pred), 
    metric_highter_better, use_subsample, replace_cols, iterations, EPOCHS - forwarded to fit.
    
    Return tuple:
    (score_1 - initial metric value, score_2 - new metric value)
    
    NOTE: fit selects transformations using its own unshuffled split of the same rows,
    so the test rows used here are not independent of the feature search.
    """
    def holdout_score(X):
        # train a fresh model on the split and score it on the held-out part
        X_train, X_test, y_train, y_test = train_test_split(X, df['y'], test_size=0.33, random_state=1)
        alg_instance = alg()
        alg_instance.fit(X = X_train, y = y_train)
        y_pred = alg_instance.predict(X_test)
        return metric(y_test, y_pred)
    
    score_1 = holdout_score(df.drop('y', axis = 1))
    
    res_df, bs, bt, sc = fit(df = df, algorithm = alg, metric = metric, 
                             metric_highter_better = metric_highter_better, 
                             use_subsample = use_subsample, replace_cols = replace_cols, 
                             iterations = iterations, EPOCHS = EPOCHS)
    score_2 = holdout_score(res_df)
    
    print(score_1, score_2)
    return score_1, score_2

In [872]:
# Baseline vs. generated features for logistic regression.
compare(df = df, alg = LogisticRegression, metric = accuracy_score)

0.7288135593220338 0.7033898305084746


In [873]:
# Baseline vs. generated features for k-nearest neighbours.
compare(df = df, alg = KNeighborsClassifier, metric = accuracy_score)

0.6059322033898306 0.6949152542372882


In [874]:
# Baseline vs. generated features for a random forest.
compare(df = df, alg = RandomForestClassifier, metric = accuracy_score)

0.7161016949152542 0.7372881355932204
