# Example usage

To use `instrumentum` in a project:

In [5]:
import instrumentum

# Show the version string exposed by the imported package.
print(instrumentum.__version__)

0.2.0


In [2]:
import instrumentum.time_series as ts
import pandas as pd

# Load the time-series sample CSV through a relative path.
data_file = "./sample_data/time_series.csv"
data_df = pd.read_csv(data_file)

# Print the key/record totals for the frame, with ACQUIRED as the target
# column and CUSTOMER_KEY as the key column.
ts.ts_print(data_df, col_target="ACQUIRED", col_key="CUSTOMER_KEY")

Total unique keys that with a possitive value:  2
Total records of ckeys that with a possitive value:  13
Total unique keys with no possitive value:  2
Total records of customers with no possitive values:  10


In [4]:
import instrumentum.features.stepwise as sw
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
import logging

from instrumentum.utils.scorer_wrappers import optimizer_optuna_rf as rf, optimizer_optuna_xgb as xgb_

# Load the simple sample CSV through a relative path.
data_file = "./sample_data/simple.csv"
data_df = pd.read_csv(data_file)

# Recode target values of -1 to 0; every other value is left as it is.
data_df["target"] = data_df["target"].mask(data_df["target"] == -1, 0)

def myfunc2(X_train, y_train):
    """Return the first of the two values produced by ``xgb_``.

    ``xgb_`` (imported from ``optimizer_optuna_xgb``) is called with
    12 trials, 1 repeat and verbosity off. Its result is unpacked as a pair
    and the second element is discarded.
    NOTE(review): the first element is presumably the tuned score - confirm
    against the wrapper.
    """
    first_value, _discarded = xgb_(
        X_train,
        y_train,
        n_trials=12,
        n_repeats=1,
        verbose=False,
    )
    return first_value


def myfunc(X_train, y_train):
    """Return the mean cross-validated ROC AUC of a decision tree.

    Parameters
    ----------
    X_train
        Feature matrix, passed unchanged to ``cross_val_score``.
    y_train
        Target vector, passed unchanged to ``cross_val_score``.

    Returns
    -------
    float
        Mean of the ``roc_auc`` scores over 5 stratified folds repeated
        twice (10 fits in total).
    """
    model = DecisionTreeClassifier(random_state=0)
    # Seed the splitter as well as the model: with random_state left unset
    # every call draws different folds, so two calls on the same data return
    # different scores and feature sets cannot be compared reliably.
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)
    scores = cross_val_score(model, X_train, y_train, scoring="roc_auc", cv=cv)
    return scores.mean()

# Forward stepwise selection over columns a-f against the target column,
# with the library's verbosity set to WARNING.
ret = sw.forward_stepwise(
    data_df.drop("target", axis=1)[["a", "b", "c", "d", "e", "f"]],
    data_df["target"],
    n_combs=2,
    rounding=4,
    add_always=True,
    n_jobs=1,
    verbose=logging.WARNING,
)


In [68]:
import pandas as pd
import numpy as np
from instrumentum.feature_preprocess import to_numeric
from sklearn.impute import SimpleImputer
import logging
# Relative path instead of an absolute path into one user's home directory,
# so the cell runs from any checkout of the docs folder.
# NOTE(review): data_df is rebuilt from raw_data later in this cell, so this
# load is currently not used by the code that follows.
data_file = "./sample_data/simple.csv"
data_df = pd.read_csv(data_file)

# Small hand-written frame: strings, identical timestamps, integers, a mixed
# bool/str column and a float column, with NaNs in three of the columns.
raw_data = {
    "first_name": ["Jason", np.nan, "Tina", "Jake", "Amy"],
    "last_name": [pd.Timestamp("20180310") for _ in range(5)],
    "age": [22, 2, 2, 24, 25],
    "sex": ["m", np.nan, "f", "m", "f"],
    "Test1_Score": [False, True, "la", True, True],
    "Test2_Score": [25, np.nan, 3, 25, 0],
}
# Column order is the dict's insertion order.
data_df = pd.DataFrame(raw_data, columns=list(raw_data))


# Logger shared by the functions defined in this cell.
logger = logging.getLogger(__name__)

def _rescue_numeric_missing_data(df, col, scorer, col_y):

    print(df)
    

    check_significance = scorer is not None and col_y is not None

    strategies = {}

    col_flag_nans = df[col].notnull().astype('int')

    strategies['mean'] = pd.concat({col + "__imp_mean": df[col].fillna(df[col].mean()), \
        col + "__was_nan" : col_flag_nans},axis=1)

    strategies['median'] = pd.concat({col + "__imp_median": df[col].fillna(df[col].mean()), \
        col + "__was_nan" : col_flag_nans},axis=1)
    
    #pd.concat([df[col].fillna(df[col].mean()), col_flag_nans], axis=1)#, [col + "_imp_mean", col + "was_nan"])
    #strategies['median'] = pd.DataFrame([df[col].fillna(df[col].median()), col_flag_nans], [col + "_imp_mean", col + "was_nan"])
    # df['mean'] =  df[col].fillna(df[col].mean())
    # df['median'] =  df[col].fillna(df[col].median())
    # imputer = SimpleImputer(strategy='mean')
    # df['mean'] = imputer.fit_transform(df[col].values.reshape(-1,1))[:,0]

    # imputer = SimpleImputer(strategy='median')
    # df['median'] = imputer.fit_transform(df[col].values.reshape(-1,1))[:,0]

    # imputer = SimpleImputer(strategy='constant')
    # df['constant'] = imputer.fit_transform(df[col].values.reshape(-1,1))[:,0]

    # pd.concat([df1, df4.reindex(df1.index)], axis=1)
    # df['mean'] = df[col].isnull().astype('int')
    # print(df)


    #print(strategies['mean'])

    #print(strategies['mean'])
    print(df.join(strategies['mean'].reindex(df.index)))


def trasform_to_numeric(df, verbose=logging.INFO, scorer=None, col_y=None):
    """Walk the columns of ``df`` and decide how each one is handled.

    Per column, in this order:

    * numeric with no missing values: kept as is;
    * more than ``MISS_TOO_MUCH`` (98%) missing: recorded as removed;
    * numeric with some missing values: handed to
      ``_rescue_numeric_missing_data``;
    * anything else: only its numeric flag and missing ratio are logged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified here.
    verbose : int
        Logging level applied to the module logger.
    scorer, col_y
        Forwarded unchanged to ``_rescue_numeric_missing_data``.

    Returns
    -------
    None
        The list of removed columns is tracked locally only.
    """
    # TODO: validate that scorer and col_y are supplied together.

    MISS_TOO_MUCH = 0.98
    logger.setLevel(verbose)

    removed_cols = []

    for col in df.columns:
        logger.info("Processing: %s", col)

        is_numeric = pd.api.types.is_numeric_dtype(df[col])
        perc_miss = df[col].isnull().mean()

        if is_numeric and perc_miss == 0:
            logger.info("-- Numeric and not missing data. Keeping as is")
            continue

        if perc_miss > MISS_TOO_MUCH:
            logger.info("-- Missing %s of the data (more than threshold of %s). Removing\n", perc_miss, MISS_TOO_MUCH)
            # append, not +=: extending a list with a string adds one entry
            # per character, and fails outright for non-string labels.
            removed_cols.append(col)
            continue

        if is_numeric:
            # Only reached when the numeric column has missing values.
            # Forward the caller's col_y; the column name was previously
            # passed in its place.
            _rescue_numeric_missing_data(df, col, scorer, col_y)
            continue

        logger.info(is_numeric)
        logger.info(perc_miss)

# Run the column pass on data_df with the default INFO verbosity and no scorer.
trasform_to_numeric(data_df)

21-10-24 23:20 | INFO | Processing: first_name
21-10-24 23:20 | INFO | False
21-10-24 23:20 | INFO | 0.2
21-10-24 23:20 | INFO | Processing: last_name
21-10-24 23:20 | INFO | False
21-10-24 23:20 | INFO | 0.0
21-10-24 23:20 | INFO | Processing: age
21-10-24 23:20 | INFO | -- Numeric and not missing data. Keeping as is
21-10-24 23:20 | INFO | Processing: sex
21-10-24 23:20 | INFO | False
21-10-24 23:20 | INFO | 0.2
21-10-24 23:20 | INFO | Processing: Test1_Score
21-10-24 23:20 | INFO | False
21-10-24 23:20 | INFO | 0.0
21-10-24 23:20 | INFO | Processing: Test2_Score
  first_name  last_name  age  sex Test1_Score  Test2_Score
0      Jason 2018-03-10   22    m       False         25.0
1        NaN 2018-03-10    2  NaN        True          NaN
2       Tina 2018-03-10    2    f          la          3.0
3       Jake 2018-03-10   24    m        True         25.0
4        Amy 2018-03-10   25    f        True          0.0
  first_name  last_name  age  sex Test1_Score  Test2_Score  \
0      Jason

In [22]:
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from instrumentum.scorers.optuna_wrappers import wrapper_opt_lgbm
import pandas as pd

# Relative path instead of an absolute path into one user's home directory,
# so the cell runs from any checkout of the docs folder.
data_file = "./sample_data/simple.csv"
data_df = pd.read_csv(data_file)

# Recode target values of -1 to 0.
data_df['target'] = data_df['target'].replace([-1], 0)

# Split into the feature frame and the target series.
X = data_df.drop("target", axis=1)
y = data_df['target']

#CatBoostClassifier