# Импортируем нужные пакеты

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pandas import Timestamp, DateOffset


%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 8)


# Читаем даные из файлов customers_gender_train.csv, transactions.tsv.gz, mcc_types.tsv и trans_types.tsv.

In [2]:
df_gender = pd.read_csv('data/customers_gender_train.csv')
df_gender.head()

Unnamed: 0,customer_id,gender
0,75562265,0
1,10928546,1
2,69348468,1
3,84816985,1
4,61009479,0


In [4]:
df_transactions = pd.read_csv('data/transactions.csv.gz')
df_transactions.head()

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id
0,39026145,0 10:23:26,4814,1030,-2245.92,
1,39026145,1 10:19:29,6011,7010,56147.89,
2,39026145,1 10:20:56,4829,2330,-56147.89,
3,39026145,1 10:39:54,5499,1010,-1392.47,
4,39026145,2 15:33:42,5499,1010,-920.83,


In [31]:
df_tr = pd.read_csv('data/tr_types.csv', sep=';')
df_tr.head()

Unnamed: 0,tr_type,tr_description
0,3200,Плата за предоставление услуг посредством моби...
1,3210,Плата за предоставление отчета по счету карты ...
2,3800,Плата за обслуживание банковской карты (за пер...
3,4000,Плата за получение наличных в Сбербанке
4,4001,Плата за получение наличных в Сбербанке (в дру...


In [6]:
df_mcc = pd.read_csv('data/tr_mcc_codes.csv', sep=';')
df_mcc

Unnamed: 0,mcc_code,mcc_description
0,742,Ветеринарные услуги
1,1711,"Генеральные подрядчики по вентиляции, теплосна..."
2,1731,Подрядчики по электричеству
3,1799,"Подрядчики, специализированная торговля — нигд..."
4,2741,Разнообразные издательства/печатное дело
5,3000,"Авиалинии, авиакомпании"
6,3351,Агентства по аренде автомобилей
7,3501,"Жилье — отели, мотели, курорты"
8,4111,Транспортировка — пригородные и локальные сезо...
9,4112,Пассажирские железные перевозки


# Преобразуем данные даты и суммы транзакций в transactions в понятный формат

In [7]:
def preproc_transactions(df_transactions):
    sec_per_day = 86400
    sec_per_hour = 3600
    
    start_date = 1420070400 - 154 * sec_per_day - 3 * sec_per_hour
    
    df_transactions.loc[:, 'day'] = df_transactions.tr_datetime\
                                               .str.split(' ')\
                                               .str.get(0)\
                                               .astype(int)
    df_transactions.loc[:, 'time_raw'] = df_transactions.tr_datetime\
                                                    .str.split(' ')\
                                                    .str.get(1)

    # set temp dt
    df_transactions.loc[:, 'dt_temp'] = pd.to_datetime(df_transactions.loc[:, 'time_raw'], 
                                                    format='%H:%M:%S')\
                                        + DateOffset(years=115)
    
    df_transactions = df_transactions.assign(dt = lambda x: x.dt_temp.astype(np.int64) // 10**9
                                             + (x.day - 153) * sec_per_day)\
                                     .assign(weekday = lambda x: ((x.day + 4) % 7 + 1))
        
    df_transactions.loc[:, 'datetime'] = pd.to_datetime(df_transactions.dt, unit='s')
    df_transactions.loc[:, 'date'] = df_transactions.loc[:, 'datetime'].dt.strftime('%Y-%m-%d')
    df_transactions.loc[:, 'hour'] = df_transactions.loc[:, 'datetime'].dt.strftime('%H')
    
    df_transactions = df_transactions.drop(['dt_temp', 'time_raw', 'tr_datetime'], axis=1)
    
    df_transactions.loc[:, 'amount'] = np.round(df_transactions.loc[:, 'amount']/(np.pi**np.exp(1)))
            
    return df_transactions

In [26]:
df_transactions = df_transactions.pipe(preproc_transactions)
df_transactions.head()


AttributeError: 'DataFrame' object has no attribute 'tr_datetime'

# Генерим признаки

In [None]:
#попробуем добавить количество тразакций по MCC и сумму транзакций по рабочим и не рабочим дням.  дальше творчество 
# пошло тяжело и  на основе текстов я признак не придумал:(

In [23]:
def gen_features(df_gender, df_transactions):
    
    df_mcc_counts = \
        df_transactions.pivot_table(index=['customer_id'], columns='mcc_code', values='amount', 
                             aggfunc='count', fill_value=0)

    df_mcc_counts = df_mcc_counts.rename(lambda x: 'mcc_{}_sum'.format(x), axis=1)
    
    df_week_period = df_transactions.pivot_table(index=['customer_id'], columns='weekday', values='amount', 
                             aggfunc='sum')
    df_week_period = df_week_period.rename(lambda x: 'Day_{}'.format(x), axis=1)
        
    df_week_period.loc[:, 'Working Days'] = (df_week_period.loc[:, 'Day_1':'Day_5'].sum(axis=1))
    df_week_period.loc[:, 'Hollydays'] = (df_week_period.loc[:, 'Day_6':'Day_7'].sum(axis=1))
  
       
    # Объединяем:
    df_features = df_gender.join(df_mcc_counts, on='customer_id', how='left')\
                            .join(df_week_period.loc[:, ['Working Days','Hollydays']], on='customer_id', how='left')
        
    
    return df_features

In [24]:
df_features = df_gender.pipe(gen_features, df_transactions)

In [25]:
df_features.head()

Unnamed: 0,customer_id,gender,mcc_742_sum,mcc_1711_sum,mcc_1731_sum,mcc_1799_sum,mcc_2741_sum,mcc_3000_sum,mcc_3351_sum,mcc_3501_sum,...,mcc_8641_sum,mcc_8699_sum,mcc_8999_sum,mcc_9211_sum,mcc_9222_sum,mcc_9311_sum,mcc_9399_sum,mcc_9402_sum,Working Days,Hollydays
0,75562265,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,47299.0,-218012.0
1,10928546,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,2,0,-27299663.0,-4045390.0
2,69348468,1,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,-184694.0,-71066.0
3,84816985,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4978.0,977.0
4,61009479,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,33176.0,-47885.0


In [35]:
label = 'gender'
idx_features = df_features.columns != label

In [36]:
X = df_features.loc[:, idx_features].values
y = df_features.loc[:, ~idx_features].values.flatten()

# Делаем пайплайн

In [32]:
# импортируем нужные библиотеки
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression

In [33]:
# делаем пайплайн который нормализует данные и запускает логистическую регрессию
model = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression())
])

In [34]:
RND_SEED = 123

# Hyper opt

In [None]:
# Грузим все необходимое

In [45]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, rand

In [47]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

In [41]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_val_score

In [None]:
#Обновляем X и y

In [42]:
X = df_features.loc[:, idx_features].values
y = df_features.loc[:, ~idx_features].values.flatten()

In [43]:
def run_trials_template(X, y, params, evals=100):

    def hyperopt_cv(X, y, params):
        
        X_ = X.copy()
        
        # Отделяем параметры лог регрессии в отдельный словарь
        lm_params = {}
        for k, v in params.items():
            if k.startswith('glob'):
                continue                
            elif k.startswith('lm'):
                lm_params[k.split('_', 1)[1]] = v
        
        # Задаем шкалирование
        if params['scaler_type'] == 'standart':
            scaler = StandardScaler(with_mean=params['scaler_centering'])
        else:
            assert params['scaler_type'] == 'robust'
            scaler = RobustScaler(with_centering=params['scaler_centering'])
        
        # Создаем лог рег с нужными параметрами
        clf = LogisticRegression(**lm_params)
        
        # Итоговый пайплайн
        model = Pipeline([
            ('scaler', scaler),
            ('clf', clf)
        ])

        # Схема кросс-валидации
        n_splits = 5
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, 
                             random_state=RND_SEED)
        scores = cross_val_score(model, X_, y,
                                 scoring='roc_auc', 
                                 cv=cv, 
                                 n_jobs=-1)

        # Возвращаем среднее значение метрики и отклонение (на всякий случай)
        return scores.mean(), scores.std()

    def f(params):
        acc, std = hyperopt_cv(X, y, params)
        return {'loss': -acc, 'qscore': -acc, 'qscore_std': std, 'status': STATUS_OK}

    trials = Trials()
    best = fmin(f, 
                params, 
                algo=tpe.suggest, 
                max_evals=evals, 
                trials=trials, 
                verbose=1)
    
    return trials

In [44]:
# Задаем пространство поиска
space4_lm = {
    'lm_penalty': hp.choice('penalty', ['l1', 'l2']),
    'lm_C': hp.loguniform('C', -5, 3),
    'lm_class_weight': hp.choice('class_weight', [None, 'balanced']),
    'lm_random_state': RND_SEED,
    'scaler_type': hp.choice('scaler_type', ['standart', 'robust']),
    'scaler_centering': hp.choice('scaler_centering', [False, True])
}

In [48]:
# Запускаем поиск
trials = run_trials_template(X, y, space4_lm, evals=40)

In [49]:
def trials_df(trials):
    '''
    Функция форматирует результаты hyperopt в dataframe
    '''
    tr_dict = []
    for t in trials:
        trial = dict()
        for k, v in t['misc']['vals'].items():
            trial[k] = v[0]

        trial['qscore'] = -t['result']['qscore']
        trial['qscore_std'] = -t['result']['qscore_std']
        tr_dict.append(trial)

    df_res = pd.DataFrame.from_dict(tr_dict)
    df_res = df_res.sort_values('qscore', ascending=False)
    
    return df_res

In [None]:
#Вытаскиваем результаты поиска в датасет

In [52]:
df_trials = trials_df(trials)

In [None]:
#Здесь `qscore` - метрика качесва, а `scaler_type = 1` означает, что был выбран `scaler_type = robust`

In [51]:
df_trials.head()

Unnamed: 0,C,class_weight,penalty,qscore,qscore_std,scaler_centering,scaler_type
7,0.029941,0,0,0.841355,-0.005362,0,1
37,0.027183,0,0,0.841344,-0.005475,0,1
35,0.032675,0,0,0.841281,-0.005185,0,1
29,0.037326,0,0,0.841055,-0.004987,0,1
30,0.03754,0,0,0.841044,-0.004982,0,1
