In [2]:
import logging
import os
import dill
import pandas as pd
import hashlib

from datetime import datetime
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


# Project root that holds the input data and receives the saved model.
# Set the PROJECT_PATH environment variable to override the default location.
path = os.getenv('PROJECT_PATH', 'E:/final/final')


def filter_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame without the identifier columns.

    ``session_id`` and ``client_id`` are removed; all other columns are kept
    in their original order. The input frame itself is not modified. pandas
    raises ``KeyError`` if either identifier column is missing.
    """
    identifier_columns = ['session_id', 'client_id']
    return df.drop(columns=identifier_columns)


def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive screen-size, calendar and geography features from raw columns.

    The work is done on a copy, so the caller's frame is left untouched.

    Columns added:
        * ``width`` / ``height`` - the two parts of
          ``device_screen_resolution`` split on ``'x'``, converted to numbers.
        * ``year``, ``month``, ``day``, ``hour``, ``minute`` - components of
          ``visit_date`` (parsed as ``%Y-%m-%d``) plus ``visit_time``
          (parsed as a timedelta).
        * ``geo`` - ``geo_country`` and ``geo_city`` joined with ``'/'``.

    The source columns and the intermediate ``datetime`` column are dropped
    from the returned frame.
    """
    features = df.copy()

    # Split the "<width>x<height>" string once and reuse both halves.
    resolution_parts = features['device_screen_resolution'].str.split('x')
    for position, dimension in enumerate(('width', 'height')):
        features[dimension] = pd.to_numeric(resolution_parts.str[position])

    visit_day = pd.to_datetime(features['visit_date'], format='%Y-%m-%d')
    features['datetime'] = visit_day + pd.to_timedelta(features['visit_time'])
    for component in ('year', 'month', 'day', 'hour', 'minute'):
        features[component] = getattr(features['datetime'].dt, component)

    features['geo'] = features['geo_country'] + '/' + features['geo_city']

    consumed_columns = [
        'device_screen_resolution',
        'visit_date',
        'visit_time',
        'datetime',
        'geo_country',
        'geo_city',
    ]
    return features.drop(columns=consumed_columns)


# def hash_features(df, col):
#     df = df.copy()
#     df[col] = df[col].applymap(lambda x: hashlib.sha256(x.encode('utf-8')).hexdigest())
#     return pd.DataFrame(df)


def pipeline() -> None:
    """Train the candidate classifiers, keep the best one and save it to disk.

    Steps:
        1. Read ``{path}/final_job/final_data`` as CSV and split off the
           ``target`` column.
        2. Build the preprocessing pipeline: ``filter_data`` ->
           ``create_features`` -> column transformer (standard scaling for
           the numeric columns, one-hot encoding for the categorical ones).
        3. Score every candidate model with 4-fold cross-validated accuracy.
        4. Refit the best pipeline on the full data set and dump it with dill
           to ``{path}/sber_auto_pipe.pkl`` as a dict with the keys
           ``'model'`` and ``'metadata'``.

    Raises:
        RuntimeError: if no candidate reaches a mean accuracy above zero
            (for example because the cross-validation scores are NaN), so
            there is nothing to fit and save.
    """
    df = pd.read_csv(f'{path}/final_job/final_data', low_memory=False)

    X = df.drop('target', axis=1)
    y = df['target']

    # Names refer to the columns that exist AFTER create_features has run.
    scal_columns = ['visit_number', 'width', 'height', 'year', 'month', 'day', 'hour', 'minute']
    encod_columns = ['utm_medium', 'device_category', 'device_os', 'device_brand', 'device_browser', 'geo']
    # TODO: hashing of the high-cardinality columns (utm_source, utm_campaign,
    # utm_adcontent, utm_keyword, device_model) is not implemented yet.
    # Any column not listed in the two groups above is discarded by the
    # ColumnTransformer below, whose default is remainder='drop'.

    scal_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    encod_transformer = Pipeline(steps=[
        # Categories unseen during fit are encoded as all zeros instead of
        # raising at prediction time.
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    column_transformer = ColumnTransformer(transformers=[
        ('numerical', scal_transformer, scal_columns),
        ('categorical', encod_transformer, encod_columns)
    ])

    preprocessor = Pipeline(steps=[
        ('filter', FunctionTransformer(filter_data)),
        ('feature_creator', FunctionTransformer(create_features)),
        ('column_transformer', column_transformer)
    ])

    models = [
        # The default of 100 iterations stopped the solver at its iteration
        # limit (ConvergenceWarning), so the score came from an unconverged
        # model; give the solver more room to converge.
        LogisticRegression(max_iter=1000),
        RandomForestClassifier()
    ]

    best_score = .0
    best_pipe = None
    for model in models:
        pipe = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])

        scores = cross_val_score(pipe, X, y, cv=4, scoring='accuracy')
        mean_score = scores.mean()
        logging.info(
            'model: %s, acc_mean: %.4f, acc_std: %.4f',
            type(model).__name__, mean_score, scores.std()
        )
        # A NaN mean compares False here, so a failed candidate is never
        # selected.
        if mean_score > best_score:
            best_score = mean_score
            best_pipe = pipe

    if best_pipe is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise RuntimeError(
            'No candidate model achieved a mean cross-validation accuracy above 0; '
            'nothing to fit or save.'
        )

    best_model_name = type(best_pipe.named_steps["classifier"]).__name__
    logging.info('best model: %s, accuracy: %.4f', best_model_name, best_score)

    # cross_val_score works on clones, so the winner still has to be fitted.
    best_pipe.fit(X, y)
    model_filename = f'{path}/sber_auto_pipe.pkl'

    metadata = {
        'name': 'Sber auto-subscription service prediction model',
        'author': 'Akty',
        'version': 1,
        'date': datetime.now(),
        'type': best_model_name,
        'accuracy': best_score
    }

    with open(model_filename, 'wb') as file:
        # recurse=True asks dill to also pickle the objects that the custom
        # transformer functions refer to.
        dill.dump({'model': best_pipe, 'metadata': metadata}, file, recurse=True)

    logging.info('Model is saved as %s', model_filename)


if __name__ == '__main__':
    # With no logging configuration the root logger only emits WARNING and
    # above, so the INFO progress messages from pipeline() would be dropped.
    logging.basicConfig(level=logging.INFO)
    pipeline()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KeyboardInterrupt: 