Загрузим библиотеки

In [133]:
import pandas as pd 

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score

# Предобработка данных

Загрузим данные

In [134]:
apparel = pd.read_csv('apparel-purchases.csv')
gifts = pd.read_csv('gifts-purchases.csv')
marketplace = pd.read_csv('marketplace-purchases.csv')

Посмотрим на таблицы

In [135]:
apparel.head()

Unnamed: 0,client_id,quantity,price,date,message_id,recommended_by
0,1515915625468068833,1,3499.0,2023-09-08,1515915625468068833-13781-64fad81bece56,bulk_message
1,1515915625468068833,1,3499.0,2023-09-08,1515915625468068833-13781-64fad81bece56,bulk_message
2,1515915625468068833,1,3499.0,2023-09-08,1515915625468068833-13781-64fad81bece56,bulk_message
3,1515915625468068833,1,2450.0,2023-09-08,1515915625468068833-13781-64fad81bece56,bulk_message
4,1515915625468068833,1,2450.0,2023-09-08,1515915625468068833-13781-64fad81bece56,bulk_message


In [136]:
gifts.head()

Unnamed: 0,client_id,quantity,price,date,message_id,recommended_by
0,1515915625803894158,1,8943.0,2023-10-11,1515915625935652010-16166-6525589bca68f,bulk_message
1,1515915625803894158,1,46369.0,2023-10-11,1515915625935652010-16166-6525589bca68f,bulk_message
2,1515915625803894158,1,4666.0,2023-10-11,1515915625935652010-16166-6525589bca68f,bulk_message
3,1515915625803894158,1,11963.0,2023-10-11,1515915625935652010-16166-6525589bca68f,bulk_message
4,1515915625803894158,1,13107.0,2023-10-11,1515915625935652010-16166-6525589bca68f,bulk_message


In [137]:
marketplace.head()

Unnamed: 0,client_id,quantity,price,message_id,created_at,date
0,1515915625440944408,1,6298.0,1515915625440944408-1752-6450b79616277,2023-05-02,2023-05-02 08:49:57
1,1515915625440992498,1,7318.0,1515915625440992498-1818-646b35d089c98,2023-05-22,2023-05-22 13:52:13
2,1515915625441026424,1,65798.0,1515915625441026424-1802-64634e24a2cb9,2023-05-17,2023-05-17 11:21:04
3,1515915625441118180,1,5598.0,1515915625441118180-1820-646c9444ebf39,2023-05-23,2023-05-23 19:02:53
4,1515915625441124500,1,77998.0,1515915625441124500-1825-646ed5cf4d5ed,2023-05-25,2023-05-25 07:46:52


соберем таблицы в одну

In [138]:
apparel_df = apparel[['client_id', 'quantity', 'price', 'date']]
marketplace_df = marketplace[['client_id', 'quantity', 'price', 'date']]
gifts_df = gifts[['client_id', 'quantity', 'price', 'date']]

In [139]:
df = pd.concat([marketplace_df, gifts_df, apparel_df])

Выделим признаки

In [140]:
df['date'] = pd.to_datetime(df['date'])

In [141]:
df['max_date'] = df['date'].max()

In [142]:
df['days_diff'] = (df['max_date'] - df['date']).dt.days

In [143]:
df['recency'] = df.groupby('client_id')['days_diff'].transform('min')

In [144]:
df['monetary'] = df.groupby('client_id')['price'].transform('sum')
df['frequency'] = df.groupby('client_id')['quantity'].transform('sum')

Выделим таргет (клиенты за последние 30 дней)

In [145]:
start_date = df['date'].max() - pd.Timedelta(days=29)

In [146]:
df_30 = df[df['date'] >= start_date]

In [147]:
client_30_list = list(df_30['client_id'].unique())

In [148]:
df = df.groupby('client_id').last().reset_index()

In [149]:
df['target'] = 0

In [150]:
df.loc[df['client_id'].isin(client_30_list), 'target'] = 1

Посмотрим сколько их всего

In [151]:
df['target'].mean()

0.058867330439962555

Сделаем датасет для обучение модели

In [152]:
df = df[['recency', 'monetary', 'frequency', 'target']]

# Построение модели

Сделаем сплит

In [153]:
y = df['target']
X = df.drop(['target'], axis=1)

In [154]:
def split_train_valid_test(X, y, train_rem_size, valid_test_size):

    #split to train and remain
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_rem_size, random_state=123)

    df_data = {'train': {'X': X_train, 'y': y_train},
               'test': {'X': X_test, 'y': y_test}}

    return df_data


In [155]:
data = split_train_valid_test(X, y, 0.8, 0.8)

In [156]:
len(data['train']['X']), len(data['train']['y']), len(data['test']['X']), len(data['test']['y'])

(37603, 37603, 9401, 9401)

Напишем функции для кроссвалидации данных и зафитим LogisticRegression и RandomForestClassifier. Проведем тюнинг

In [176]:
def get_grid_params(
                    depth=        10,
                    est =         200,
                    depth_step=   4,
                    step_est =    10,
                    scoring=      'recall',
                    class_weight= "balanced",
                    njobs=        12,
                    refit=        True):

    pipeline = Pipeline([('clf', LogisticRegression(random_state=123))])
    parameters = [
        {
            'clf': (LogisticRegression(random_state=123),),
            'clf__class_weight': [class_weight]
        }, 

        {
            'clf': (RandomForestClassifier(random_state=123),),
            'clf__n_estimators': range(1, est, step_est),
            'clf__max_depth': range(1, depth, depth_step),
            'clf__class_weight': [class_weight]
        }
    ]

    return GridSearchCV(pipeline, parameters, scoring=scoring, n_jobs=njobs, refit=refit)

In [177]:
def fit_and_results(data,
                    grid_search):

    grid_search.fit(data['train']['X'], data['train']['y'])
    print('Best estimator parameters:')
    print(grid_search.best_params_)
    print('f1 valid score:')
    print(grid_search.best_score_)
    print('recall test score:')
    test_score = recall_score(grid_search.best_estimator_.predict(data['test']['X']), data['test']['y'])
    print(test_score)


Посмотрим результат

In [178]:
grid_search = get_grid_params(depth=10,
                              depth_step=5,
                              est=100,
                              step_est=20,
                              scoring='recall',
                              class_weight="balanced"
                            )

fit_and_results(data, grid_search)

Best estimator parameters:
{'clf': RandomForestClassifier(class_weight='balanced', max_depth=1, n_estimators=1,
                       random_state=123), 'clf__class_weight': 'balanced', 'clf__max_depth': 1, 'clf__n_estimators': 1}
f1 valid score:
1.0
recall test score:
1.0
