In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Часть 1. Загрузка и предобработка данных
data = pd.read_excel('OnlineRetail.xlsx')
data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


- InvoiceNo: Номер счета. номинальный, 6-значный интегральный номер, однозначно назначенный каждой транзакции. Если этот код начинается с буквы «с», он указывает на отмену.
- StockCode: Продукт ( item ) код. номинальный, 5-значный интегральный номер, однозначно присвоенный каждому отдельному продукту.
- Description: Продукт ( item ) name. номинальный.
- Quantity: Количество каждого продукта ( item ) на транзакцию. числовой.
- InvoiceDate: Счет Дата и время. числовой, день и время, когда была сгенерирована каждая транзакция.
- UnitPrice: Цена за единицу. числовой, Цена продукта за единицу в фунтах стерлингов.
- CustomerID: Номер клиента. номинальный, 5-значный интегральный номер, однозначно присвоенный каждому клиенту.
- Country: Название страны. номинальный, название страны, в которой проживает каждый клиент.

In [3]:
data.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [4]:
data.dropna(inplace=True)

In [5]:
data.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,406829.0,406829.0,406829.0
mean,12.061303,3.460471,15287.69057
std,248.69337,69.315162,1713.600303
min,-80995.0,0.0,12346.0
25%,2.0,1.25,13953.0
50%,5.0,1.95,15152.0
75%,12.0,3.75,16791.0
max,80995.0,38970.0,18287.0


In [6]:
pd.DataFrame(data['StockCode'].value_counts())

Unnamed: 0,StockCode
85123A,2077
22423,1905
85099B,1662
84879,1418
47566,1416
...,...
84620,1
90038B,1
84551,1
84614A,1


In [7]:
data.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

In [8]:
for col in data.columns:
    print(col,data[col].nunique())

InvoiceNo 22190
StockCode 3684
Description 3896
Quantity 436
InvoiceDate 20460
UnitPrice 620
CustomerID 4372
Country 37


In [9]:
data['InvoiceNo'].astype(str).str.startswith('C').value_counts()

False    397924
True       8905
Name: InvoiceNo, dtype: int64

In [11]:
canselleration_df = data[['CustomerID','InvoiceNo']][data['InvoiceNo'].astype(str).str.startswith('C')].groupby('CustomerID',as_index=False).count()
canselleration_df.rename(columns={'InvoiceNo': 'has_cancellation'},inplace=True)
canselleration_df

Unnamed: 0,CustomerID,has_cancellation
0,12346.0,1
1,12352.0,10
2,12359.0,6
3,12362.0,8
4,12365.0,1
...,...,...
1584,18272.0,4
1585,18274.0,11
1586,18276.0,2
1587,18277.0,1


In [12]:
data = pd.merge(data, canselleration_df[['CustomerID','has_cancellation']], on='CustomerID', how='outer')
data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,has_cancellation
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.0
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,15.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0
...,...,...,...,...,...,...,...,...,...
406824,581578,22993,SET OF 4 PANTRY JELLY MOULDS,12,2011-12-09 12:16:00,1.25,12713.0,Germany,
406825,581578,22907,PACK OF 20 NAPKINS PANTRY DESIGN,12,2011-12-09 12:16:00,0.85,12713.0,Germany,
406826,581578,22908,PACK OF 20 NAPKINS RED APPLES,12,2011-12-09 12:16:00,0.85,12713.0,Germany,
406827,581578,23215,JINGLE BELL HEART ANTIQUE SILVER,12,2011-12-09 12:16:00,2.08,12713.0,Germany,


In [13]:
data = data.fillna(0)

In [14]:
current_date = pd.to_datetime(max(data['InvoiceDate']))
print(current_date)

2011-12-09 12:50:00


In [15]:
type(current_date)

pandas._libs.tslibs.timestamps.Timestamp

In [16]:
last_buy = data[['CustomerID','InvoiceDate']].groupby('CustomerID',as_index=False).agg({'InvoiceDate':'max'})
last_buy

Unnamed: 0,CustomerID,InvoiceDate
0,12346.0,2011-01-18 10:17:00
1,12347.0,2011-12-07 15:52:00
2,12348.0,2011-09-25 13:13:00
3,12349.0,2011-11-21 09:51:00
4,12350.0,2011-02-02 16:01:00
...,...,...
4367,18280.0,2011-03-07 09:52:00
4368,18281.0,2011-06-12 10:53:00
4369,18282.0,2011-12-02 11:43:00
4370,18283.0,2011-12-06 12:02:00


In [17]:
last_buy['Days_after_last_buy'] = (current_date - last_buy['InvoiceDate']).dt.days
last_buy

Unnamed: 0,CustomerID,InvoiceDate,Days_after_last_buy
0,12346.0,2011-01-18 10:17:00,325
1,12347.0,2011-12-07 15:52:00,1
2,12348.0,2011-09-25 13:13:00,74
3,12349.0,2011-11-21 09:51:00,18
4,12350.0,2011-02-02 16:01:00,309
...,...,...,...
4367,18280.0,2011-03-07 09:52:00,277
4368,18281.0,2011-06-12 10:53:00,180
4369,18282.0,2011-12-02 11:43:00,7
4370,18283.0,2011-12-06 12:02:00,3


In [18]:
last_buy['left'] = 0
last_buy.loc[last_buy['Days_after_last_buy'] > 90, 'left'] = 1
last_buy

Unnamed: 0,CustomerID,InvoiceDate,Days_after_last_buy,left
0,12346.0,2011-01-18 10:17:00,325,1
1,12347.0,2011-12-07 15:52:00,1,0
2,12348.0,2011-09-25 13:13:00,74,0
3,12349.0,2011-11-21 09:51:00,18,0
4,12350.0,2011-02-02 16:01:00,309,1
...,...,...,...,...
4367,18280.0,2011-03-07 09:52:00,277,1
4368,18281.0,2011-06-12 10:53:00,180,1
4369,18282.0,2011-12-02 11:43:00,7,0
4370,18283.0,2011-12-06 12:02:00,3,0


In [44]:
last_buy[last_buy["left"]>0]

Unnamed: 0,CustomerID,InvoiceDate,Days_after_last_buy,left
0,12346.0,2011-01-18 10:17:00,325,1
4,12350.0,2011-02-02 16:01:00,309,1
6,12353.0,2011-05-19 17:47:00,203,1
7,12354.0,2011-04-21 13:11:00,231,1
8,12355.0,2011-05-09 13:49:00,213,1
...,...,...,...,...
4355,18262.0,2011-07-22 16:04:00,139,1
4358,18268.0,2011-07-28 19:13:00,133,1
4359,18269.0,2010-12-16 15:39:00,357,1
4367,18280.0,2011-03-07 09:52:00,277,1


In [20]:
data['left']=0
data.loc[data['CustomerID'].isin(last_buy[last_buy["left"]==1]['CustomerID']), 'left'] = 1

In [45]:
data[(data['has_cancellation'] > 0) & (data['left'] == 1)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,has_cancellation,left
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.0,1
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0,1
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,15.0,1
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0,1
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0,1
...,...,...,...,...,...,...,...,...,...,...
355219,563933,22138,BAKING SET 9 PIECE RETROSPOT,2,2011-08-21 15:54:00,4.95,16536.0,United Kingdom,1.0,1
355220,563933,21784,SHOE SHINE BOX,2,2011-08-21 15:54:00,9.95,16536.0,United Kingdom,1.0,1
355221,563933,21880,RED RETROSPOT TAPE,24,2011-08-21 15:54:00,0.65,16536.0,United Kingdom,1.0,1
355222,563933,84974,S/2 ZINC HEART DESIGN PLANTERS,1,2011-08-21 15:54:00,9.95,16536.0,United Kingdom,1.0,1


In [22]:
data['left'].value_counts()

0    358694
1     48135
Name: left, dtype: int64

In [23]:
48135/358694

0.13419516356560188

In [24]:
data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,has_cancellation,left
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.0,1
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0,1
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,15.0,1
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0,1
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0,1
...,...,...,...,...,...,...,...,...,...,...
406824,581578,22993,SET OF 4 PANTRY JELLY MOULDS,12,2011-12-09 12:16:00,1.25,12713.0,Germany,0.0,0
406825,581578,22907,PACK OF 20 NAPKINS PANTRY DESIGN,12,2011-12-09 12:16:00,0.85,12713.0,Germany,0.0,0
406826,581578,22908,PACK OF 20 NAPKINS RED APPLES,12,2011-12-09 12:16:00,0.85,12713.0,Germany,0.0,0
406827,581578,23215,JINGLE BELL HEART ANTIQUE SILVER,12,2011-12-09 12:16:00,2.08,12713.0,Germany,0.0,0


In [25]:
filtered_data = data[['CustomerID', 'InvoiceNo']].groupby('CustomerID', as_index=False).count()
filtered_data = filtered_data[filtered_data['InvoiceNo'] < 10]
data_new = data[~data['CustomerID'].isin(filtered_data['CustomerID'])]
data_new

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,has_cancellation,left
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.0,1
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0,1
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,15.0,1
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0,1
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0,1
...,...,...,...,...,...,...,...,...,...,...
406824,581578,22993,SET OF 4 PANTRY JELLY MOULDS,12,2011-12-09 12:16:00,1.25,12713.0,Germany,0.0,0
406825,581578,22907,PACK OF 20 NAPKINS PANTRY DESIGN,12,2011-12-09 12:16:00,0.85,12713.0,Germany,0.0,0
406826,581578,22908,PACK OF 20 NAPKINS RED APPLES,12,2011-12-09 12:16:00,0.85,12713.0,Germany,0.0,0
406827,581578,23215,JINGLE BELL HEART ANTIQUE SILVER,12,2011-12-09 12:16:00,2.08,12713.0,Germany,0.0,0


In [26]:
filtered_data = data['Description'].value_counts().reset_index()
filtered_data.columns = ['Description', 'Count']
filtered_data = filtered_data[filtered_data['Count'] < 50]
data_new = data[~data['Description'].isin(filtered_data['Description'])]
data_new

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,has_cancellation,left
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.0,1
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0,1
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,15.0,1
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0,1
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,15.0,1
...,...,...,...,...,...,...,...,...,...,...
406824,581578,22993,SET OF 4 PANTRY JELLY MOULDS,12,2011-12-09 12:16:00,1.25,12713.0,Germany,0.0,0
406825,581578,22907,PACK OF 20 NAPKINS PANTRY DESIGN,12,2011-12-09 12:16:00,0.85,12713.0,Germany,0.0,0
406826,581578,22908,PACK OF 20 NAPKINS RED APPLES,12,2011-12-09 12:16:00,0.85,12713.0,Germany,0.0,0
406827,581578,23215,JINGLE BELL HEART ANTIQUE SILVER,12,2011-12-09 12:16:00,2.08,12713.0,Germany,0.0,0


In [27]:
data['left'].value_counts()

0    358694
1     48135
Name: left, dtype: int64

In [28]:
data_new['left'].value_counts()

0    331224
1     43240
Name: left, dtype: int64

In [29]:
filtered_data

Unnamed: 0,Description,Count
1865,IVORY LOVE BIRD CANDLE,49
1866,DECORATION PINK CHICK MAGIC GARDEN,49
1867,SET OF 4 FAIRY CAKE PLACEMATS,49
1868,ZINC FINISH 15CM PLANTER POTS,49
1869,RED POLKADOT COFFEE MUG,49
...,...,...
3891,BAKING MOULD CUPCAKE CHOCOLATE,1
3892,BAKING MOULD TOFFEE CUP CHOCOLATE,1
3893,BLACK GLASS BRACELET W HEART CHARMS,1
3894,PINK CHRYSANTHEMUMS ART FLOWER,1


In [30]:
X = data_new.drop('left',axis=1)
y = data_new['left']

rus = RandomUnderSampler(random_state=42)

X_undersampled, y_undersampled = rus.fit_resample(X, y)


In [31]:
X,y = X_undersampled, y_undersampled

In [46]:
X

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,has_cancellation
0,548701,84536A,ENGLISH ROSE NOTEBOOK A7 SIZE,1.0,2011-04-03 11:36:00,0.42,17629,United Kingdom,1
1,554999,82484,WOOD BLACK BOARD ANT WHITE FINISH,1.0,2011-05-29 15:44:00,7.95,15547,United Kingdom,0
2,540543,22645,CERAMIC HEART FAIRY CAKE MONEY BANK,8.0,2011-01-09 15:23:00,1.45,14395,United Kingdom,3
3,579001,23269,SET OF 2 CERAMIC CHRISTMAS TREES,2.0,2011-11-27 16:19:00,1.45,15555,United Kingdom,26
4,560120,23247,BISCUIT TIN 50'S CHRISTMAS,6.0,2011-07-15 10:27:00,2.89,14907,United Kingdom,3
...,...,...,...,...,...,...,...,...,...
86475,566077,23323,WHITE WICKER STAR,6.0,2011-09-09 09:38:00,2.10,13441,United Kingdom,0
86476,566077,23222,CHRISTMAS TREE HANGING GOLD,12.0,2011-09-09 09:38:00,0.83,13441,United Kingdom,0
86477,566077,22153,ANGEL DECORATION STARS ON DRESS,48.0,2011-09-09 09:38:00,0.42,13441,United Kingdom,0
86478,566077,85053,FRENCH ENAMEL CANDLEHOLDER,48.0,2011-09-09 09:38:00,1.69,13441,United Kingdom,0


In [32]:

numeric_features = ['Quantity','UnitPrice']
string_categorical_features = ['InvoiceNo','StockCode','Description', 'InvoiceDate', 'Country']
numeric_categorical_features = ['CustomerID','has_cancellation']

X[string_categorical_features] = X[string_categorical_features].astype(str)
X[numeric_categorical_features] = X[numeric_categorical_features].astype(int)
X[numeric_features] = X[numeric_features].astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [33]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_features),
        ('cat_str', OneHotEncoder(handle_unknown='ignore'), string_categorical_features),
        ('cat_num', OneHotEncoder(handle_unknown='ignore', dtype=int), numeric_categorical_features)
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(random_state=42))])
pipeline.fit(X_train,y_train)

In [34]:
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1_ = f1_score(y_test, y_pred)

y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("LogisticRegression:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_)
print("ROC-AUC:", roc_auc)


LogisticRegression:
Accuracy: 0.8700855689176689
Precision: 0.8415873355439084
Recall: 0.9112809821635395
F1-score: 0.8750486570650058
ROC-AUC: 0.9425271864194581


In [35]:
encoder = OrdinalEncoder()
encoded_data = encoder.fit_transform(X)
encoded_data

array([[3.752e+03, 1.716e+03, 5.320e+02, ..., 3.696e+03, 3.500e+01,
        1.000e+00],
       [5.808e+03, 1.670e+03, 1.793e+03, ..., 2.254e+03, 3.500e+01,
        0.000e+00],
       [1.243e+03, 9.040e+02, 2.800e+02, ..., 1.443e+03, 3.500e+01,
        3.000e+00],
       ...,
       [9.367e+03, 5.470e+02, 8.900e+01, ..., 7.900e+02, 3.500e+01,
        0.000e+00],
       [9.367e+03, 1.787e+03, 6.110e+02, ..., 7.900e+02, 3.500e+01,
        0.000e+00],
       [9.367e+03, 7.570e+02, 7.420e+02, ..., 7.900e+02, 3.500e+01,
        0.000e+00]])

In [36]:

X_train, X_test, y_train, y_test = train_test_split(encoded_data, y, test_size=0.2, random_state=42)

alpha = 1.0

# Определение модели LASSO
lasso = Lasso(alpha=alpha, random_state=42)

# Создание пайплайна
pipeline = Pipeline([
    ('feature_selection', SelectFromModel(estimator=lasso)),
    ('classifier', LogisticRegression(random_state=42))
])

# Обучение пайплайна
pipeline.fit(X_train, y_train)

# Получение отобранных признаков
selected_features = pipeline.named_steps['feature_selection'].get_support()

In [37]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Получение прогнозов для тестового набора данных
y_pred = pipeline.predict(X_test)

# Вычисление метрик оценки качества модели
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1_ = f1_score(y_test, y_pred)

# Получение вероятностей прогнозов для класса 1 (положительного класса)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

# Вычисление ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Вывод результатов
print("LogisticRegression:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_)
print("ROC-AUC:", roc_auc)


LogisticRegression:
Accuracy: 0.7649745605920444
Precision: 0.749046113594244
Recall: 0.7958072735696086
F1-score: 0.771718986915258
ROC-AUC: 0.8371733560814567


In [38]:
X[string_categorical_features] = X[string_categorical_features].astype(str)
X[numeric_categorical_features] = X[numeric_categorical_features].astype(int)
X[numeric_features] = X[numeric_features].astype(float)

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Определение параметров для поиска
param_grid = {
    'classifier__C': [0.01, 0.1, 1.0],  # пример значений для параметра регуляризации модели
    'classifier__solver': ['liblinear', 'sag', 'lbfgs'],  # пример оптимизаторов модели
    'preprocessor__num__strategy': ['mean'],  # пример гиперпараметров преобразования данных
}

# Создание пайплайна
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_features),
        ('cat_str', OneHotEncoder(handle_unknown='ignore'), string_categorical_features),
        ('cat_num', OneHotEncoder(handle_unknown='ignore', dtype=int), numeric_categorical_features)
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(random_state=42))])

# Поиск оптимальных гиперпараметров
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

# Оценка производительности модели
y_pred = grid_search.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1_ = f1_score(y_test, y_pred)

y_pred_proba = grid_search.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("Logistic Regression:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_)
print("ROC-AUC:", roc_auc)


Logistic Regression:
Accuracy: 0.9871068455134135
Precision: 0.9845604332296347
Recall: 0.9896919156821867
F1-score: 0.9871195055738462
ROC-AUC: 0.999317695362452


In [43]:
from sklearn.model_selection import cross_val_score
best_model = grid_search.best_estimator_
cv_scores = cross_val_score(best_model, X_test, y_test, cv=5)
print("Cross-Validation Accuracy DT: {:.2f} (+/- {:.2f})".format(cv_scores.mean(), cv_scores.std() * 2))


Cross-Validation Accuracy DT: 0.82 (+/- 0.01)
