In [17]:
# Импорт библиотек
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.utils import to_categorical
import os
import struct
import xgboost as xgb

In [2]:
# Load and prepare the data
# Online Retail dataset: read the 'Online Retail' sheet of the workbook.
retail_workbook = 'data1/Online Retail.xlsx'
df = pd.read_excel(retail_workbook, sheet_name='Online Retail')

# Show the first rows of the frame as plain text.
first_rows = df.head()
print(first_rows)

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  


In [11]:
def load_mnist_images(filename):
    """Load an uncompressed IDX3 image file (MNIST layout) into a uint8 array.

    The file starts with a 16-byte big-endian header of four unsigned 32-bit
    integers (magic number, image count, rows, columns), followed by one
    unsigned byte per pixel.

    Args:
        filename: Path to the uncompressed ``*-images-idx3-ubyte`` file.

    Returns:
        numpy.ndarray of dtype ``uint8`` with shape ``(num_images, rows, cols)``.

    Raises:
        ValueError: If the magic number is not 2051 (for example a label file
            or a still-gzipped download was passed), or if the file contains
            fewer pixel bytes than the header announces.
    """
    with open(filename, 'rb') as f:
        magic, num_images, rows, cols = struct.unpack('>IIII', f.read(16))
        # 2051 (0x00000803) identifies an IDX file of unsigned bytes with 3 dimensions.
        if magic != 2051:
            raise ValueError(
                f"{filename}: unexpected magic number {magic}; expected 2051 "
                "for an uncompressed IDX3 image file"
            )
        expected = num_images * rows * cols
        # Read exactly the announced number of pixels; bytes past that are ignored.
        pixels = np.fromfile(f, dtype=np.uint8, count=expected)
    if pixels.size != expected:
        raise ValueError(
            f"{filename}: header announces {expected} pixel bytes "
            f"but only {pixels.size} could be read"
        )
    return pixels.reshape(num_images, rows, cols)

def load_mnist_labels(filename):
    """Load an uncompressed IDX1 label file (MNIST layout) into a uint8 array.

    The file starts with an 8-byte big-endian header of two unsigned 32-bit
    integers (magic number, label count), followed by one unsigned byte per
    label.

    Args:
        filename: Path to the uncompressed ``*-labels-idx1-ubyte`` file.

    Returns:
        numpy.ndarray of dtype ``uint8`` with shape ``(num_labels,)``.

    Raises:
        ValueError: If the magic number is not 2049 (for example an image file
            or a still-gzipped download was passed), or if the file contains
            fewer labels than the header announces.
    """
    with open(filename, 'rb') as f:
        magic, num_labels = struct.unpack('>II', f.read(8))
        # 2049 (0x00000801) identifies an IDX file of unsigned bytes with 1 dimension.
        if magic != 2049:
            raise ValueError(
                f"{filename}: unexpected magic number {magic}; expected 2049 "
                "for an uncompressed IDX1 label file"
            )
        # Read exactly the announced number of labels; bytes past that are ignored.
        labels = np.fromfile(f, dtype=np.uint8, count=num_labels)
    if labels.size != num_labels:
        raise ValueError(
            f"{filename}: header announces {num_labels} labels "
            f"but only {labels.size} could be read"
        )
    return labels

# Directory that holds the four uncompressed IDX files.
data_dir = 'data2'

# Resolve each file path once, then load images and labels for both splits.
train_images_path = os.path.join(data_dir, 'train-images-idx3-ubyte')
train_labels_path = os.path.join(data_dir, 'train-labels-idx1-ubyte')
test_images_path = os.path.join(data_dir, 't10k-images-idx3-ubyte')
test_labels_path = os.path.join(data_dir, 't10k-labels-idx1-ubyte')

# NOTE(review): y_train / y_test are reassigned by the Online Retail split
# further down in this notebook; re-running the CNN cells after that cell
# would pair these images with the retail labels.
x_train = load_mnist_images(train_images_path)
y_train = load_mnist_labels(train_labels_path)
x_test = load_mnist_images(test_images_path)
y_test = load_mnist_labels(test_labels_path)

# Rescale pixel intensities from [0, 255] to [0, 1]; true division also
# converts the uint8 arrays to floating point.
x_train = x_train / 255.0
x_test = x_test / 255.0

# Report array shapes as a quick sanity check.
print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

x_train shape: (60000, 28, 28)
y_train shape: (60000,)
x_test shape: (10000, 28, 28)
y_test shape: (10000,)


In [12]:
# Build the CNN: one convolution + max-pooling stage followed by a dense classifier.
model_cnn = Sequential([
    # Declare the input with an explicit Input object instead of passing
    # input_shape= to the first layer, which Keras 3 deprecates for
    # Sequential models. Images are 28x28 with a single channel.
    tf.keras.Input(shape=(28, 28, 1)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    # One output unit per class; softmax yields class probabilities.
    Dense(10, activation='softmax')
])

# sparse_categorical_crossentropy consumes integer class labels directly,
# so the labels do not need to be one-hot encoded.
model_cnn.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])


In [13]:
# Train the CNN for 5 epochs, holding out 10% of the training data for validation.
x_train_cnn = np.expand_dims(x_train, axis=-1)  # append the channel axis: (N, 28, 28) -> (N, 28, 28, 1)
model_cnn.fit(x_train_cnn, y_train, epochs=5, validation_split=0.1)

Epoch 1/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 10ms/step - accuracy: 0.8060 - loss: 0.5442 - val_accuracy: 0.8973 - val_loss: 0.2886
Epoch 2/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 13ms/step - accuracy: 0.9006 - loss: 0.2725 - val_accuracy: 0.9022 - val_loss: 0.2715
Epoch 3/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 14ms/step - accuracy: 0.9173 - loss: 0.2244 - val_accuracy: 0.9043 - val_loss: 0.2569
Epoch 4/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - accuracy: 0.9310 - loss: 0.1890 - val_accuracy: 0.9120 - val_loss: 0.2491
Epoch 5/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.9409 - loss: 0.1618 - val_accuracy: 0.9105 - val_loss: 0.2541


<keras.src.callbacks.history.History at 0x14063dc10>

In [15]:
# Evaluate the trained CNN on the test images and report its accuracy.
x_test_cnn = np.expand_dims(x_test, axis=-1)  # append the channel axis, as done for training
test_loss, test_acc = model_cnn.evaluate(x_test_cnn, y_test)
print(f'\nTest accuracy: {test_acc}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9071 - loss: 0.2668

Test accuracy: 0.9078999757766724


### Подготовка данных датасета Online Retail

In [4]:
# Data cleaning
# Drop rows without a CustomerID. Reassigning instead of using inplace=True
# keeps the step explicit and free of in-place mutation.
df = df.dropna(subset=['CustomerID'])
# Keep only rows with a strictly positive Quantity (zero and negative values
# are removed; negatives are presumably returns/cancellations — verify).
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
df = df[df['Quantity'] > 0].copy()

In [5]:
# Aggregate the data by month and customer.
# Parse the invoice timestamps and derive a monthly period for every row.
invoice_dates = pd.to_datetime(df['InvoiceDate'])
df['InvoiceDate'] = invoice_dates
df['Month'] = invoice_dates.dt.to_period('M')

# Total Quantity per (Month, CustomerID) pair, returned as a flat frame.
quantity_per_month_customer = df.groupby(['Month', 'CustomerID'])['Quantity'].sum()
monthly_sales = quantity_per_month_customer.reset_index()

In [6]:
# Example aggregation: total Quantity per (Month, CustomerID) pair.
# NOTE(review): this recomputes the same aggregation as the previous cell.
monthly_sales = (
    df.groupby(['Month', 'CustomerID'])['Quantity']
    .sum()
    .reset_index()
)

In [7]:
# Model for predicting customer behaviour
# Build the feature matrix and the binary target.
# NOTE(review): target leakage — the only feature is Quantity and the label is
# defined as "Quantity > median(Quantity)", so the classifier merely has to
# learn a single threshold on its own input. The perfect scores this cell
# produces say nothing about predictive power; the features should come from
# information that does not define the label (e.g. earlier months).
X = monthly_sales[['Quantity']].values
y = (monthly_sales['Quantity'] > monthly_sales['Quantity'].median()).astype(int)  # 1 if above the median, else 0
# Split into training (80%) and test (20%) sets with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature: fit the scaler on the training split only and
# apply the same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train and evaluate the model. random_state pins the forest's bootstrap and
# feature sampling so the reported metrics are reproducible across runs.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1278
           1       1.00      1.00      1.00      1333

    accuracy                           1.00      2611
   macro avg       1.00      1.00      1.00      2611
weighted avg       1.00      1.00      1.00      2611

[[1278    0]
 [   0 1333]]
