# Прогноз статуса заказа по данным Amazon Sales (2025)
источник - https://www.kaggle.com/datasets/zahidmughal2343/amazon-sales-2025/data

Columns Description:
Order ID - Unique identifier for each order (e.g., ORD0001).

Date - Date of the order.

Product - Name of the product purchased.

Category - Product category (Electronics, Clothing, Home Appliances, etc.).

Price - Price of a single unit of the product.

Quantity - Number of units purchased in the order.

Total Sales - Total revenue from the order (Price × Quantity).

Customer Name - Name of the customer.

Customer Location - City where the customer is based.

Payment Method - Mode of payment (Credit Card, Debit Card, PayPal, etc.).

Status - Order status (Completed, Pending, or Cancelled).


In [164]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

In [165]:
# Загрузка данных
df = pd.read_csv('amazon_sales_data 2025.csv')

# Просмотр первых 5 строк
df.head()

Unnamed: 0,Order ID,Date,Product,Category,Price,Quantity,Total Sales,Customer Name,Customer Location,Payment Method,Status
0,ORD0001,14-03-25,Running Shoes,Footwear,60,3,180,Emma Clark,New York,Debit Card,Cancelled
1,ORD0002,20-03-25,Headphones,Electronics,100,4,400,Emily Johnson,San Francisco,Debit Card,Pending
2,ORD0003,15-02-25,Running Shoes,Footwear,60,2,120,John Doe,Denver,Amazon Pay,Cancelled
3,ORD0004,19-02-25,Running Shoes,Footwear,60,3,180,Olivia Wilson,Dallas,Credit Card,Pending
4,ORD0005,10-03-25,Smartwatch,Electronics,150,3,450,Emma Clark,New York,Debit Card,Pending


In [166]:
# Информация о датафрейме
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Order ID           250 non-null    object
 1   Date               250 non-null    object
 2   Product            250 non-null    object
 3   Category           250 non-null    object
 4   Price              250 non-null    int64 
 5   Quantity           250 non-null    int64 
 6   Total Sales        250 non-null    int64 
 7   Customer Name      250 non-null    object
 8   Customer Location  250 non-null    object
 9   Payment Method     250 non-null    object
 10  Status             250 non-null    object
dtypes: int64(3), object(8)
memory usage: 21.6+ KB


In [167]:
# Описательная статистика по числовым признакам
df.describe()

Unnamed: 0,Price,Quantity,Total Sales
count,250.0,250.0,250.0
mean,343.58,2.856,975.38
std,380.635808,1.429489,1252.112254
min,15.0,1.0,15.0
25%,40.0,2.0,100.0
50%,150.0,3.0,400.0
75%,600.0,4.0,1500.0
max,1200.0,5.0,6000.0


In [168]:
# Проверим наличие пропусков
df.isnull().sum()

Order ID             0
Date                 0
Product              0
Category             0
Price                0
Quantity             0
Total Sales          0
Customer Name        0
Customer Location    0
Payment Method       0
Status               0
dtype: int64

In [169]:
df = df.drop(columns=['Order ID', 'Date', 'Product', 'Category', 'Customer Name', 'Customer Location', 'Payment Method'])
df.head()

Unnamed: 0,Price,Quantity,Total Sales,Status
0,60,3,180,Cancelled
1,100,4,400,Pending
2,60,2,120,Cancelled
3,60,3,180,Pending
4,150,3,450,Pending


In [170]:
X = df.drop('Status', axis=1)
print("Используемые признаки:", X.columns.tolist())
y = df['Status']

Используемые признаки: ['Price', 'Quantity', 'Total Sales']


In [171]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [172]:
# Масштабируем данные перед обучением SGDClassifier
scaler_sgd = StandardScaler()
X_train_scaled_sgd = scaler_sgd.fit_transform(X_train)
X_test_scaled_sgd = scaler_sgd.transform(X_test)

sgd_model = SGDClassifier(alpha=0.001, random_state=42, max_iter=1000, tol=1e-3)
sgd_model.fit(X_train_scaled_sgd, y_train)
y_pred_sgd = sgd_model.predict(X_test_scaled_sgd)

In [173]:
cm_sgd = confusion_matrix(y_test, y_pred_sgd)
print("Матрица ошибок SGDClassifier:")
print(cm_sgd)

accuracy_sgd = accuracy_score(y_test, y_pred_sgd)
print(f"Accuracy SGDClassifier: {accuracy_sgd:.2f}")

Матрица ошибок SGDClassifier:
[[ 0 12 11]
 [ 0 11 15]
 [ 0 10 16]]
Accuracy SGDClassifier: 0.36


In [174]:
# Тренировка модели Logistic Regression
logreg_model = LogisticRegression(max_iter=2000, random_state=42)
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)

In [175]:
cm_logreg = confusion_matrix(y_test, y_pred_logreg)
print("Матрица ошибок LogisticRegression:")
print(cm_logreg)


accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f"Accuracy LogisticRegression: {accuracy_logreg:.2f}")

Матрица ошибок LogisticRegression:
[[ 5 12  6]
 [ 6 13  7]
 [ 3 14  9]]
Accuracy LogisticRegression: 0.36
