In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn import preprocessing
# Shared label encoder; a later cell fits it on ["A", "B", "C"] and uses it to
# turn the ABC letters into the integer 'class' column.
le = preprocessing.LabelEncoder()


In [2]:
# Workbook that holds the raw fact rows.
PATH = './DataSet_w_NA.xlsx'

# Read the fact sheet and discard every row that has a missing value.
df = pd.read_excel(PATH, sheet_name="Испорченные факты").dropna()

# Integer class code -> ABC letter; Class_predict uses it to decode predictions.
classes_dict = dict(enumerate(["A", "B", "C"]))

In [3]:
# NOTE: this cell replaces `df` (raw fact rows) with the per-product summary, so
# it cannot be re-run on its own; re-run the loading cell first.

# Collapse the fact rows to one row per product ID.  Aggregations are given as
# string names ('sum', 'median') instead of NumPy callables: the resulting
# column suffixes are identical, but recent pandas versions emit a FutureWarning
# when np.sum / np.median are passed here.
# 'Повторение товара' is listed in `values` but has no entry in `aggfunc`, so it
# does not appear in the aggregated output.
df = pd.pivot_table(
    df,
    index='Факты.Товар ID',
    values=['Продажи, шт', 'Продажи, руб', 'Маржинальная прибыль',
            'Повторение заказа', 'Повторение товара'],
    aggfunc={'Продажи, шт': ['median', 'sum'],
             'Продажи, руб': ['median', 'sum'],
             'Повторение заказа': 'sum',
             'Маржинальная прибыль': 'sum'})

# Flatten the two-level (column, aggregation) header into single names such as
# 'Продажи, руб_sum'.
newname = df.columns.map('_'.join)
df.columns = newname
df = df.reset_index()

# Share of each product in the total revenue, in percent.
total_sale = df['Продажи, руб_sum'].sum()
df['Доля'] = df['Продажи, руб_sum'] / total_sale * 100

# Rank products by revenue (largest first) and accumulate their shares.
df = df.sort_values(by='Продажи, руб_sum', ascending=False)
df = df.assign(sum_d=df['Доля'].cumsum())

# ABC classes by cumulative share: A up to 80 %, B above 80 % and up to 95 %,
# C above 95 %.
df.loc[df['sum_d'] <= 80, 'ABC'] = 'A'
df.loc[(df['sum_d'] > 80) & (df['sum_d'] <= 95), 'ABC'] = 'B'
df.loc[df['sum_d'] > 95, 'ABC'] = 'C'

# Encode the letters as integer class codes (A -> 0, B -> 1, C -> 2).
le.fit(["A", "B", "C"])
df['class'] = le.transform(df['ABC'])

In [4]:
def Classifier_acc(df, features, classifier, scaler):
    """Fit ``classifier`` on a scaled 80/20 split of ``df`` and print its scores.

    Parameters
    ----------
    df : DataFrame
        Must contain every column named in ``features`` plus a ``'class'``
        column holding the target.
    features : list of str
        Column names used as model inputs.
    classifier : object
        Estimator exposing ``fit``, ``predict`` and ``score``.
    scaler : object
        Transformer exposing ``fit_transform`` and ``transform``.

    Notes
    -----
    ``fit`` / ``fit_transform`` are called on the objects passed in, so the
    caller's ``classifier`` and ``scaler`` instances are the ones that end up
    fitted.  The split uses ``test_size=0.20`` and ``random_state=1``.

    Prints the classifier's score on the training rows and the accuracy on the
    held-out rows (as a percentage).  Returns ``None``.
    """
    inputs, target = df[features], df['class']
    train_in, test_in, train_target, test_target = train_test_split(
        inputs, target, test_size=0.20, random_state=1
    )

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    train_scaled = scaler.fit_transform(train_in)
    test_scaled = scaler.transform(test_in)

    classifier.fit(train_scaled, train_target)
    predictions = classifier.predict(test_scaled)

    train_score = classifier.score(train_scaled, train_target)
    test_accuracy = accuracy_score(test_target, predictions) * 100
    print(f'Score is {train_score}')
    print(f"Accuracy is {test_accuracy} %")
def Class_predict(params, classifier, scaler):
    """Return the ABC letter predicted for the first row of ``params``.

    ``params`` is passed through ``scaler.transform`` and then
    ``classifier.predict``; both objects must already be fitted.  Only the
    first prediction is used, and it is decoded through the module-level
    ``classes_dict`` mapping (integer class code -> letter).
    """
    predicted_codes = classifier.predict(scaler.transform(params))
    first_code = predicted_codes[0]
    return classes_dict[first_code]
    

In [5]:
scaler = StandardScaler()
# One hypothetical product to classify; values are in the same order as `features`.
data = [[25000, 10, 0.05]]
features = ['Продажи, руб_median', 'Продажи, шт_sum', 'Доля']
# NOTE(review): 'Доля' is the per-product revenue share whose cumulative sum
# defines the ABC label, so the reported accuracies are likely optimistic.

# random_state makes the forest reproducible across kernel restarts.
rfc_model = RandomForestClassifier(random_state=1)
log_model = LogisticRegression(max_iter=10000)
svc_model = LinearSVC(dual=False)

# Train and evaluate each model the same way, then classify the sample with the
# model that was just fitted.  (The SVC line previously passed the undefined
# name `rfc`, which raised NameError instead of reporting the SVC prediction.)
# Classifier_acc refits the shared scaler on every call.
for model_label, model in (('rfc', rfc_model),
                           ('log', log_model),
                           ('svc_model', svc_model)):
    Classifier_acc(df, features, model, scaler)
    print(f'Предсказанный класс {model_label}: {Class_predict(data, model, scaler)}')

Score is 1.0
Accuracy is 99.46380697050938 %
Предсказанный класс rfc: B




In [6]:
# Show the products that ended up in class C (cumulative share above 95 %).
df.loc[df["ABC"] == "C"]

Unnamed: 0,Факты.Товар ID,Маржинальная прибыль_sum,Повторение заказа_sum,"Продажи, руб_median","Продажи, руб_sum","Продажи, шт_median","Продажи, шт_sum",Доля,sum_d,ABC,class
1674,PROD0001702,74947.80,1,162930.0,162930.0,2.0,2.0,0.014187,95.002735,C,2
1542,PROD0001565,29160.00,22,27000.0,162000.0,3.0,18.0,0.014106,95.016841,C,2
1857,PROD0001890,18223.65,3,161988.0,161988.0,3.0,3.0,0.014105,95.030946,C,2
927,PROD0000936,-40583.80,3,30980.0,161096.0,4.0,14.0,0.014027,95.044973,C,2
1350,PROD0001367,53088.00,11,26544.0,160923.0,3.5,20.0,0.014012,95.058985,C,2
...,...,...,...,...,...,...,...,...,...,...,...
1838,PROD0001871,1578.50,1,3850.0,3850.0,2.0,2.0,0.000335,99.999089,C,2
1854,PROD0001887,1193.40,1,3536.0,3536.0,2.0,2.0,0.000308,99.999396,C,2
1747,PROD0001779,1555.20,3,3240.0,3240.0,1.0,1.0,0.000282,99.999679,C,2
1675,PROD0001703,1411.20,1,2880.0,2880.0,2.0,2.0,0.000251,99.999929,C,2


In [7]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.
